# ─────────────────────────────────────────────
# 📘 PySpark Notebook for Data Preprocessing
# ─────────────────────────────────────────────

In [1]:
import os
# IPython shell escape: print the version of the `java` executable found by the shell,
# as a sanity check before the Spark session is created further down.
!java -version
# os.environ.get returns None when JAVA_HOME is not set, so this prints "JAVA_HOME: None" in that case.
print("JAVA_HOME:", os.environ.get("JAVA_HOME"))

openjdk version "11.0.26" 2025-01-21
OpenJDK Runtime Environment (build 11.0.26+4-post-Ubuntu-1ubuntu122.04)
OpenJDK 64-Bit Server VM (build 11.0.26+4-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
JAVA_HOME: /usr/lib/jvm/java-11-openjdk-amd64


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# 🔹 Create the notebook's Spark session (getOrCreate reuses an active one)
spark = (
    SparkSession.builder
    .appName("TelecomDataProcessing")
    .getOrCreate()
)

# 🔹 Relative locations of the raw inputs and of the processed outputs
RAW_PATH = "../data/raw/"
PROCESSED_PATH = "../data/processed/parquet/"

In [5]:
import os
import shutil

def _remove_existing(target):
    """Delete `target` if present: directory trees via rmtree, files/symlinks via remove."""
    if os.path.isdir(target) and not os.path.islink(target):
        shutil.rmtree(target)
    elif os.path.lexists(target):
        os.remove(target)


def save_dataframe(df, path, format="parquet", single_file=True, overwrite=True):
    """
    Save a Spark DataFrame to the local filesystem in an easy-to-process layout.

    The data is first written to the temporary directory ``path + "_tmp"`` and
    only then moved into place, so an existing output is not touched until the
    write itself has succeeded. Because the result is post-processed with
    ``os``/``shutil``, ``path`` must be a path on the local filesystem.

    Args:
        df (DataFrame): Spark DataFrame to save.
        path (str): Destination path without extension (e.g., 'data/processed/users').
        format (str): 'csv' or 'parquet'.
        single_file (bool): If True, coalesce to one partition and move the single
            part file to ``f"{path}.{format}"``. If False, the whole output
            directory is moved to ``path``.
        overwrite (bool): If True, replace any existing output. If False and the
            destination already exists, raise instead of replacing it.

    Returns:
        str: Final path to the saved file (single_file=True) or directory.

    Raises:
        ValueError: If ``format`` is not 'csv' or 'parquet'.
        FileExistsError: If the destination exists and ``overwrite`` is False.
        FileNotFoundError: If ``single_file`` is True but the write produced no
            ``part-*.{format}`` file; the temporary directory is left in place.
    """
    # Validate before touching the filesystem so that a typo in `format`
    # can never delete previously saved output.
    if format not in ("csv", "parquet"):
        raise ValueError("Unsupported format. Use 'csv' or 'parquet'.")

    final_path = f"{path}.{format}" if single_file else path
    if not overwrite and os.path.lexists(final_path):
        raise FileExistsError(
            f"{final_path!r} already exists; pass overwrite=True to replace it."
        )

    temp_path = path + "_tmp"
    df_to_save = df.coalesce(1) if single_file else df
    # mode("overwrite") applies to the temporary directory only, clearing any
    # leftover from an earlier interrupted run. header=True is a CSV writer option.
    writer = df_to_save.write.option("header", True).mode("overwrite")
    if format == "csv":
        writer.csv(temp_path)
    else:
        writer.parquet(temp_path)

    if not single_file:
        _remove_existing(path)
        shutil.move(temp_path, path)
        return path

    # Pick the data file out of the output directory (which also holds marker
    # files such as the ones not starting with "part-"). Sorted for determinism.
    part_file = next(
        (
            fname
            for fname in sorted(os.listdir(temp_path))
            if fname.startswith("part-") and fname.endswith(f".{format}")
        ),
        None,
    )
    if part_file is None:
        # Previously this case silently returned None and leaked the temp directory.
        raise FileNotFoundError(
            f"No 'part-*.{format}' file found in {temp_path!r}; "
            "the temporary output was left in place for inspection."
        )

    if overwrite:
        # A directory at `path` can be left over from an earlier single_file=False run.
        _remove_existing(path)
    _remove_existing(final_path)
    shutil.move(os.path.join(temp_path, part_file), final_path)
    shutil.rmtree(temp_path)
    return final_path

# ─────────────────────────────────────────────
# 1️⃣ Process: Traffic Volume (For Forecasting)
# ─────────────────────────────────────────────

In [6]:
# Read the raw call-detail records (header row present, column types inferred)
cdr = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(RAW_PATH + "synthetic_cdr.csv")
)

# Bucket every record into its hour, then total calls and data usage per tower per hour
traffic = (
    cdr
    .withColumn("hour", F.date_trunc("hour", F.col("timestamp")))
    .groupBy("tower_id", "hour")
    .agg(
        F.count("*").alias("total_calls"),
        F.sum("data_usage_mb").alias("total_data_used_mb"),
    )
)

# Persist as a single Parquet file via the shared helper
output_path = save_dataframe(
    traffic,
    PROCESSED_PATH + "traffic_volume",
    format="parquet",
    single_file=True,
)

print(f"✅ traffic_volume saved to {output_path}")

✅ traffic_volume saved to ../data/processed/parquet/traffic_volume.parquet


# ─────────────────────────────────────────────
# 2️⃣ Process: Graph Edges (For Graph ML)
# ─────────────────────────────────────────────

In [7]:
# Edge weight = number of CDR rows observed for each (user_id, tower_id) pair
edges = (
    cdr
    .groupBy("user_id", "tower_id")
    .agg(F.count("*").alias("interaction_weight"))
)

# Persist as a single Parquet file via the shared helper
output_path = save_dataframe(
    edges,
    PROCESSED_PATH + "telecom_graph_edges",
    format="parquet",
    single_file=True,
)

print(f"✅ telecom_graph_edges saved to {output_path}")

print("✅ telecom_graph_edges saved.")

✅ telecom_graph_edges saved to ../data/processed/parquet/telecom_graph_edges.parquet
✅ telecom_graph_edges saved.


# ─────────────────────────────────────────────
# 3️⃣ Process: Network Stats (For Anomaly/VAE)
# ─────────────────────────────────────────────

In [8]:
# Read the anomaly logs (header row present, column types inferred)
logs = spark.read.csv(
    RAW_PATH + "anomaly_logs.csv",
    header=True,
    inferSchema=True,
)

# Keep only the three metric columns plus the anomaly column
features = logs.select(
    "latency_ms",
    "packet_loss",
    "cpu_load",
    "anomaly",
)

# Persist as a single Parquet file via the shared helper
output_path = save_dataframe(
    features,
    PROCESSED_PATH + "network_behaviors",
    format="parquet",
    single_file=True,
)

print(f"✅ network_behaviors saved to {output_path}")

print("✅ network_behaviors saved.")

✅ network_behaviors saved to ../data/processed/parquet/network_behaviors.parquet
✅ network_behaviors saved.


# ─────────────────────────────────────────────
# 4️⃣ Process: User Churn Features (For Churn Modeling)
# ─────────────────────────────────────────────

In [10]:
# The plan-history CSV is read from the processed directory, not the raw one
plans = spark.read.csv(
    PROCESSED_PATH + "user_plan_history.csv",
    header=True,
    inferSchema=True,
)

# One row per user: max of churn_flag plus the averages of the usage and satisfaction columns
churn_ds = (
    plans
    .groupBy("user_id")
    .agg(
        F.max("churn_flag").alias("churned"),
        F.avg("minutes_used").alias("avg_minutes"),
        F.avg("data_used_gb").alias("avg_data_gb"),
        F.avg("satisfaction_score").alias("avg_satisfaction"),
    )
)

# Persist as a single Parquet file via the shared helper
output_path = save_dataframe(
    churn_ds,
    PROCESSED_PATH + "user_churn_dataset",
    format="parquet",
    single_file=True,
)

print(f"✅ user_churn_dataset saved to {output_path}")

print("✅ user_churn_dataset saved.")

✅ user_churn_dataset saved to ../data/processed/parquet/user_churn_dataset.parquet
✅ user_churn_dataset saved.


# ─────────────────────────────────────────────
# 5️⃣ Process: Copy Raw Files to Processed (CSV → Parquet)
# ─────────────────────────────────────────────

In [14]:
import pandas as pd

# (source CSV, destination Parquet) pairs; the plan history already lives under PROCESSED_PATH
files_to_copy = [
    (f"{RAW_PATH}user_profiles.csv", f"{PROCESSED_PATH}user_profiles.parquet"),
    (f"{RAW_PATH}tower_locations.csv", f"{PROCESSED_PATH}tower_locations.parquet"),
    (f"{PROCESSED_PATH}user_plan_history.csv", f"{PROCESSED_PATH}user_plan_history.parquet"),
]

# Load each CSV with pandas and write it back out as Parquet
for src, dst in files_to_copy:
    df = pd.read_csv(src)
    df.to_parquet(dst)
    print(f"✅ Copied: {src} → {dst}")

✅ Copied: ../data/raw/user_profiles.csv → ../data/processed/parquet/user_profiles.parquet
✅ Copied: ../data/raw/tower_locations.csv → ../data/processed/parquet/tower_locations.parquet


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/user_plan_history.csv'