# ─────────────────────────────────────────────
# 📘 PySpark Notebook for Data Preprocessing
# ─────────────────────────────────────────────

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# 🔹 Spark session — getOrCreate() returns the already-active session when one
#    exists, otherwise it builds a new one under the given application name.
# NOTE(review): the output recorded for this cell shows session creation failing
# with `java.lang.UnsupportedOperationException: getSubject is not supported`,
# raised from Hadoop's UserGroupInformation.getCurrentUser. Presumably the
# installed JDK is newer than this Spark/Hadoop build supports — confirm and
# run the notebook on a JDK version listed as supported by the Spark release.
spark = (
    SparkSession.builder
    .appName("TelecomDataProcessing")
    .getOrCreate()
)

# 🔹 Path prefixes; the cells below append a file name to these.
RAW_PATH = "data/raw/"
PROCESSED_PATH = "data/processed/"

Py4JJavaError: An error occurred while calling None.org.apache.spark.api.java.JavaSparkContext.
: java.lang.UnsupportedOperationException: getSubject is not supported
	at java.base/javax.security.auth.Subject.getSubject(Subject.java:277)
	at org.apache.hadoop.security.UserGroupInformation.getCurrentUser(UserGroupInformation.java:577)
	at org.apache.spark.util.Utils$.$anonfun$getCurrentUserName$1(Utils.scala:2416)
	at scala.Option.getOrElse(Option.scala:189)
	at org.apache.spark.util.Utils$.getCurrentUserName(Utils.scala:2416)
	at org.apache.spark.SparkContext.<init>(SparkContext.scala:329)
	at org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
	at java.base/jdk.internal.reflect.DirectConstructorHandleAccessor.newInstance(DirectConstructorHandleAccessor.java:62)
	at java.base/java.lang.reflect.Constructor.newInstanceWithCaller(Constructor.java:499)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:483)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:238)
	at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
	at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1447)


# ─────────────────────────────────────────────
# 1️⃣ Process: Traffic Volume (For Forecasting)
# ─────────────────────────────────────────────

In [None]:
# Load the CDR file: the first row supplies column names and Spark infers the
# column types. `cdr` is reused by the graph-edges cell further down.
cdr = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(RAW_PATH + "synthetic_cdr.csv")
)

# Truncate each record's timestamp to the start of its hour, then aggregate per
# (tower_id, hour): the number of rows and the summed data usage.
traffic = (
    cdr
    .withColumn("hour", F.date_trunc("hour", F.col("timestamp")))
    .groupBy("tower_id", "hour")
    .agg(
        F.count("*").alias("total_calls"),
        F.sum("data_usage_mb").alias("total_data_used_mb"),
    )
)

# Replace any previous output at this path and include a header row.
# NOTE(review): Spark's CSV writer normally produces a directory of part files
# at this path rather than one file named traffic_volume.csv — confirm that
# downstream readers expect that layout.
(
    traffic.write
    .mode("overwrite")
    .option("header", True)
    .csv(PROCESSED_PATH + "traffic_volume.csv")
)
print("✅ traffic_volume.csv saved.")

# ─────────────────────────────────────────────
# 2️⃣ Process: Graph Edges (For Graph ML)
# ─────────────────────────────────────────────

In [None]:
# Weighted edge list: one row per distinct (user_id, tower_id) pair, where
# interaction_weight is the number of CDR rows for that pair.
# Depends on `cdr` having been loaded by the traffic-volume cell.
edges = (
    cdr
    .groupBy("user_id", "tower_id")
    .agg(F.count("*").alias("interaction_weight"))
)

# Overwrite any earlier output and emit a header row.
(
    edges.write
    .mode("overwrite")
    .option("header", True)
    .csv(PROCESSED_PATH + "telecom_graph_edges.csv")
)
print("✅ telecom_graph_edges.csv saved.")

# ─────────────────────────────────────────────
# 3️⃣ Process: Network Stats (For Anomaly/VAE)
# ─────────────────────────────────────────────

In [None]:
# Load the anomaly logs with header-derived column names and inferred types.
logs = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(RAW_PATH + "anomaly_logs.csv")
)

# Keep only the three metric columns plus the `anomaly` column; rows are passed
# through unchanged (no filtering or aggregation).
features = logs.select(
    "latency_ms",
    "packet_loss",
    "cpu_load",
    "anomaly",
)

# Overwrite any earlier output and emit a header row.
(
    features.write
    .mode("overwrite")
    .option("header", True)
    .csv(PROCESSED_PATH + "network_behaviors.csv")
)
print("✅ network_behaviors.csv saved.")

# ─────────────────────────────────────────────
# 4️⃣ Process: User Churn Features (For Churn Modeling)
# ─────────────────────────────────────────────

In [None]:
# Load the per-user plan history with header-derived names and inferred types.
# NOTE(review): this input is read from PROCESSED_PATH, whereas the CDR and
# anomaly-log inputs come from RAW_PATH — presumably it is produced by an
# earlier step; confirm the location is intentional.
plans = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(PROCESSED_PATH + "user_plan_history.csv")
)

# Collapse the history to one row per user_id:
#   churned          – maximum churn_flag over the user's rows
#                      (presumably a 0/1 flag, i.e. "ever churned" — confirm)
#   avg_minutes      – mean of minutes_used
#   avg_data_gb      – mean of data_used_gb
#   avg_satisfaction – mean of satisfaction_score
churn_ds = (
    plans
    .groupBy("user_id")
    .agg(
        F.max("churn_flag").alias("churned"),
        F.avg("minutes_used").alias("avg_minutes"),
        F.avg("data_used_gb").alias("avg_data_gb"),
        F.avg("satisfaction_score").alias("avg_satisfaction"),
    )
)

# Overwrite any earlier output and emit a header row.
(
    churn_ds.write
    .mode("overwrite")
    .option("header", True)
    .csv(PROCESSED_PATH + "user_churn_dataset.csv")
)
print("✅ user_churn_dataset.csv saved.")