In [1]:
from datetime import date, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when

# -----------------------------------------------------------------------------
# 1️⃣ Initialize Spark session
# -----------------------------------------------------------------------------
# getOrCreate() returns the already-active session when there is one and only
# starts a new session otherwise.
spark = (
    SparkSession.builder
    .appName("Create_Label_Store_2")
    .getOrCreate()
)


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/30 06:51:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/30 06:51:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# -----------------------------------------------------------------------------
# 2️⃣ Define paths
# -----------------------------------------------------------------------------
# Output: location the label table is written to by this notebook.
label_store_path = "/app/datamart/gold/label_store_2"

# Inputs: the two transaction tables this notebook reads as parquet.
transactions_path = "/app/datamart/silver/transactions"
latest_txn_path = "/app/datamart/silver/latest_transactions"


In [3]:

# -----------------------------------------------------------------------------
# 3️⃣ Load data
# -----------------------------------------------------------------------------
# Both inputs are parquet; read them in one pass over the two paths.
# Reads are lazy: nothing is scanned until an action runs on the frames.
df_latest, df_txn = (
    spark.read.parquet(input_path)
    for input_path in (latest_txn_path, transactions_path)
)


In [4]:
# -----------------------------------------------------------------------------
# 4️⃣ Filter users whose membership expires on snapshot date
# -----------------------------------------------------------------------------
snapshot_date = "2017-03-01"

# Row-level predicate: membership_expire_date equals the snapshot date.
expires_on_snapshot = col("membership_expire_date") == snapshot_date

# One row per user id (msno) among the matching rows.
df_expiring = df_latest.where(expires_on_snapshot).select("msno").dropDuplicates()


In [5]:
# -----------------------------------------------------------------------------
# 5️⃣ Find renewals within 30 days after the snapshot date
# -----------------------------------------------------------------------------
# The window is derived from snapshot_date so the two cannot drift apart when
# the snapshot changes. It runs from the day after the snapshot through
# snapshot + 30 days, inclusive; for snapshot_date = "2017-03-01" that is
# 2017-03-02 .. 2017-03-31.
renewal_window_days = 30
snapshot_day = date.fromisoformat(snapshot_date)
renewal_window_start = (snapshot_day + timedelta(days=1)).isoformat()
renewal_window_end = (snapshot_day + timedelta(days=renewal_window_days)).isoformat()

df_renewals = (
    df_txn
    .filter(
        # Bounds are 'YYYY-MM-DD' strings compared against transaction_date.
        (col("transaction_date") >= renewal_window_start) &
        (col("transaction_date") <= renewal_window_end) &
        # Only rows with is_cancel == 0 count as a renewal.
        (col("is_cancel") == 0)
    )
    .select("msno")
    .distinct()
    # Marker column: after a left join, non-null means "renewed in the window".
    .withColumn("renewed_within_30_days", lit(1))
)


In [6]:
# -----------------------------------------------------------------------------
# 6️⃣ Join and assign churn label
# -----------------------------------------------------------------------------
# Left join keeps every expiring user; users without a renewal row come back
# with a null renewed_within_30_days marker.
expiring_with_renewal_flag = df_expiring.join(df_renewals, on="msno", how="left")

# Null marker -> no renewal found -> churned (1); otherwise retained (0).
churn_flag = when(col("renewed_within_30_days").isNull(), lit(1)).otherwise(lit(0))

df_label = (
    expiring_with_renewal_flag
    .withColumn("is_churn", churn_flag)
    .withColumn("snapshot_date", lit(snapshot_date))
    .select("msno", "snapshot_date", "is_churn")
)


In [7]:
# -----------------------------------------------------------------------------
# 7️⃣ Save as gold label store
# -----------------------------------------------------------------------------
# Overwrite mode replaces whatever is already stored at label_store_path.
(
    df_label
    .write
    .mode("overwrite")
    .parquet(label_store_path)
)

print(f"✅ Label store successfully created at {label_store_path}")

# Report from the files that were just written instead of from df_label.
# df_label is lazy, so every action on it (count, show) would re-execute the
# whole plan behind it - the transaction scan, the distinct and the join.
# Reading the output back is cheap and also confirms what was persisted.
df_label_written = spark.read.parquet(label_store_path)
print(f"Total labeled users: {df_label_written.count()}")
df_label_written.show(10, truncate=False)

25/10/30 06:52:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/30 06:52:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/30 06:52:42 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
25/10/30 06:52:42 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 84.44% for 9 writers
25/10/30 06:52:42 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 76.00% for 10 writers
25/10/30 06:52:42 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 69.09% for 11 writers
25/10/30 06:52:42 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 63.33% for

✅ Label store successfully created at /app/datamart/gold/label_store_2


                                                                                

Total labeled users: 7839


25/10/30 06:52:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/30 06:52:50 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
                                                                                

+--------------------------------------------+-------------+--------+
|msno                                        |snapshot_date|is_churn|
+--------------------------------------------+-------------+--------+
|+7Z3QJTroH8GLHLzqV7cuNqR+/QBBUhGCbsnSi6z1VI=|2017-03-01   |0       |
|239LrMXw/kwlE/PQUx6hm+OKmlI0fxqQ7gQQSpyrTrk=|2017-03-01   |0       |
|2ASaLSs9SaKesSBJ9AFR3K199n2/52vl0ti4iH5+Pkk=|2017-03-01   |0       |
|3lSBGvEmNOTkyPsM/GUZCVbZ10QdR3GkfJfNDdTXMYQ=|2017-03-01   |0       |
|4S6M3KY0veHjY47mn7NW4caWE/VyYkbCAEYs+zKJdy0=|2017-03-01   |0       |
|4tL0SxdppYSdk4ViJFqXTLsMIqu1LLwFOoIVLzDdSoM=|2017-03-01   |0       |
|5PzxHdtJsGb1zHdoEErhoTn7vpX0ZSLTtopMANWpnws=|2017-03-01   |0       |
|5bOS9KAxceMrFB469Jv7JsZ8tDSKSvpnQJLTEI6sszU=|2017-03-01   |1       |
|6jHM/YUOA0E2Fxv26cGwRCNzZdiwRSGbNReS0FQ7QFw=|2017-03-01   |0       |
|A/cWeqg7yaos4bxvnmWyx9cHtI8oert2PdZFs6S/W6Y=|2017-03-01   |0       |
+--------------------------------------------+-------------+--------+
only showing top 10 

In [8]:
# Check churn distribution

# NOTE: this import rebinds `round` to Spark's column function, so Python's
# built-in round() is shadowed in the notebook namespace from here on.
from pyspark.sql.functions import count, round, col

# Re-read the persisted label store so the check reflects what is on disk.
label_path = "/app/datamart/gold/label_store_2"
df_label = spark.read.parquet(label_path)

# Denominator for the percentage column.
total = df_label.count()

# Rows per churn label, then each label's share of the total (2 decimals).
rows_per_label = df_label.groupBy("is_churn").agg(count("*").alias("count"))
share_of_total = round((col("count") / total) * 100, 2)
df_summary = (
    rows_per_label
    .withColumn("percentage", share_of_total)
    .orderBy("is_churn")
)

print(f"Total labeled users: {total}")
df_summary.show(truncate=False)

Total labeled users: 7839
+--------+-----+----------+
|is_churn|count|percentage|
+--------+-----+----------+
|0       |6256 |79.81     |
|1       |1583 |20.19     |
+--------+-----+----------+

