In [2]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType


In [3]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .config("spark.driver.memory", "4g") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/18 11:34:07 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Datasets
- user_logs.csv ‚Äî contain data from 2015/01/01 - 2017/02/28
- user_logs_v2.csv ‚Äî contain data from 2017/03/01 - 2017/03/31
- transactions.csv ‚Äî (transaction_date) 2015/01/01 - 2017/02/28
- transactions_v2.csv ‚Äî (transaction_date) 2015/01/01 - 2017/03/31 (has lesser rows tho ??)
- members_v3.csv ‚Äî 6mill members
- train.csv ‚Äî contains user ids and whether they have churned.
- train_v2.csv ‚Äî contains the churn data for March, 2017.


# User Logs

In [1]:

df = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .parquet("datamart/bronze/user_logs/year=2015/month=01"))


NameError: name 'spark' is not defined

In [4]:
df.cache()

DataFrame[msno: string, date: int, num_25: int, num_50: int, num_75: int, num_985: int, num_100: int, num_unq: int, total_secs: double]

In [5]:
df.show()

                                                                                

+--------------------+--------+------+------+------+-------+-------+-------+----------+
|                msno|    date|num_25|num_50|num_75|num_985|num_100|num_unq|total_secs|
+--------------------+--------+------+------+------+-------+-------+-------+----------+
|C4StQg63rttGAOqto...|20150120|     2|     2|     1|      0|      6|      9|  1699.446|
|8lZ3LnlnX1kWyKkGt...|20150109|     5|     2|     1|      0|     82|     55| 17811.605|
|GCqC99i1NrtvDGMkx...|20150129|     2|     1|     0|      1|    129|      4| 34703.904|
|Dnpj5VLwwN5mUR8HA...|20150128|     0|     0|     0|      1|      7|      7|  2162.177|
|l1PoAgkJoUkdBz9aH...|20150112|     2|     0|     0|      0|     34|     27|  9284.661|
|xrIHhq+6/NnEwklQp...|20150120|     4|     1|     1|      1|    161|    115| 40552.138|
|9iUqNqp4tOo/Gu+dk...|20150115|    20|     6|     1|      5|     30|     21|  9404.582|
|fkYsxuAJ0fyDu8wqj...|20150107|     2|     0|     0|      0|     36|     32|  8196.252|
|vEB1w4iqIf3SvU37X...|20150106| 

In [None]:
# Check for duplicate user logs for the same day
duplicate_count = df.groupBy("msno", "date").count().where("count > 1").count()

if duplicate_count > 0:
    print(f"Found {duplicate_count} instances of users with multiple logs on the same day.")
else:
    print("‚úÖ No duplicate user-date entries found.")



In [21]:
from pyspark.sql.functions import col

# Check for negative values in any of the numeric columns
negative_counts = df.where(
    (col("num_25") < 0) |
    (col("num_50") < 0) |
    (col("num_75") < 0) |
    (col("num_985") < 0) |
    (col("num_100") < 0) |
    (col("num_unq") < 0) |
    (col("total_secs") < 0)
).count()

print(f"Found {negative_counts} rows with negative numeric values.")

# Check for listening times that are impossibly long (more than 24 hours)
impossible_time_count = df.where(col("total_secs") > 86400).count()
print(f"Found {impossible_time_count} rows where total_secs exceeds 24 hours.")

Found 3 rows with negative numeric values.


[Stage 35:>                                                       (0 + 12) / 13]

Found 7226 rows where total_secs exceeds 24 hours.


                                                                                

In [23]:
# Get a statistical summary to identify outliers
df.select("num_25", "num_100", "num_unq", "total_secs").describe().show()



+-------+------------------+------------------+-----------------+--------------------+
|summary|            num_25|           num_100|          num_unq|          total_secs|
+-------+------------------+------------------+-----------------+--------------------+
|  count|          12897706|          12897706|         12897706|            12897706|
|   mean| 6.098357645925562|31.652541157318982|29.88532697209876|-2.14534248473938...|
| stddev|13.421142334833364|46.369546292679864|34.34555369998261|4.448300153104476E12|
|    min|                 0|                 0|                1|-9.22337203685156...|
|    max|             11328|             34696|             1694|    1.420659527338E9|
+-------+------------------+------------------+-----------------+--------------------+



                                                                                

In [24]:
from pyspark.sql.functions import col

# This is a rough approximation for sanity checking
df_with_approx_secs = df.withColumn("approx_secs",
    col("num_25") * 25 +
    col("num_50") * 50 +
    col("num_75") * 75 +
    col("num_100") * 240  # Assuming an average song length of 4 minutes for num_100
)

# Show rows where the reported total_secs is drastically different from our approximation
# This could indicate data quality issues.
df_with_approx_secs.select("msno", "date", "total_secs", "approx_secs").show()

+--------------------+--------+----------+-----------+
|                msno|    date|total_secs|approx_secs|
+--------------------+--------+----------+-----------+
|C4StQg63rttGAOqto...|20150120|  1699.446|       1665|
|8lZ3LnlnX1kWyKkGt...|20150109| 17811.605|      19980|
|GCqC99i1NrtvDGMkx...|20150129| 34703.904|      31060|
|Dnpj5VLwwN5mUR8HA...|20150128|  2162.177|       1680|
|l1PoAgkJoUkdBz9aH...|20150112|  9284.661|       8210|
|xrIHhq+6/NnEwklQp...|20150120| 40552.138|      38865|
|9iUqNqp4tOo/Gu+dk...|20150115|  9404.582|       8075|
|fkYsxuAJ0fyDu8wqj...|20150107|  8196.252|       8690|
|vEB1w4iqIf3SvU37X...|20150106|  2326.174|       2185|
|ccqIvziyLZs/Ivbob...|20150110|  1605.832|       1465|
|xKKR50oC2MlZ0zd+o...|20150105|    70.019|         75|
|TUWjZbEJkEvyq8rv4...|20150131|    20.651|         25|
|k6F2zRxgWsYSNWHK2...|20150117|  3155.356|       2525|
|dmvdNxpnayMipYpp0...|20150123|  2821.996|       2070|
|7rP5PTRRZkv88mQaW...|20150129|   562.586|        480|
|bYBeBROvV

In [7]:
df2 = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/user_logs_v2.csv"))


                                                                                

In [8]:
df2.count()

                                                                                

18396362

In [52]:
from pyspark.sql import functions as F

df2.select(
    F.min("date").alias("start_date"),
    F.max("date").alias("end_date")
).show()

[Stage 93:>                                                       (0 + 12) / 12]

+----------+--------+
|start_date|end_date|
+----------+--------+
|  20170301|20170331|
+----------+--------+



                                                                                

# Transactions

In [3]:
df_transactions = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/transactions.csv"))

                                                                                

In [4]:
df_transactions.show()

+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|                msno|payment_method_id|payment_plan_days|plan_list_price|actual_amount_paid|is_auto_renew|transaction_date|membership_expire_date|is_cancel|
+--------------------+-----------------+-----------------+---------------+------------------+-------------+----------------+----------------------+---------+
|YyO+tlZtAXYXoZhNr...|               41|               30|            129|               129|            1|        20150930|              20151101|        0|
|AZtu6Wl0gPojrEQYB...|               41|               30|            149|               149|            1|        20150930|              20151031|        0|
|UkDFI97Qb6+s2LWci...|               41|               30|            129|               129|            1|        20150930|              20160427|        0|
|M1C56ijxozNaGD0t2...|               39|            

In [5]:
df_transactions.count()

                                                                                

21547746

In [6]:
from pyspark.sql import functions as F

df_transactions.select(
    F.min("transaction_date").alias("start_date"),
    F.max("transaction_date").alias("end_date")
).show()




+----------+--------+
|start_date|end_date|
+----------+--------+
|  20150101|20170228|
+----------+--------+



                                                                                

In [7]:
df_transactions_v2 = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/transactions_v2.csv"))

                                                                                

In [8]:
df_transactions_v2.count()

1431009

In [56]:
from pyspark.sql import functions as F

df_transactions_v2.select(
    F.min("transaction_date").alias("start_date"),
    F.max("transaction_date").alias("end_date")
).show()

+----------+--------+
|start_date|end_date|
+----------+--------+
|  20150101|20170331|
+----------+--------+



# Members

In [4]:
df_members = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .parquet("datamart/bronze/members"))

                                                                                

In [5]:
df_members.show(20)

                                                                                

+--------------------+----+---+------+--------------+----------------------+----+-----+
|                msno|city| bd|gender|registered_via|registration_init_time|year|month|
+--------------------+----+---+------+--------------+----------------------+----+-----+
|QJ9CySwCo+iomREZt...|  14| 23|female|             9|              20070318|2007|   03|
|y7sBbp5P3iywwZ/uV...|  13| 28|  male|             9|              20070318|2007|   03|
|tkHWaNHpDxbqhLj6f...|   6| 29|female|             9|              20070319|2007|   03|
|PZs2zouRsd1AlSdGb...|  13| 47|female|             9|              20070320|2007|   03|
|XS+J5kXqiIjEfBpQ3...|  15| 23|female|             9|              20070320|2007|   03|
|l2+LvmYRptafj7Jnu...|   4| 30|female|             9|              20070322|2007|   03|
|Dwtb8GfTATdc8AxVQ...|   1|  0|  NULL|             9|              20140711|2014|   07|
|Yw3kUW6IO6tFrMErl...|   1|  0|  male|             3|              20140711|2014|   07|
|cG2zXGAlh/Pnoxdlg...|   4| 27| 

In [38]:
df_members.count()

6769473

In [39]:
df_members.groupBy("bd").count().orderBy("bd").show(1000)

+-----+-------+
|   bd|  count|
+-----+-------+
|-7168|      1|
|-6998|      1|
|-6807|      1|
|-6445|      1|
|-5978|      1|
|-3152|      1|
|-2828|      1|
|-1970|      1|
| -974|      1|
| -958|      1|
| -956|      1|
| -951|      1|
| -540|      1|
| -529|      1|
| -527|      1|
| -526|      2|
| -525|      1|
| -524|      1|
| -523|      5|
| -522|      2|
| -521|      2|
| -520|      1|
| -519|      6|
| -518|      5|
| -517|      5|
| -516|      6|
| -515|      3|
| -514|      4|
| -513|      2|
| -512|      1|
| -511|      4|
| -510|      4|
| -509|      3|
| -508|      2|
| -507|      1|
| -506|      3|
| -505|      2|
| -504|      4|
| -503|      5|
| -502|      1|
| -501|      1|
| -500|      2|
| -498|      1|
| -497|      1|
| -496|      1|
| -493|      1|
| -489|      1|
| -488|      1|
| -484|      1|
| -482|      1|
| -178|      1|
| -176|      1|
|  -52|      4|
|  -51|     18|
|  -50|     13|
|  -49|     16|
|  -48|     19|
|  -47|      6|
|  -46|      8|
|  -45| 

In [6]:
# ========= 0) Setup =========
from pyspark.sql import functions as F

SNAPSHOT_DATE_STR = "2017-02-28"   # cutoff cho train
SNAPSHOT_YEAR     = 2017           # (kh√¥ng d√πng n·∫øu b·∫°n gi·ªØ rule 14‚Äì68)

# ========= 1) Field Format =========
dfm = (
    df_members
      .withColumn("msno", F.lower(F.trim(F.col("msno"))))
      .withColumn("city", F.col("city").cast("int"))
      .withColumn("bd", F.col("bd").cast("int"))
      .withColumn("gender", F.lower(F.trim(F.col("gender"))))
      .withColumn("registered_via", F.col("registered_via").cast("int"))
      .withColumn("registration_init_time", F.col("registration_init_time").cast("string"))
)

# ========= 2) Date =========
dfm = dfm.withColumn(
    "registration_date",
    F.to_date(F.col("registration_init_time"), "yyyyMMdd")
)

# ========= 3) City clean =========
dfm = dfm.withColumn(
    "city_clean",
    F.when(F.col("city") <= 0, None).otherwise(F.col("city"))
)

# ========= 4.1) Gender clean =========
dfm = dfm.withColumn(
    "gender_norm",
    F.when(F.col("gender").isin("male", "female"), F.col("gender")).otherwise(F.lit("unknown"))
)

# ========= 4.2) Gender one-hot =========
dfm = (dfm
    .drop("gender_male","gender_female","gender_unknown")
    .withColumn("gender_male",    (F.col("gender_norm")=="male").cast("int"))
    .withColumn("gender_female",  (F.col("gender_norm")=="female").cast("int"))
    .withColumn("gender_unknown", (F.col("gender_norm")=="unknown").cast("int"))
)

# ========= 5) BD clean rule: 14‚Äì68 & count >= 1000 =========
bd_hist = dfm.groupBy("bd").agg(F.count("*").alias("bd_count"))
dfm = (
    dfm.join(bd_hist, on="bd", how="left")
       .withColumn("bd_count", F.coalesce(F.col("bd_count"), F.lit(0)))
       .withColumn(
           "bd_clean",
           F.when((F.col("bd").between(14, 68)) & (F.col("bd_count") >= 1000), F.col("bd"))
            .otherwise(F.lit(None).cast("int"))
       )
       .drop("bd_count")
)

# ========= 6) Tenure to cutoff =========
dfm = dfm.withColumn(
    "tenure_days_at_snapshot",
    F.datediff(F.to_date(F.lit(SNAPSHOT_DATE_STR)), F.col("registration_date"))
)

# ========= 7) Frequency enrich (Silver+) =========
# 7a) registered_via frequency
total_cnt = dfm.count()  # n·∫øu b·∫£ng r·∫•t l·ªõn, c√≥ th·ªÉ approx b·∫±ng sample ratio
via_freq = (
    dfm.groupBy("registered_via")
       .agg((F.count("*") / F.lit(total_cnt)).alias("registered_via_freq"))
)

# 7b) city frequency
city_freq = (
    dfm.groupBy("city_clean")
       .agg((F.count("*") / F.lit(total_cnt)).alias("city_freq"))
)

# 7c) Join freq
dfm = (
    dfm.drop("registered_via_freq", "city_freq")
       .join(via_freq, on="registered_via", how="left")
       .join(city_freq, on="city_clean",  how="left")
       .fillna({"registered_via_freq": 0.0, "city_freq": 0.0})
)

# ========= 8) SILVER (clean + enrich) =========
silver_cols = [
    "msno",
    "city_clean",
    "bd_clean",
    "gender_norm", "gender_male","gender_female","gender_unknown",
    "registered_via",
    "registration_date",
    "tenure_days_at_snapshot",
    "registered_via_freq",
    "city_freq"
]
silver_members = dfm.select(*silver_cols)

# ========= 9) QC =========
silver_members.selectExpr(
    "count(*) as n_rows",
    "sum(case when bd_clean is null then 1 else 0 end) as n_age_null",
    "sum(case when registration_date is null then 1 else 0 end) as n_regdate_null",
    "sum(case when city_clean is null then 1 else 0 end) as n_city_null",
    "sum(case when gender_norm = 'unknown' then 1 else 0 end) as n_gender_unknown",
    "sum(case when gender_norm = 'unknown' then 1 else 0 end) as n_gender_unknown",
    "sum(gender_male) as n_male",
    "sum(gender_female) as n_female",
    "sum(gender_unknown) as n_unknown_flag"
).show()

# ========= 10) In top-5 frequency =========
print("üèôÔ∏è Top 5 city by frequency")
(silver_members.groupBy("city_clean")
               .agg(F.avg("city_freq").alias("city_freq"))
               .orderBy(F.desc("city_freq"))
               .limit(5)
               .show(truncate=False))

print("üì± Top 5 registered_via by frequency")
(silver_members.groupBy("registered_via")
               .agg(F.avg("registered_via_freq").alias("registered_via_freq"))
               .orderBy(F.desc("registered_via_freq"))
               .limit(5)
               .show(truncate=False))

                                                                                

+-------+----------+--------------+-----------+----------------+----------------+-------+--------+--------------+
| n_rows|n_age_null|n_regdate_null|n_city_null|n_gender_unknown|n_gender_unknown| n_male|n_female|n_unknown_flag|
+-------+----------+--------------+-----------+----------------+----------------+-------+--------+--------------+
|6769473|   4556689|             0|          0|         4429505|         4429505|1195355| 1144613|       4429505|
+-------+----------+--------------+-----------+----------------+----------------+-------+--------+--------------+

üèôÔ∏è Top 5 city by frequency


                                                                                

+----------+--------------------+
|city_clean|city_freq           |
+----------+--------------------+
|1         |0.7097045811751799  |
|5         |0.05688315766983344 |
|13        |0.047415507824558806|
|4         |0.036464876955557016|
|22        |0.031081740040747853|
+----------+--------------------+

üì± Top 5 registered_via by frequency


                                                                                

+--------------+--------------------+
|registered_via|registered_via_freq |
+--------------+--------------------+
|4             |0.41261897343928855 |
|3             |0.24273795021439126 |
|9             |0.21905146826199465 |
|7             |0.11904841041541657 |
|11            |0.003699992599127816|
+--------------+--------------------+



In [7]:
silver_members.orderBy(F.rand()).show(10, False)



+--------------------------------------------+----------+--------+-----------+-----------+-------------+--------------+--------------+-----------------+-----------------------+-------------------+--------------------+
|msno                                        |city_clean|bd_clean|gender_norm|gender_male|gender_female|gender_unknown|registered_via|registration_date|tenure_days_at_snapshot|registered_via_freq|city_freq           |
+--------------------------------------------+----------+--------+-----------+-----------+-------------+--------------+--------------+-----------------+-----------------------+-------------------+--------------------+
|q4g94akkjxs/zgqda6qunkz0he00a3uwf8xwv6/sjri=|1         |NULL    |unknown    |0          |0            |1             |4             |2017-01-30       |29                     |0.41261897344150716|0.7097045811394772  |
|8hue8gdk134vz1x/hx/vlccqjevesz/bvyxvo5/1z0g=|1         |NULL    |unknown    |0          |0            |1             |4        

                                                                                

# Train

In [61]:
df_train = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/train.csv"))

In [62]:
df_train.show()

+--------------------+--------+
|                msno|is_churn|
+--------------------+--------+
|waLDQMmcOu2jLDaV1...|       1|
|QA7uiXy8vIbUSPOkC...|       1|
|fGwBva6hikQmTJzrb...|       1|
|mT5V8rEpa+8wuqi6x...|       1|
|XaPhtGLk/5UvvOYHc...|       1|
|GBy8qSz16X5iYWD+3...|       1|
|lYLh7TdkWpIoQs3i3...|       1|
|T0FF6lumjKcqEO0O+...|       1|
|Nb1ZGEmagQeba5E+n...|       1|
|MkuWz0Nq6/Oq5fKqR...|       1|
|I8dFN2EjFN1mt4Xel...|       1|
|0Ip2rzeoa44alqEw3...|       1|
|piVhWxrWDmiNQFY6x...|       1|
|wEUOkYvyz3xTOx2p9...|       1|
|xt4EjWRyXBMgEgKBJ...|       1|
|QS3ob4zLlWcWzBIlb...|       1|
|9iW/UpqRoviya9CQh...|       1|
|d7QVMhAzjj4yc1Ojj...|       1|
|uV7rJjHPrpNssDMmY...|       1|
|TZxhkfZ9NwxqnUrNs...|       1|
+--------------------+--------+
only showing top 20 rows



In [63]:
df_train.count()

992931

In [64]:
df_train_v2 = (spark.read
      .option("header", True)
      .option("inferSchema", True)
      .csv("data/train_v2.csv"))

                                                                                

In [66]:
df_train_v2.count()

970960

In [9]:
df_members.head()

NameError: name 'df_members' is not defined

In [None]:
df_train_v2.show()

In [1]:
df_train_v2.show(5)

NameError: name 'df_train_v2' is not defined