In [2]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

import utils.data_processing_bronze_table
import utils.data_processing_silver_table
import utils.data_processing_gold_table


In [3]:
# Initialize SparkSession
spark = pyspark.sql.SparkSession.builder \
    .appName("dev") \
    .master("local[*]") \
    .getOrCreate()

# Set log level to ERROR to hide warnings
spark.sparkContext.setLogLevel("ERROR")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/07 12:56:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# create bronze datalake
bronze_lms_directory = "datamart/bronze/lms/"

if not os.path.exists(bronze_lms_directory):
    os.makedirs(bronze_lms_directory)

In [30]:
# ----------------------------
# 1️⃣ Read lms_loan_daily.csv
# ----------------------------
lms_path = "data/lms_loan_daily.csv"
df_lms = spark.read.csv(lms_path, header=True, inferSchema=True)
print("✅ LMS Loan Daily Data:")
df_lms.show(5, truncate=False)
df_lms.printSchema()

# ----------------------------
# 2️⃣ Read features_attributes.csv
# ----------------------------
attr_path = "data/features_attributes.csv"
df_attr = spark.read.csv(attr_path, header=True, inferSchema=True)
print("\n✅ Customer Attributes Data:")
df_attr.show(5, truncate=False)
df_attr.printSchema()

# ----------------------------
# 3️⃣ Read features_financials.csv
# ----------------------------
fin_path = "data/features_financials.csv"
df_fin = spark.read.csv(fin_path, header=True, inferSchema=True)
print("\n✅ Financial Features Data:")
df_fin.show(5, truncate=False)
df_fin.printSchema()

# ----------------------------
# 4️⃣ Read feature_clickstream.csv
# ----------------------------
click_path = "data/feature_clickstream.csv"
df_click = spark.read.csv(click_path, header=True, inferSchema=True)
print("\n✅ Clickstream Features Data:")
df_click.show(5, truncate=False)
df_click.printSchema()

# ----------------------------
# Optional: Count rows for a quick sanity check
# ----------------------------
print(f"""
Row Counts:
  LMS: {df_lms.count()}
  ATTR: {df_attr.count()}
  FIN: {df_fin.count()}
  CLICK: {df_click.count()}
""")



✅ LMS Loan Daily Data:
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+
|loan_id              |Customer_ID|loan_start_date|tenure|installment_num|loan_amt|due_amt|paid_amt|overdue_amt|balance|snapshot_date|
+---------------------+-----------+---------------+------+---------------+--------+-------+--------+-----------+-------+-------------+
|CUS_0x1000_2023_05_01|CUS_0x1000 |2023-05-01     |10    |0              |10000   |0.0    |0.0     |0.0        |10000.0|2023-05-01   |
|CUS_0x1000_2023_05_01|CUS_0x1000 |2023-05-01     |10    |1              |10000   |1000.0 |1000.0  |0.0        |9000.0 |2023-06-01   |
|CUS_0x1000_2023_05_01|CUS_0x1000 |2023-05-01     |10    |2              |10000   |1000.0 |1000.0  |0.0        |8000.0 |2023-07-01   |
|CUS_0x1000_2023_05_01|CUS_0x1000 |2023-05-01     |10    |3              |10000   |1000.0 |0.0     |1000.0     |8000.0 |2023-08-01   |
|CUS_0x1000_2023_05_01|CUS_0x100

## Check df_attr

In [7]:
df_attr.show()

+-----------+-----------------+---+-----------+-------------+-------------+
|Customer_ID|             Name|Age|        SSN|   Occupation|snapshot_date|
+-----------+-----------------+---+-----------+-------------+-------------+
| CUS_0x1000|   Alistair Barrf| 18|913-74-1218|       Lawyer|   2023-05-01|
| CUS_0x1009|           Arunah| 26|063-67-6938|     Mechanic|   2025-01-01|
| CUS_0x100b|         Shirboni| 19|  #F%$D@*&8|Media_Manager|   2024-03-01|
| CUS_0x1011|        Schneyerh| 44|793-05-8223|       Doctor|   2023-11-01|
| CUS_0x1013|         Cameront| 44|930-49-9615|     Mechanic|   2023-12-01|
| CUS_0x1015|          Holtono| 27|810-97-7024|   Journalist|   2023-08-01|
| CUS_0x1018|      Felsenthalq| 15|731-19-8119|   Accountant|   2023-11-01|
| CUS_0x1026|          Josephv| 52|500-62-9044|      Manager|   2023-10-01|
| CUS_0x102d| Neil Chatterjeex| 31|692-71-7552| Entrepreneur|   2024-01-01|
| CUS_0x102e|            Rhysn| 26|  #F%$D@*&8|    Scientist|   2024-04-01|
| CUS_0x1032

A few problematic inconsistencies are seen. Age is in string and some have an underscore after the value. Occupation has empty values that are filled with underscores. SSN has problematic values that are == #F%$D@*&8

### Fix SSN

In [8]:
df_attr.filter(F.col("SSN") == "#F%$D@*&8").count()

703

In [9]:
df_attr = df_attr.withColumn(
    "SSN",
    F.when(F.col("SSN") == "#F%$D@*&8", "000-00-0000")
     .otherwise(F.col("SSN"))
)

In [10]:
df_attr.show()

+-----------+-----------------+---+-----------+-------------+-------------+
|Customer_ID|             Name|Age|        SSN|   Occupation|snapshot_date|
+-----------+-----------------+---+-----------+-------------+-------------+
| CUS_0x1000|   Alistair Barrf| 18|913-74-1218|       Lawyer|   2023-05-01|
| CUS_0x1009|           Arunah| 26|063-67-6938|     Mechanic|   2025-01-01|
| CUS_0x100b|         Shirboni| 19|000-00-0000|Media_Manager|   2024-03-01|
| CUS_0x1011|        Schneyerh| 44|793-05-8223|       Doctor|   2023-11-01|
| CUS_0x1013|         Cameront| 44|930-49-9615|     Mechanic|   2023-12-01|
| CUS_0x1015|          Holtono| 27|810-97-7024|   Journalist|   2023-08-01|
| CUS_0x1018|      Felsenthalq| 15|731-19-8119|   Accountant|   2023-11-01|
| CUS_0x1026|          Josephv| 52|500-62-9044|      Manager|   2023-10-01|
| CUS_0x102d| Neil Chatterjeex| 31|692-71-7552| Entrepreneur|   2024-01-01|
| CUS_0x102e|            Rhysn| 26|000-00-0000|    Scientist|   2024-04-01|
| CUS_0x1032

### Fix Occupation

In [169]:
df_attr.select("Occupation") \
    .distinct() \
    .orderBy("Occupation") \
    .show(100, truncate=False)


+-------------+
|Occupation   |
+-------------+
|Accountant   |
|Architect    |
|Developer    |
|Doctor       |
|Engineer     |
|Entrepreneur |
|Journalist   |
|Lawyer       |
|Manager      |
|Mechanic     |
|Media_Manager|
|Musician     |
|Scientist    |
|Teacher      |
|Writer       |
|_______      |
+-------------+



In [170]:
df_attr.filter(F.col("Occupation") == "_______").count()


880

In [171]:
df_attr = df_attr.withColumn(
    "Occupation",
    F.when(F.col("Occupation") == "_______", "Unemployed")
     .otherwise(F.col("Occupation"))
)

In [172]:
df_attr.select("Occupation") \
    .distinct() \
    .orderBy("Occupation") \
    .show(100, truncate=False)


+-------------+
|Occupation   |
+-------------+
|Accountant   |
|Architect    |
|Developer    |
|Doctor       |
|Engineer     |
|Entrepreneur |
|Journalist   |
|Lawyer       |
|Manager      |
|Mechanic     |
|Media_Manager|
|Musician     |
|Scientist    |
|Teacher      |
|Unemployed   |
|Writer       |
+-------------+



### Fix Age

In [11]:
df_attr.filter(~F.col("Age").rlike("^[0-9]+$")).select("Customer_ID", "Name", "Age").show(50, truncate=False)


+-----------+----------------------+-----+
|Customer_ID|Name                  |Age  |
+-----------+----------------------+-----+
|CUS_0x1032 |Wahbap                |40_  |
|CUS_0x1057 |David Sheppardv       |46_  |
|CUS_0x10e7 |Carewj                |3843_|
|CUS_0x10ee |Hudsonb               |30_  |
|CUS_0x111c |Deepaa                |24_  |
|CUS_0x1135 |Baertleinc            |22_  |
|CUS_0x1139 |Copleyp               |32_  |
|CUS_0x11ac |Liana B.v             |26_  |
|CUS_0x1204 |Lashg                 |46_  |
|CUS_0x1288 |Taylorq               |34_  |
|CUS_0x13a8 |Baileyz               |41_  |
|CUS_0x13c2 |Menonv                |40_  |
|CUS_0x13e4 |Edward Krudyl         |1248_|
|CUS_0x1430 |Chiango               |30_  |
|CUS_0x14a3 |Prustyx               |-500 |
|CUS_0x14f4 |Richardc              |23_  |
|CUS_0x14f5 |Richard Leongd        |41_  |
|CUS_0x157d |Thompsonf             |24_  |
|CUS_0x1600 |Megan Daviesb         |30_  |
|CUS_0x1604 |Sam Forgioneb         |43_  |
|CUS_0x163c

In [12]:
df_attr.filter((F.col("Age") < 0) | (F.col("Age") > 100)) \
    .select("Age") \
    .distinct() \
    .orderBy("Age") \
    .show()

+----+
| Age|
+----+
|-500|
|1004|
|1022|
|1066|
|1087|
|1094|
|1149|
|1203|
|1220|
|1265|
|1388|
|1418|
|1520|
|1683|
| 169|
|1733|
|1792|
|1810|
|1814|
|1990|
+----+
only showing top 20 rows



In [13]:
df_attr = df_attr.withColumn(
    "Age",
    F.when((F.col("Age") >= 0) & (F.col("Age") <= 100), F.col("Age"))
     .otherwise(None)
)


In [14]:
df_attr.filter((F.col("Age") < 0) | (F.col("Age") > 100)) \
    .select("Age") \
    .distinct() \
    .orderBy("Age") \
    .show()

+---+
|Age|
+---+
+---+



Set age to None if problematic

## DF fin

### Clean all numeric variables

In [31]:
def clean_numeric_column(df, column_name):
    """
    Cleans a numeric column in a PySpark DataFrame:
    1. Removes stray underscores from numeric values.
    2. Casts the column to FloatType.
    3. Uses IQR to detect outliers.
    4. Replaces outliers with None.
    
    Args:
        df (DataFrame): Input PySpark DataFrame
        column_name (str): Name of the numeric column to clean

    Returns:
        DataFrame: Cleaned DataFrame with outliers replaced by None
    """
    # Remove underscores and cast to float
    dtype_lookup = dict(df.dtypes).get(column_name)
    if dtype_lookup == "string":
        df = df.withColumn(column_name, F.regexp_replace(F.col(column_name), "_", ""))

    df = df.withColumn(column_name, F.col(column_name).cast(FloatType()))

    # Compute IQR boundaries
    q1, q3 = df.approxQuantile(column_name, [0.25, 0.75], 0.01)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # Replace outliers with iqr ranges
    df = df.withColumn(
        column_name,
        F.when((F.col(column_name) >= lower) & (F.col(column_name) <= upper),
               F.col(column_name))
         .otherwise(F.lit(None))
    )

    print(f"[CLEAN] Column '{column_name}' cleaned. IQR bounds: ({lower:.2f}, {upper:.2f})")

    return df

In [32]:
df_fin = spark.read.csv(fin_path, header=True, inferSchema=True)

In [33]:
df_fin.dtypes

[('Customer_ID', 'string'),
 ('Annual_Income', 'string'),
 ('Monthly_Inhand_Salary', 'double'),
 ('Num_Bank_Accounts', 'int'),
 ('Num_Credit_Card', 'int'),
 ('Interest_Rate', 'int'),
 ('Num_of_Loan', 'string'),
 ('Type_of_Loan', 'string'),
 ('Delay_from_due_date', 'int'),
 ('Num_of_Delayed_Payment', 'string'),
 ('Changed_Credit_Limit', 'string'),
 ('Num_Credit_Inquiries', 'double'),
 ('Credit_Mix', 'string'),
 ('Outstanding_Debt', 'string'),
 ('Credit_Utilization_Ratio', 'double'),
 ('Credit_History_Age', 'string'),
 ('Payment_of_Min_Amount', 'string'),
 ('Total_EMI_per_month', 'double'),
 ('Amount_invested_monthly', 'string'),
 ('Payment_Behaviour', 'string'),
 ('Monthly_Balance', 'string'),
 ('snapshot_date', 'date')]

In [34]:


df_fin = clean_numeric_column(df_fin, "Annual_Income")
df_fin = clean_numeric_column(df_fin, "Monthly_Inhand_Salary")
df_fin = clean_numeric_column(df_fin, "Num_Bank_Accounts")
df_fin = clean_numeric_column(df_fin, "Num_Credit_Card")
df_fin = clean_numeric_column(df_fin, "Interest_Rate")
df_fin = clean_numeric_column(df_fin, "Num_of_Loan")
df_fin = clean_numeric_column(df_fin, "Delay_from_due_date")
df_fin = clean_numeric_column(df_fin, "Num_of_Delayed_Payment")
df_fin = clean_numeric_column(df_fin, "Changed_Credit_Limit")
df_fin = clean_numeric_column(df_fin, "Num_Credit_Inquiries")
df_fin = clean_numeric_column(df_fin, "Outstanding_Debt")
df_fin = clean_numeric_column(df_fin, "Credit_Utilization_Ratio")
df_fin = clean_numeric_column(df_fin, "Total_EMI_per_month")
df_fin = clean_numeric_column(df_fin, "Amount_invested_monthly")
df_fin = clean_numeric_column(df_fin, "Monthly_Balance")



[CLEAN] Column 'Annual_Income' cleaned. IQR bounds: (-59626.93, 150522.66)
[CLEAN] Column 'Monthly_Inhand_Salary' cleaned. IQR bounds: (-4820.54, 12303.13)
[CLEAN] Column 'Num_Bank_Accounts' cleaned. IQR bounds: (-3.00, 13.00)
[CLEAN] Column 'Num_Credit_Card' cleaned. IQR bounds: (-0.50, 11.50)
[CLEAN] Column 'Interest_Rate' cleaned. IQR bounds: (-12.50, 39.50)
[CLEAN] Column 'Num_of_Loan' cleaned. IQR bounds: (-5.00, 11.00)
[CLEAN] Column 'Delay_from_due_date' cleaned. IQR bounds: (-17.00, 55.00)
[CLEAN] Column 'Num_of_Delayed_Payment' cleaned. IQR bounds: (-4.50, 31.50)
[CLEAN] Column 'Changed_Credit_Limit' cleaned. IQR bounds: (-8.52, 28.40)
[CLEAN] Column 'Num_Credit_Inquiries' cleaned. IQR bounds: (-7.50, 20.50)
[CLEAN] Column 'Outstanding_Debt' cleaned. IQR bounds: (-1474.93, 3928.24)
[CLEAN] Column 'Credit_Utilization_Ratio' cleaned. IQR bounds: (14.98, 49.45)
[CLEAN] Column 'Total_EMI_per_month' cleaned. IQR bounds: (-173.28, 369.76)
[CLEAN] Column 'Amount_invested_monthly' cle

### Change credit history age to num of months

In [35]:
df_fin = df_fin.withColumn(
    "Credit_History_Age",
    F.regexp_replace(F.col("Credit_History_Age"), "_", "")  # remove underscores if any
)

df_fin = df_fin.withColumn(
    "Years",
    F.regexp_extract(F.col("Credit_History_Age"), r"(\d+)\s+Years", 1).cast("int")
).withColumn(
    "Months",
    F.regexp_extract(F.col("Credit_History_Age"), r"(\d+)\s+Months", 1).cast("int")
)

df_fin = df_fin.withColumn(
    "Credit_History_Age_Months",
    (F.col("Years") * 12 + F.col("Months"))
)

# Drop the old columns
df_fin = df_fin.drop("Years", "Months", "Credit_History_Age")

# rename for simplicity
df_fin = df_fin.withColumnRenamed("Credit_History_Age_Months", "Credit_History_Age")

df_fin.select("Customer_ID", "Credit_History_Age").show(10, truncate=False)


+-----------+------------------+
|Customer_ID|Credit_History_Age|
+-----------+------------------+
|CUS_0x1000 |129               |
|CUS_0x1009 |372               |
|CUS_0x100b |190               |
|CUS_0x1011 |190               |
|CUS_0x1013 |214               |
|CUS_0x1015 |257               |
|CUS_0x1018 |171               |
|CUS_0x1026 |248               |
|CUS_0x102d |363               |
|CUS_0x102e |274               |
+-----------+------------------+
only showing top 10 rows



### Fix Categorical variables: (Payment_of_Min_Amount, Payment_behaviour)

In [36]:
df_fin = df_fin.withColumn(
    "Payment_of_Min_Amount",
    F.trim(F.lower(F.col("Payment_of_Min_Amount")))  # normalize case
)

df_fin = df_fin.withColumn(
    "Payment_of_Min_Amount",
    F.when(F.col("Payment_of_Min_Amount").isin("yes", "y"), "Yes")
     .when(F.col("Payment_of_Min_Amount").isin("no", "n"), "No")
     .when(F.col("Payment_of_Min_Amount").isin("nm", "not mentioned", "na", "none"), None)
     .otherwise(F.col("Payment_of_Min_Amount"))  # keep valid ones
)

df_fin.select("Payment_of_Min_Amount").distinct().show()

+---------------------+
|Payment_of_Min_Amount|
+---------------------+
|                   No|
|                  Yes|
|                 NULL|
+---------------------+



In [37]:
df_fin = df_fin.withColumn(
    "Payment_Behaviour",
    F.when(F.col("Payment_Behaviour") == "!@9#%8", "Unknown")
     .otherwise(F.col("Payment_Behaviour"))
)


## Load all silver parquet files (EDA for gold)

In [200]:
parquet_files = glob.glob(os.path.join("datamart/silver/attributes", "*.parquet"))

df_attr = spark.read.parquet(*parquet_files)
df_attr

DataFrame[Customer_ID: string, Name: string, Age: int, SSN: string, Occupation: string, snapshot_date: date]

In [201]:
df_attr.count()

12500

In [202]:
# check for null values

df_attr.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c) 
    for c in df_attr.columns
]).show()


# drop age, name, ssn,
df_attr = df_attr.drop("Age", "Name", "SSN")  # drop columns with too many nulls
df_attr.show()

+-----------+----+---+---+----------+-------------+
|Customer_ID|Name|Age|SSN|Occupation|snapshot_date|
+-----------+----+---+---+----------+-------------+
|          0|   0|319|  0|         0|            0|
+-----------+----+---+---+----------+-------------+

+-----------+------------+-------------+
|Customer_ID|  Occupation|snapshot_date|
+-----------+------------+-------------+
| CUS_0x10ac|   Developer|   2024-08-01|
| CUS_0x10c5|  Unemployed|   2024-08-01|
| CUS_0x1145|     Teacher|   2024-08-01|
| CUS_0x11ac|  Journalist|   2024-08-01|
| CUS_0x122c|Entrepreneur|   2024-08-01|
| CUS_0x1274|   Scientist|   2024-08-01|
| CUS_0x1288|      Doctor|   2024-08-01|
| CUS_0x12cc|   Developer|   2024-08-01|
| CUS_0x1338|  Unemployed|   2024-08-01|
| CUS_0x1370|      Writer|   2024-08-01|
| CUS_0x1378|    Mechanic|   2024-08-01|
| CUS_0x139b|   Scientist|   2024-08-01|
| CUS_0x13a9|     Manager|   2024-08-01|
| CUS_0x13ce|     Manager|   2024-08-01|
| CUS_0x147a|      Writer|   2024-08-01|
|

In [43]:
parquet_files = glob.glob(os.path.join("datamart/silver/fin", "*.parquet"))

df_fin = spark.read.parquet(*parquet_files)
pd_df = df_fin.limit(5).toPandas()
pd_df



AnalysisException: [UNABLE_TO_INFER_SCHEMA] Unable to infer schema for Parquet. It must be specified manually.

In [44]:
# remove type_of_loan

df_fin = df_fin.drop("Type_of_Loan")  # drop useless col

df_fin.count()

12500

In [205]:
null_counts = df_fin.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_fin.columns
]).toPandas()

print(null_counts.T)  


                             0
Customer_ID                  0
Annual_Income              371
Monthly_Inhand_Salary      298
Num_Bank_Accounts          168
Num_Credit_Card            296
Interest_Rate              270
Num_of_Loan                582
Delay_from_due_date        522
Num_of_Delayed_Payment     105
Changed_Credit_Limit       429
Num_Credit_Inquiries       202
Credit_Mix                   0
Outstanding_Debt           743
Credit_Utilization_Ratio     0
Payment_of_Min_Amount     1438
Total_EMI_per_month        995
Amount_invested_monthly   1406
Payment_Behaviour            0
Monthly_Balance           1029
snapshot_date                0
Credit_History_Age           0


In [206]:
rows = df_fin.count()
nulls = df_fin.select([
    (F.count(F.when(F.col(c).isNull(), c)) / rows * 100).alias(c)
    for c in df_fin.columns
])

# Convert to Pandas and transpose for readability
nulls_pdf = nulls.toPandas().T
nulls_pdf.columns = ['% Null']
print(nulls_pdf)



                          % Null
Customer_ID                0.000
Annual_Income              2.968
Monthly_Inhand_Salary      2.384
Num_Bank_Accounts          1.344
Num_Credit_Card            2.368
Interest_Rate              2.160
Num_of_Loan                4.656
Delay_from_due_date        4.176
Num_of_Delayed_Payment     0.840
Changed_Credit_Limit       3.432
Num_Credit_Inquiries       1.616
Credit_Mix                 0.000
Outstanding_Debt           5.944
Credit_Utilization_Ratio   0.000
Payment_of_Min_Amount     11.504
Total_EMI_per_month        7.960
Amount_invested_monthly   11.248
Payment_Behaviour          0.000
Monthly_Balance            8.232
snapshot_date              0.000
Credit_History_Age         0.000


There is a significant portion of Null values inside df_fin. We will drop those with more than 5% null values. 

In [42]:
# Calculate null percentages
rows = df_fin.count()
nulls = df_fin.select([
    (F.count(F.when(F.col(c).isNull(), c)) / rows * 100).alias(c)
    for c in df_fin.columns
])

# Collect null percentages into a dict
nulls_dict = nulls.collect()[0].asDict()

# Get columns to drop
cols_to_drop = [col for col, pct in nulls_dict.items() if pct > 5]

# Drop columns from DataFrame
df_fin = df_fin.drop(*cols_to_drop)

print("Dropped columns:", cols_to_drop)

Dropped columns: ['Type_of_Loan', 'Outstanding_Debt', 'Payment_of_Min_Amount', 'Total_EMI_per_month', 'Amount_invested_monthly', 'Monthly_Balance']


Since only a handful of rows are left that are null we will just drop them

In [208]:
df_fin = df_fin.dropna()

In [209]:
rows = df_fin.count()
nulls = df_fin.select([
    (F.count(F.when(F.col(c).isNull(), c)) / rows * 100).alias(c)
    for c in df_fin.columns
])

# Convert to Pandas and transpose for readability
nulls_pdf = nulls.toPandas().T
nulls_pdf.columns = ['% Null']
print(nulls_pdf)


                          % Null
Customer_ID                  0.0
Annual_Income                0.0
Monthly_Inhand_Salary        0.0
Num_Bank_Accounts            0.0
Num_Credit_Card              0.0
Interest_Rate                0.0
Num_of_Loan                  0.0
Delay_from_due_date          0.0
Num_of_Delayed_Payment       0.0
Changed_Credit_Limit         0.0
Num_Credit_Inquiries         0.0
Credit_Mix                   0.0
Credit_Utilization_Ratio     0.0
Payment_Behaviour            0.0
snapshot_date                0.0
Credit_History_Age           0.0


no more null values

In [210]:
clickstream_directory = "datamart/bronze/clickstream/"
df_click = spark.read.csv(clickstream_directory, header=True, inferSchema=True)
df_click.count()

215376

### Check for null inside df_click

In [211]:
# Count total rows
rows = df_click.count()

# Calculate % of nulls per column
nulls = df_click.select([
    (F.count(F.when(F.col(c).isNull(), c)) / rows * 100).alias(c)
    for c in df_click.columns
])

# Display neatly
nulls_df = nulls.toPandas().T
nulls_df.columns = ['% Null']
print(nulls_df)

               % Null
fe_1              0.0
fe_2              0.0
fe_3              0.0
fe_4              0.0
fe_5              0.0
fe_6              0.0
fe_7              0.0
fe_8              0.0
fe_9              0.0
fe_10             0.0
fe_11             0.0
fe_12             0.0
fe_13             0.0
fe_14             0.0
fe_15             0.0
fe_16             0.0
fe_17             0.0
fe_18             0.0
fe_19             0.0
fe_20             0.0
Customer_ID       0.0
snapshot_date     0.0


### merge all customer data

In [212]:
merged_df = (
    df_fin
    .join(df_attr, ["Customer_ID", "snapshot_date"], "inner")
    .join(df_click, ["Customer_ID", "snapshot_date"], "inner")
)



In [213]:
merged_df

DataFrame[Customer_ID: string, snapshot_date: date, Annual_Income: float, Monthly_Inhand_Salary: float, Num_Bank_Accounts: float, Num_Credit_Card: float, Interest_Rate: float, Num_of_Loan: float, Delay_from_due_date: float, Num_of_Delayed_Payment: float, Changed_Credit_Limit: float, Num_Credit_Inquiries: float, Credit_Mix: string, Credit_Utilization_Ratio: float, Payment_Behaviour: string, Credit_History_Age: int, Occupation: string, fe_1: int, fe_2: int, fe_3: int, fe_4: int, fe_5: int, fe_6: int, fe_7: int, fe_8: int, fe_9: int, fe_10: int, fe_11: int, fe_12: int, fe_13: int, fe_14: int, fe_15: int, fe_16: int, fe_17: int, fe_18: int, fe_19: int, fe_20: int]