In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, dayofmonth, to_date

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one
spark = (
    SparkSession.builder
    .appName("Bronze_to_Silver_Transactions")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/25 10:16:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Set the Spark log level to ERROR to cut down on console noise in later cells
spark.sparkContext.setLogLevel(logLevel="ERROR")

In [6]:
# Input (bronze) and output (silver) locations of the transactions dataset
bronze_path, silver_path = (
    "/app/datamart/bronze/transactions",
    "/app/datamart/silver/transactions",
)

In [7]:
# Read every parquet file under the bronze location into a single DataFrame
df_bronze = spark.read.format("parquet").load(bronze_path)

# Show the column names and types that were read
print("Bronze schema:")
df_bronze.printSchema()

                                                                                

Bronze schema:
root
 |-- msno: string (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_plan_days: integer (nullable = true)
 |-- plan_list_price: integer (nullable = true)
 |-- actual_amount_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- transaction_date: integer (nullable = true)
 |-- membership_expire_date: integer (nullable = true)
 |-- is_cancel: integer (nullable = true)
 |-- source_file: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



In [8]:
# Quick profile of the bronze data: row count, nulls per column, distinct values

from pyspark.sql.functions import col, count, sum as fsum

print("Bronze rows:", df_bronze.count())

# One aggregate per column: isNull() cast to 0/1 and summed gives the null count
null_count_exprs = []
for column_name in df_bronze.columns:
    null_flag = col(column_name).isNull().cast("int")
    null_count_exprs.append(fsum(null_flag).alias(f"null_{column_name}"))
df_bronze.select(null_count_exprs).show(vertical=True, truncate=False)

# Distinct-value count for each business column (one Spark job per column)
profiled_columns = (
    "msno",
    "payment_method_id",
    "payment_plan_days",
    "plan_list_price",
    "actual_amount_paid",
    "is_auto_renew",
    "transaction_date",
    "membership_expire_date",
    "is_cancel",
)
for column_name in profiled_columns:
    n_distinct = df_bronze.select(column_name).distinct().count()
    print(column_name, "distinct =", n_distinct)

                                                                                

Bronze rows: 22975416


                                                                                

-RECORD 0--------------------------
 null_msno                   | 0   
 null_payment_method_id      | 0   
 null_payment_plan_days      | 0   
 null_plan_list_price        | 0   
 null_actual_amount_paid     | 0   
 null_is_auto_renew          | 0   
 null_transaction_date       | 0   
 null_membership_expire_date | 0   
 null_is_cancel              | 0   
 null_source_file            | 0   
 null_year                   | 0   
 null_month                  | 0   



                                                                                

msno distinct = 2426143


                                                                                

payment_method_id distinct = 40


                                                                                

payment_plan_days distinct = 37


                                                                                

plan_list_price distinct = 55


                                                                                

actual_amount_paid distinct = 65


                                                                                

is_auto_renew distinct = 2


                                                                                

transaction_date distinct = 821


                                                                                

membership_expire_date distinct = 3473




is_cancel distinct = 2


                                                                                

In [9]:
# Look for repeated transactions

from pyspark.sql import functions as F

# Columns that together identify one transaction for duplicate detection
dup_keys = [
    "msno",
    "transaction_date",
    "membership_expire_date",
    "payment_plan_days",
    "plan_list_price",
    "actual_amount_paid",
    "is_auto_renew",
    "is_cancel",
]

# Compare the raw row count with the count after collapsing rows on dup_keys
total_rows = df_bronze.count()
unique_rows = df_bronze.dropDuplicates(dup_keys).count()
duplicate_rows = total_rows - unique_rows

print(f"Total rows: {total_rows:,}")
print(f"Unique rows: {unique_rows:,}")
print(f"Duplicate rows: {duplicate_rows:,}")

if duplicate_rows <= 0:
    print("✅ No duplicate records found.")
else:
    # Show up to 10 key combinations that occur more than once
    key_counts = df_bronze.groupBy(dup_keys).count()
    repeated_keys = key_counts.where(F.col("count") > 1)
    repeated_keys.show(10, truncate=False)

                                                                                

Total rows: 22,975,416
Unique rows: 22,975,412
Duplicate rows: 4




+--------------------------------------------+----------------+----------------------+-----------------+---------------+------------------+-------------+---------+-----+
|msno                                        |transaction_date|membership_expire_date|payment_plan_days|plan_list_price|actual_amount_paid|is_auto_renew|is_cancel|count|
+--------------------------------------------+----------------+----------------------+-----------------+---------------+------------------+-------------+---------+-----+
|NDivZ8g3qdPP0mA3pjtgV1lMt4nyz8hJWGNyiF5MpZI=|20160425        |20160525              |30               |149            |149               |1            |0        |2    |
|X6ymkvMfcOq35s2W3nj/yfhWSpqnLE8G7EuFXiuc0cw=|20151015        |20151115              |30               |149            |149               |1            |0        |2    |
|Zy0v1E2oiYxhlPdzA9pzSmuTlj5JIWs5fjBFbixFR7U=|20150611        |20150710              |30               |149            |149               |1          

                                                                                

In [8]:
# Delete duplicate rows

# Same composite key as the duplicate check, repeated so this cell can run on its own
dup_keys = [
    "msno", "transaction_date", "membership_expire_date",
    "payment_plan_days", "plan_list_price", "actual_amount_paid",
    "is_auto_renew", "is_cancel"
]

# Keep one row per dup_keys combination. Columns outside the key
# (payment_method_id, source_file, year, month) are NOT compared, so these are
# key-level duplicates rather than fully identical rows.
# NOTE(review): which of the matching rows is kept is presumably arbitrary —
# confirm this is acceptable if the non-key columns can differ.
df_bronze_dedup = df_bronze.dropDuplicates(dup_keys)

# Verify. Every count() launches a full Spark job (the dedup one includes a
# shuffle), so run each count once and reuse the numbers.
rows_before_dedup = df_bronze.count()
rows_after_dedup = df_bronze_dedup.count()
print("After deduplication:", rows_after_dedup)
print("Removed rows:", rows_before_dedup - rows_after_dedup)

25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:21 WARN RowBasedKeyValueBatch: Calling spill() on

After deduplication: 22975412


25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:49 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 14:53:50 WARN RowBasedKeyValueBatch: Calling spill() on

Removed rows: 4


                                                                                

In [None]:
# Cast transaction_date and membership_expire_date from int to date format
# Add year, month, and day columns for easier filtering and partitioning later

from pyspark.sql.functions import to_date, col, lpad, year, month, dayofmonth


def _yyyymmdd_to_date(column_name):
    """Return a date Column parsed from an integer yyyymmdd column.

    The value is cast to string, left-padded with "0" to 8 characters and
    parsed with the "yyyyMMdd" pattern.

    Args:
        column_name: Name of the integer column holding yyyymmdd values.

    Returns:
        A Column expression of date type.
    """
    return to_date(lpad(col(column_name).cast("string"), 8, "0"), "yyyyMMdd")


df_bronze_clean = (
    df_bronze_dedup
    # Convert integer yyyymmdd → string → date (keep same column names)
    .withColumn("transaction_date", _yyyymmdd_to_date("transaction_date"))
    .withColumn("membership_expire_date", _yyyymmdd_to_date("membership_expire_date"))
    # Calendar columns derived from the parsed transaction_date. The bronze
    # schema already has "year" and "month"; withColumn replaces them here.
    .withColumn("year", year(col("transaction_date")))
    .withColumn("month", month(col("transaction_date")))
    .withColumn("day", dayofmonth(col("transaction_date")))
)

# ✅ Verify schema and a few rows
df_bronze_clean.printSchema()
df_bronze_clean.select("msno", "transaction_date", "membership_expire_date", "year", "month", "day").show(5)

In [13]:
# Keep only rows with non-negative plan days, list price and amount paid, and
# an expiry date on or after the transaction date

from pyspark.sql.functions import col

# All four conditions must hold for a row to be kept
is_valid_row = (
    (col("payment_plan_days") >= 0)
    & (col("plan_list_price") >= 0)
    & (col("actual_amount_paid") >= 0)
    & (col("membership_expire_date") >= col("transaction_date"))
)
df_valid = df_bronze_clean.where(is_valid_row)

# ✅ Row counts on either side of the filter
print("Before filtering:", df_bronze_clean.count())
print("After filtering :", df_valid.count())

# ✅ Min/max of the dates and amounts that survived the filter
range_checks = [
    "min(transaction_date) as min_txn_date",
    "max(transaction_date) as max_txn_date",
    "min(membership_expire_date) as min_exp_date",
    "max(membership_expire_date) as max_exp_date",
    "min(plan_list_price) as min_price",
    "max(plan_list_price) as max_price",
    "min(actual_amount_paid) as min_paid",
    "max(actual_amount_paid) as max_paid",
]
df_valid.selectExpr(*range_checks).show()

25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:11 WARN RowBasedKeyValueBatch: Calling spill() on

Before filtering: 22975412


25/10/20 15:13:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/10/20 15:13:39 WARN RowBasedKeyValueBatch: Calling spill() on

After filtering : 22816909




+------------+------------+------------+------------+---------+---------+--------+--------+
|min_txn_date|max_txn_date|min_exp_date|max_exp_date|min_price|max_price|min_paid|max_paid|
+------------+------------+------------+------------+---------+---------+--------+--------+
|  2015-01-01|  2017-03-31|  2015-01-01|  2036-10-15|        0|     2000|       0|    2000|
+------------+------------+------------+------------+---------+---------+--------+--------+



                                                                                

In [None]:
# Create unique transaction_id for each transaction row

# uuid() is non-deterministic: every re-run of this notebook (the silver write
# uses mode("overwrite")) and every recomputation of the plan would hand the
# same transaction a different id. Derive the id from the row's content instead
# so it is stable across runs.
#
# dup_keys is the composite key the deduplication step collapsed on, so each
# remaining row has a distinct combination of these columns.
# NOTE(review): the id format changes from a 36-char UUID to a 64-char hex
# SHA-256 digest — confirm no downstream consumer depends on the UUID shape.
transaction_key_parts = [
    # Cast to string for hashing; coalesce so a null cannot silently drop out
    # of concat_ws and shift the remaining parts.
    F.coalesce(F.col(key_col).cast("string"), F.lit(""))
    for key_col in dup_keys
]

df_with_id = df_valid.withColumn(
    "transaction_id",
    F.sha2(F.concat_ws("|", *transaction_key_parts), 256)
)

In [None]:
# Persist the result to the silver layer as Parquet

# mode("overwrite") replaces existing data at the target on a re-run;
# partitionBy("year", "month") creates one folder level per partition column
silver_writer = df_with_id.write.mode("overwrite").partitionBy("year", "month")
silver_writer.parquet(silver_path)

print(f"✅ Silver layer successfully written to: {silver_path}")