In [34]:
import os
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
import pprint
import pyspark
import pyspark.sql.functions as F

from pyspark.sql.functions import to_date, col
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

from pyspark.sql.functions import col, year, month, dayofmonth, to_date

In [35]:
# Build (or reuse, if one is already running) the notebook's SparkSession:
# local mode on all available cores, with a 4 GB driver heap requested.
spark = (
    SparkSession.builder
    .appName("dev")
    .config("spark.driver.memory", "4g")
    .master("local[*]")
    .getOrCreate()
)

In [36]:
# Raise Spark's log threshold to ERROR so lower-severity messages do not
# clutter the cell outputs.
spark.sparkContext.setLogLevel("ERROR")

In [37]:
# Define paths
# Input: Silver-layer transactions, read as parquet further down.
silver_transactions_path = "/app/datamart/silver/transactions" 
# Output: where the per-user latest-transaction table is written at the end.
silver_latest_transactions_path = "/app/datamart/silver/latest_transactions"

In [38]:
# Load all Silver parquet files found under the transactions path into one DataFrame.
df_silver = spark.read.parquet(silver_transactions_path)
# Print the schema so the available columns and their types are visible
# before any transformation is applied.
print("Silver Transactions schema:")
df_silver.printSchema()

Silver Transactions schema:
root
 |-- msno: string (nullable = true)
 |-- payment_method_id: integer (nullable = true)
 |-- payment_plan_days: integer (nullable = true)
 |-- plan_list_price: integer (nullable = true)
 |-- actual_amount_paid: integer (nullable = true)
 |-- is_auto_renew: integer (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- membership_expire_date: date (nullable = true)
 |-- is_cancel: integer (nullable = true)
 |-- source_file: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- transaction_id: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)



# Set the inference date (hard-coded placeholder)

In [39]:
# Cut-off date as a "YYYY-MM-DD" string. It is parsed with F.to_date in the
# filter cell and used as the inclusive upper bound on transaction_date.
inference_date = "2017-03-01"
print(f"Today's date is set to: {inference_date}")

Today's date is set to: 2017-03-01


In [40]:
# Keep only the transactions dated on or before the inference date, so that
# nothing after the cut-off can be picked as a user's "latest" transaction.
df_silver_filtered = df_silver.where(
    F.to_date(F.col("transaction_date")) <= F.to_date(F.lit(inference_date))
)

In [41]:
# 1) Add 'total_plan_days': the number of days from transaction_date to
#    membership_expire_date (datediff(end, start)).
df_silver_filtered = df_silver_filtered.withColumn(
    "total_plan_days",
    F.datediff("membership_expire_date", "transaction_date"),
)

# Per-user window (msno is the member ID), newest transaction first.
window_spec = Window.partitionBy("msno").orderBy(F.desc("transaction_date"))

In [42]:
# 2) Attach each user's most recent transaction_date to every one of their rows.
#    The window is deliberately left unordered: an ordered window defaults to a
#    running frame (start of partition up to the current row), so max() over it
#    only equals the per-user maximum when the sort happens to be descending.
#    An unordered partition window always spans the whole partition and also
#    avoids an unnecessary sort.
df_silver_filtered = df_silver_filtered.withColumn(
    "max_transaction_date",
    F.max("transaction_date").over(Window.partitionBy("msno"))
)

# 3) Keep only the rows dated on that most recent day. A user may have several
#    transactions on the same day, so this can still yield multiple (tied)
#    rows per user; the following cells choose between them.
df_tied_rows = df_silver_filtered.filter(
    F.col("transaction_date") == F.col("max_transaction_date")
)

In [43]:
# --- Custom logic within the tied rows ---

# Per-user window, longest plan first; used to rank the tied rows.
tied_row_window_rank = Window.partitionBy("msno").orderBy(F.desc("total_plan_days"))

# plan_days_rank == 1 marks the row(s) with the largest total_plan_days for
# the user. rank() leaves gaps after ties.
df_ranked = df_tied_rows.withColumn(
    "plan_days_rank",
    F.rank().over(tied_row_window_rank),
)

# One row per user: the maximum is_cancel value found among the user's tied rows.
df_cancellation_exists = (
    df_tied_rows
    .groupBy("msno")
    .agg(F.max("is_cancel").alias("cancellation_in_group"))
)

# Bring the per-user flag back onto every ranked row; a missing flag becomes 0.
df_final = (
    df_ranked
    .join(df_cancellation_exists, on="msno", how="left")
    .na.fill({"cancellation_in_group": 0})
)

In [44]:
# 4 & 5) Create the final selection metric (final_priority).
# A row gets final_priority = 1 when any one of these holds, otherwise 0:
#   A. the user's tied group contains a cancellation, this row is itself a
#      cancellation, and it has the 2nd-highest total_plan_days (rank 2);
#   B. the group contains no cancellation and this row has the highest
#      total_plan_days (rank 1);
#   C. the group contains a cancellation, and this row has rank 1 but is not
#      itself a cancellation.
# The earlier version repeated branch B a second time at the end of the chain;
# that copy could never match and has been removed (results are unchanged).
# NOTE(review): A and C can both match within one user's group (a rank-1
# non-cancel row next to a rank-2 cancel row), so this flag alone does not
# guarantee a single row per user - confirm that is intended.
# NOTE(review): no later cell visible in this notebook reads df_final.
df_final = df_final.withColumn(
    "final_priority",
    F.when(
        # A: cancellation row sitting at the 2nd-highest plan length
        (F.col("cancellation_in_group") == 1) &
        (F.col("is_cancel") == 1) &
        (F.col("plan_days_rank") == 2),
        1
    ).when(
        # B: no cancellation in the group -> take the longest plan
        (F.col("cancellation_in_group") == 0) &
        (F.col("plan_days_rank") == 1),
        1
    ).when(
        # C: cancellation in the group, but this rank-1 row is not a cancellation
        (F.col("cancellation_in_group") == 1) &
        (F.col("plan_days_rank") == 1) &
        (F.col("is_cancel") != 1),
        1
    ).otherwise(0)
)

In [45]:
# Pick exactly one "latest" transaction per user from the rows tied on the
# user's most recent transaction_date, using one explicit sort order instead
# of the multi-branch priority flag.
#
# The rank by total_plan_days is materialised as a helper column first, so the
# final ordering refers only to plain columns rather than nesting one window
# function inside another window's ORDER BY. Both helper columns are dropped
# again, so the output columns are the same as those of df_tied_rows.
df_latest_transaction_final = (
    df_tied_rows
    .withColumn("plan_days_rank", F.rank().over(tied_row_window_rank))
    .withColumn(
        "custom_rank",
        F.row_number().over(
            Window.partitionBy("msno").orderBy(
                # Priority 1: a cancellation at the 2nd-highest plan length wins.
                F.when(
                    (F.col("is_cancel") == 1) & (F.col("plan_days_rank") == 2), 1
                ).otherwise(0).desc(),
                # Priority 2: otherwise prefer the longest plan. The column is
                # referenced with its exact lower-case name so the lookup does
                # not depend on case-insensitive resolution.
                F.col("total_plan_days").desc(),
                # Priority 3: deterministic tie-breaker. Every row here already
                # shares the same transaction_date, so sorting on that date
                # could not separate ties and row_number() would pick one
                # arbitrarily.
                # NOTE(review): assumes transaction_id distinguishes rows
                # within a user - confirm.
                F.col("transaction_id").asc(),
            )
        ),
    )
    .filter(F.col("custom_rank") == 1)
    .drop("custom_rank", "plan_days_rank")
)


In [46]:
# Preview a few of the selected rows (one row per user after the custom_rank
# filter). total_plan_days is referenced with the exact lower-case name it was
# created with, so the lookup does not rely on case-insensitive resolution.
df_latest_transaction_final.select(
    "msno", "transaction_date", "total_plan_days", "is_cancel"
).show(5)

[Stage 16:>                                                         (0 + 1) / 1]

+--------------------+----------------+---------------+---------+
|                msno|transaction_date|Total_plan_days|is_cancel|
+--------------------+----------------+---------------+---------+
|++0GCV3WGMjibrwCn...|      2016-11-25|              7|        0|
|++3hfQtTKeHLVuBHI...|      2016-02-15|              0|        1|
|++4RuqBw0Ss6bQU4o...|      2017-02-13|             28|        0|
|++5Z7z4xXBhCjID+B...|      2017-02-23|             28|        0|
|++6P09mCSJSh+Ft2p...|      2017-02-28|             44|        0|
+--------------------+----------------+---------------+---------+
only showing top 5 rows



                                                                                

In [47]:
# Persist the per-user latest transactions to the Silver layer as Parquet.
# No partitionBy is applied, and "overwrite" replaces whatever was previously
# stored at the target path.
df_latest_transaction_final.write.parquet(
    silver_latest_transactions_path,
    mode="overwrite",
)

print(f"✅ Silver layer (Latest_transactions) successfully written to: {silver_latest_transactions_path}")

                                                                                

✅ Silver layer (Latest_transactions) successfully written to: /app/datamart/silver/latest_transactions
