In [4]:
from pyspark.sql import SparkSession
import sys

# Make the modules under ../src importable from this notebook.
# Guarded so that re-running the cell does not append duplicate entries.
if '../src' not in sys.path:
    sys.path.append('../src')

# Stop a session left over from an earlier run of this cell, if there is one,
# so the builder below starts from a clean state.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Remain best-effort (do not abort the notebook), but report the failure
    # instead of hiding it. KeyboardInterrupt/SystemExit are no longer swallowed.
    print(f"Warning: could not stop the existing SparkSession: {exc!r}")

# Create a new SparkSession.
# NOTE(review): spark.driver.memory may only take effect when the driver JVM is
# first launched; after a stop() inside the same kernel the old JVM may be
# reused and this value ignored - confirm, restart the kernel if in doubt.
spark = (
    SparkSession.builder
    .appName("ChurnPrediction")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

# Import functions for feature engineering
from feature_demographics import aggregate_demographics 
from feature_transactions import aggregate_transactions
from feature_logs import aggregate_logs

In [5]:
# Read the four raw CSV inputs. Every file has a header row and its column
# types are inferred by Spark, so the reader options are declared once.
csv_options = {"header": True, "inferSchema": True}

train_df = spark.read.options(**csv_options).csv("../dataset/train_combined.csv")
members_df = spark.read.options(**csv_options).csv("../dataset/members_v3.csv")
transactions_df = spark.read.options(**csv_options).csv("../dataset/transactions_combined.csv")
user_logs_df = spark.read.options(**csv_options).csv(
    "../dataset/user_logs_combined/user_logs_combined.csv"
)

In [6]:
# Build the three feature sets from their raw inputs and mark each result
# as cached (cache() returns the same DataFrame, so it can be chained).
demo_features = aggregate_demographics(members_df).cache()
trans_features = aggregate_transactions(transactions_df).cache()
log_features = aggregate_logs(user_logs_df).cache()

# Last expression of the cell: displays the log-feature DataFrame's schema.
log_features

Memulai pre-processing demografi...
Pre-processing demografi selesai.
Agregasi logs selesai.


DataFrame[msno: string, avg_num_25: double, avg_num_50: double, avg_num_75: double, avg_num_985: double, avg_num_100: double, avg_daily_secs: double, total_active_days: bigint, total_unq_songs: bigint, percent_songs_completed: double]

In [7]:
# Save the results as Parquet because it is much faster and more efficient.
# Existing output at each path is overwritten.
demo_features.write.parquet("data/demographic_features.parquet", mode="overwrite")
trans_features.write.parquet("data/transaction_features.parquet", mode="overwrite")
log_features.write.parquet("data/log_features.parquet", mode="overwrite")

In [8]:
# Left-join each feature set onto the training rows using the msno key,
# so every row of train_df is kept even when a feature set has no match.
master_table = (
    train_df
    .join(demo_features, on="msno", how="left")
    .join(trans_features, on="msno", how="left")
    .join(log_features, on="msno", how="left")
    # Fill the nulls introduced by the joins (e.g. users without logs) with 0.
    # NOTE(review): this also zero-fills integer columns such as
    # last_transaction_date / last_expiry_date - confirm 0 is an acceptable
    # placeholder for a missing date downstream.
    .fillna(0)
)
master_table.cache()

DataFrame[msno: string, is_churn: int, city: int, age_group: string, registered_via: int, membership_duration_days: int, total_transactions: bigint, total_payment_plan_days: bigint, avg_discount: double, count_auto_renew: bigint, count_cancel: bigint, last_transaction_date: int, last_expiry_date: int, avg_num_25: double, avg_num_50: double, avg_num_75: double, avg_num_985: double, avg_num_100: double, avg_daily_secs: double, total_active_days: bigint, total_unq_songs: bigint, percent_songs_completed: double]

In [9]:
# Persist the joined feature table as Parquet, replacing any previous output.
master_table.write.parquet("data/master_feature_table.parquet", mode="overwrite")