In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when

# Impor library MLlib
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# Stop any SparkSession left over from a previous run of this cell.
try:
    spark.stop()
except Exception:
    # Best-effort cleanup: on a fresh kernel `spark` does not exist yet
    # (NameError). Catching Exception instead of using a bare `except:`
    # keeps KeyboardInterrupt and SystemExit from being swallowed.
    pass

# Create (or reuse) the session used by the rest of the notebook.
spark = (
    SparkSession.builder
    .appName("ChurnModeling")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)

print("SparkSession dan library MLlib siap.")

SparkSession dan library MLlib siap.


# Load Data & Feature Selection

In [3]:
# 1. Read the master feature table (Parquet) and mark it for caching.
data_path = "data/master_feature_table.parquet"
df = spark.read.parquet(data_path)
df.cache()

# 2. Columns excluded from modelling (based on EDA and correlation findings),
#    grouped by the reason they are dropped.
cols_to_drop = (
    # Identifier and raw date columns.
    ["msno", "last_transaction_date", "last_expiry_date"]
    # Redundant (correlation > 0.85): the 90-day columns correlate 0.94 with
    # their 30-day counterparts. total_transactions (0.91 with
    # count_auto_renew) and total_payment_plan_days (0.88 with
    # total_transactions) were candidates too but are deliberately kept.
    + ["count_auto_renew", "total_secs_last_90d", "active_days_last_90d"]
    # Not predictive according to the EDA. city, lifetime_active_days and
    # lifetime_unq_songs were candidates too but are deliberately kept.
    + ["membership_duration_days", "registered_via"]
)

# 3. Apply the feature selection and show the schema the models will see.
df_selected = df.drop(*cols_to_drop)

print("Feature selection selesai. Skema akhir untuk model:")
df_selected.printSchema()

Feature selection selesai. Skema akhir untuk model:
root
 |-- is_churn: integer (nullable = true)
 |-- city: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- total_transactions: long (nullable = true)
 |-- total_payment_plan_days: long (nullable = true)
 |-- avg_discount: double (nullable = true)
 |-- count_cancel: long (nullable = true)
 |-- avg_num_25: double (nullable = true)
 |-- avg_num_50: double (nullable = true)
 |-- avg_num_75: double (nullable = true)
 |-- avg_num_985: double (nullable = true)
 |-- avg_num_100: double (nullable = true)
 |-- avg_daily_secs: double (nullable = true)
 |-- total_active_days: long (nullable = true)
 |-- total_unq_songs: long (nullable = true)
 |-- percent_songs_completed: double (nullable = true)



# Define Features Type & Preprocessing Pipeline

In [4]:
# Feature typing and the shared preprocessing pipeline.

# 1. Categorical features; every remaining column except the label is numeric.
#    (registered_via is removed during feature selection, so it is not listed.)
categorical_cols = ["age_group", "city"]

numerical_cols = [
    name
    for name in df_selected.columns
    if name != "is_churn" and name not in categorical_cols
]

print(f"Fitur Kategorikal: {categorical_cols}")
print(f"Fitur Numerik: {numerical_cols}")

# --- PREPROCESSING STAGES ---

# Stage 1: index the string column age_group ("Unknown", "18-25", ...) into
# numeric indices. city is already an integer column and is not indexed.
# handleInvalid="keep" assigns invalid values their own index instead of failing.
indexer = StringIndexer(
    inputCol="age_group",
    outputCol="age_group_idx",
    handleInvalid="keep",
)

# Stage 2: one-hot encode every categorical feature (index -> indicator vector).
encoder = OneHotEncoder(
    inputCols=["age_group_idx", "city"],
    outputCols=["age_group_vec", "city_vec"],
)

# Stage 3: assemble the numeric columns into a single vector column.
assembler_num = VectorAssembler(
    inputCols=numerical_cols,
    outputCol="numerical_features",
)

# Stage 4: put the numeric features on a common scale
# (important for Logistic Regression).
scaler = StandardScaler(
    inputCol="numerical_features",
    outputCol="scaled_numerical_features",
)

# Stage 5: concatenate encoded categoricals and scaled numerics into the
# "features" column that the classifiers consume.
assembler_final = VectorAssembler(
    inputCols=["age_group_vec", "city_vec", "scaled_numerical_features"],
    outputCol="features",
)

# Chain all preprocessing stages into one reusable pipeline.
preprocessing_pipeline = Pipeline(
    stages=[indexer, encoder, assembler_num, scaler, assembler_final]
)

Fitur Kategorikal: ['age_group', 'city']
Fitur Numerik: ['total_transactions', 'total_payment_plan_days', 'avg_discount', 'count_cancel', 'avg_num_25', 'avg_num_50', 'avg_num_75', 'avg_num_985', 'avg_num_100', 'avg_daily_secs', 'total_active_days', 'total_unq_songs', 'percent_songs_completed']


# Data Splitting & Oversampling (imbalance)

In [5]:
# Train/test split followed by mild oversampling of the churn class.
# 1. Split the data into training (80%) and test (20%) sets.
print("Membagi data menjadi 80% Latih, 20% Uji...")
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed=42)

# Cache the training split: it is counted and filtered several times below.
train_data.cache()

print(f"Baris data Latih (original): {train_data.count():,}")
print(f"Baris data Uji: {test_data.count():,}")

# 2. Mild oversampling strategy.
print("Melakukan oversampling ringan pada data latih...")

# Row count per class in the training split.
count_class_0 = train_data.filter(col("is_churn") == 0).count()
count_class_1 = train_data.filter(col("is_churn") == 1).count()

# Both ratios below divide by a class count; fail with a clear message
# instead of a bare ZeroDivisionError when a class is missing.
if count_class_0 == 0 or count_class_1 == 0:
    raise ValueError(
        "Data latih harus memuat kedua kelas is_churn (0 dan 1); "
        f"ditemukan kelas 0 = {count_class_0}, kelas 1 = {count_class_1}."
    )

# Current churn : non-churn ratio (class 1 count divided by class 0 count).
current_ratio = count_class_1 / count_class_0
print(f"Proporsi Churn Asli: {current_ratio:.2%} (Jumlah: {count_class_1})")

# --- TARGET: raise the minority class only slightly ---
# The goal is a fixed churn : non-churn ratio rather than full balance.
target_ratio = 0.2  # churn : non-churn = 1 : 5

# required_fraction = (target churn count) / (actual churn count),
# where target churn count = count_class_0 * target_ratio.
required_fraction = (count_class_0 * target_ratio) / count_class_1

# The original churn rows are kept as-is, so only the part above 1.0 has to be
# sampled on top. A value <= 0 means the target ratio is already met.
fraction_to_add = required_fraction - 1.0

if fraction_to_add > 0:
    print(f"Target Ratio: {target_ratio:.2%}. Menambahkan sample sebesar {fraction_to_add:.2f}x dari data original.")

    df_minority = train_data.filter(col("is_churn") == 1)

    # Draw the additional churn rows with replacement.
    df_oversampled_addition = df_minority.sample(
        withReplacement=True,
        fraction=fraction_to_add,
        seed=42
    )

    # Original training rows plus the extra sampled churn rows.
    train_data_balanced = train_data.unionAll(df_oversampled_addition)
else:
    print("Data churn sudah cukup proporsional, tidak dilakukan oversampling.")
    train_data_balanced = train_data

# 3. Verify the result.
print(f"Baris data Latih (setelah oversampling): {train_data_balanced.count():,}")
print("Distribusi kelas baru:")
train_data_balanced.groupBy("is_churn").count().show()

Membagi data menjadi 80% Latih, 20% Uji...
Baris data Latih (original): 866,029
Baris data Uji: 216,161
Melakukan oversampling ringan pada data latih...
Proporsi Churn Asli: 10.09% (Jumlah: 79379)
Target Ratio: 20.00%. Menambahkan sample sebesar 0.98x dari data original.
Baris data Latih (setelah oversampling): 943,876
Distribusi kelas baru:
+--------+------+
|is_churn| count|
+--------+------+
|       1|157226|
|       0|786650|
+--------+------+



# Define Model & Train Pipeline

In [None]:
# Define the models and fit one full pipeline per model.
# 1. The three candidate classifiers.
lr = LogisticRegression(featuresCol="features", labelCol="is_churn")
rf = RandomForestClassifier(featuresCol="features", labelCol="is_churn", seed=42)
gbt = GBTClassifier(featuresCol="features", labelCol="is_churn", seed=42)

# 2. Full pipelines: shared preprocessing followed by the classifier.
pipeline_lr = Pipeline(stages=[preprocessing_pipeline, lr])
pipeline_rf = Pipeline(stages=[preprocessing_pipeline, rf])
pipeline_gbt = Pipeline(stages=[preprocessing_pipeline, gbt])

# 3. Fit on the oversampled training split. The oversampling cell names that
#    DataFrame `train_data_balanced`; the previous `train_data_oversampled`
#    is not defined anywhere in the notebook and raised NameError on a fresh
#    kernel.
print("Melatih Logistic Regression...")
model_lr = pipeline_lr.fit(train_data_balanced)

print("Melatih Random Forest...")
model_rf = pipeline_rf.fit(train_data_balanced)

print("Melatih GBT...")
model_gbt = pipeline_gbt.fit(train_data_balanced)

print("Semua model selesai dilatih.")

Melatih Logistic Regression...
Melatih Random Forest...
Melatih GBT...
Semua model selesai dilatih.


# Model Evaluation on Test Data

In [None]:
# Evaluate on the test split, which keeps the original (imbalanced) class
# distribution, to see how the models perform on realistic data.
# 1. Predictions on the test data.
print("Membuat prediksi pada data uji (unseen & imbalanced)...")
pred_lr = model_lr.transform(test_data)
pred_rf = model_rf.transform(test_data)
pred_gbt = model_gbt.transform(test_data)

# 2. Evaluators for the two headline metrics on imbalanced data:
#    AUC-ROC for overall ranking quality, AUC-PR for the rare churn class.
evaluator_roc = BinaryClassificationEvaluator(
    labelCol="is_churn",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderROC",
)
evaluator_pr = BinaryClassificationEvaluator(
    labelCol="is_churn",
    rawPredictionCol="rawPrediction",
    metricName="areaUnderPR",
)


def _evaluate_and_report(display_name, predictions):
    """Compute AUC-ROC and AUC-PR for `predictions`, print them, and return both.

    Uses the module-level `evaluator_roc` and `evaluator_pr`.
    Returns a tuple (auc_roc, auc_pr).
    """
    auc_roc = evaluator_roc.evaluate(predictions)
    auc_pr = evaluator_pr.evaluate(predictions)
    print(f"\n{display_name}:")
    print(f"  AUC-ROC: {auc_roc:.4f}")
    print(f"  AUC-PR (Fokus Churn): {auc_pr:.4f}")
    return auc_roc, auc_pr


# 3. Compute and display the results, collecting them per model in `results`.
results = {}

print("\n--- Hasil Evaluasi Model ---")

# Logistic Regression
auc_roc_lr, auc_pr_lr = _evaluate_and_report("Logistic Regression", pred_lr)
results['Logistic Regression'] = {'AUC-ROC': auc_roc_lr, 'AUC-PR': auc_pr_lr}

# Random Forest
auc_roc_rf, auc_pr_rf = _evaluate_and_report("Random Forest", pred_rf)
results['Random Forest'] = {'AUC-ROC': auc_roc_rf, 'AUC-PR': auc_pr_rf}

# GBT (stored under the key 'GBT', printed as "GBT Classifier")
auc_roc_gbt, auc_pr_gbt = _evaluate_and_report("GBT Classifier", pred_gbt)
results['GBT'] = {'AUC-ROC': auc_roc_gbt, 'AUC-PR': auc_pr_gbt}

Membuat prediksi pada data uji (unseen & imbalanced)...

--- Hasil Evaluasi Model ---

Logistic Regression:
  AUC-ROC: 0.9064
  AUC-PR (Fokus Churn): 0.5529

Random Forest:
  AUC-ROC: 0.9118
  AUC-PR (Fokus Churn): 0.6462

GBT Classifier:
  AUC-ROC: 0.9535
  AUC-PR (Fokus Churn): 0.7538


# Confusion Matrix

In [None]:
# Confusion Matrix
from pyspark.mllib.evaluation import MulticlassMetrics

def print_confusion_matrix(predictions, model_name):
    """Print the confusion matrix and churn-class metrics for one model.

    Args:
        predictions: DataFrame from which the ``prediction`` and ``is_churn``
            columns are selected.
        model_name: Label shown in the printed header.

    Prints the confusion matrix, the overall accuracy, and recall, precision
    and F1 for the churn class (label 1). A metric whose denominator is zero
    (for example precision when no row is predicted as churn) is printed as
    0.0000 instead of the result of a division by zero. Returns None.
    """
    # MulticlassMetrics works on an RDD of (prediction, label) float pairs.
    preds_and_labels = predictions.select("prediction", "is_churn").rdd.map(
        lambda r: (float(r.prediction), float(r.is_churn))
    )

    metrics = MulticlassMetrics(preds_and_labels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    print(f"\n--- Confusion Matrix untuk: {model_name} ---")
    print(confusion_matrix)

    Overall_Accuracy = metrics.accuracy
    print(f"  Overall Accuracy:   {Overall_Accuracy:.4f}")

    def _safe_ratio(numerator, denominator):
        # Report 0.0 when the metric is undefined (empty denominator).
        return numerator / denominator if denominator > 0 else 0.0

    # Matrix layout used below:
    #   [[TN, FP],
    #    [FN, TP]]
    TN = confusion_matrix[0][0]
    FP = confusion_matrix[0][1]
    FN = confusion_matrix[1][0]
    TP = confusion_matrix[1][1]

    Recall_Churn = _safe_ratio(TP, TP + FN)
    Precision_Churn = _safe_ratio(TP, TP + FP)
    F1_Churn = _safe_ratio(
        2 * (Precision_Churn * Recall_Churn), Precision_Churn + Recall_Churn
    )

    print(f"  Recall (Churn=1):    {Recall_Churn:.4f}")
    print(f"  Precision (Churn=1): {Precision_Churn:.4f}")
    print(f"  F1-Score (Churn=1):  {F1_Churn:.4f}")

# Print the confusion-matrix report for each model: GBT first, then
# Logistic Regression and Random Forest.
for _predictions, _label in (
    (pred_gbt, "GBT Classifier"),
    (pred_lr, "Logistic Regression"),
    (pred_rf, "Random Forest"),
):
    print_confusion_matrix(_predictions, _label)





--- Confusion Matrix untuk: GBT Classifier ---
[[179551.  17096.]
 [  2674.  17165.]]
  Overall Accuracy:   0.9087
  Recall (Churn=1):    0.8652
  Precision (Churn=1): 0.5010
  F1-Score (Churn=1):  0.6346

--- Confusion Matrix untuk: Logistic Regression ---
[[164753.  31894.]
 [  3360.  16479.]]
  Overall Accuracy:   0.8372
  Recall (Churn=1):    0.8306
  Precision (Churn=1): 0.3407
  F1-Score (Churn=1):  0.4832

--- Confusion Matrix untuk: Random Forest ---
[[170256.  26391.]
 [  4240.  15599.]]
  Overall Accuracy:   0.8585
  Recall (Churn=1):    0.7863
  Precision (Churn=1): 0.3715
  F1-Score (Churn=1):  0.5046


# Cross-Validation (scikit-learn) — Peli's version, still uses CV; not yet adapted to this Spark notebook

In [None]:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score
import numpy as np
from sklearn.metrics import classification_report
# `Pipeline` in this notebook is pyspark.ml.Pipeline (imported in the first
# cell), which does not accept scikit-learn's list of (name, step) tuples.
# Import the scikit-learn class under an alias so the Spark cells keep working
# when they are re-run after this one.
from sklearn.pipeline import Pipeline as SkPipeline

# NOTE(review): this cell relies on names that no earlier cell of this notebook
# defines; it looks carried over from a pandas/scikit-learn notebook. Fail
# fast with the complete list instead of one NameError at a time.
_required_names = (
    "models", "preprocessor", "X", "y", "X_train", "y_train", "X_test", "y_test"
)
_missing_names = [name for name in _required_names if name not in globals()]
if _missing_names:
    raise NameError(
        "Sel cross-validation scikit-learn membutuhkan variabel yang belum "
        "didefinisikan: " + ", ".join(_missing_names)
    )

# Cross-validation strategy (K=5 or K=10 is recommended). Stratified folds are
# required because the churn classes are imbalanced.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to collect, reported separately for each class.
scoring = {
    'accuracy': 'accuracy',
    # Metrics for CHURN (class 1)
    'recall_churn': make_scorer(recall_score, pos_label=1),
    'precision_churn': make_scorer(precision_score, pos_label=1),
    'f1_churn': make_scorer(f1_score, pos_label=1),

    # Metrics for NON-CHURN (class 0)
    'recall_non_churn': make_scorer(recall_score, pos_label=0),
    'precision_non_churn': make_scorer(precision_score, pos_label=0),
    'f1_non_churn': make_scorer(f1_score, pos_label=0)
}

results_cv = {}    # Averaged cross_validate scores per model
results_split = {} # classification_report dict of the single fit/predict per model

for model_name, model in models.items():
    print(f"\n--- Melatih dan Menguji: {model_name} ---")

    full_pipeline = SkPipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # =======================================================
    # PART 1: SINGLE SPLIT (needed for the full classification report)
    # =======================================================
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_test)

    # Keep the single-split report so the full per-class table can be shown.
    results_split[model_name] = classification_report(y_test, y_pred, output_dict=True)

    # =======================================================
    # PART 2: CROSS-VALIDATION (the more reliable score)
    # =======================================================
    cv_scores = cross_validate(
        full_pipeline,
        X, y, # uses ALL of the sample data
        cv=cv,
        scoring=scoring,
        return_train_score=False,
        n_jobs=-1
    )

    # Average each metric over the folds; cross_validate exposes every entry
    # of `scoring` under the key 'test_<name>'.
    avg_scores = {
        'avg_accuracy': np.mean(cv_scores['test_accuracy']),

        # CHURN (class 1)
        'avg_recall_churn': np.mean(cv_scores['test_recall_churn']),
        'avg_precision_churn': np.mean(cv_scores['test_precision_churn']),
        'avg_f1_churn': np.mean(cv_scores['test_f1_churn']),

        # NON-CHURN (class 0)
        'avg_recall_non_churn': np.mean(cv_scores['test_recall_non_churn']),
        'avg_precision_non_churn': np.mean(cv_scores['test_precision_non_churn']),
        'avg_f1_non_churn': np.mean(cv_scores['test_f1_non_churn']),
    }

    results_cv[model_name] = avg_scores

NameError: name 'models' is not defined

In [None]:
import pandas as pd
import numpy as np

# Print, for every model, the single-split report followed by the averaged
# cross-validation scores.
for model_name in models.keys():
    report_split = results_split.get(model_name)
    report_cv = results_cv.get(model_name)

    # Both result sets are needed; otherwise only flag the model as incomplete.
    if not (report_split and report_cv):
        print(f"\n--- Model: {model_name} (Hasil tidak lengkap) ---")
        continue

    print(f"\n--- Model: **{model_name}** ---")

    # --- 1. Single split (fit/predict) ---
    print("  HASIL SPLIT TUNGGAL (80/20):")
    print(f"    Akurasi Keseluruhan: {report_split['accuracy']:.4f}")

    if '1' in report_split:
        churn_split = report_split['1']
        print("\n     CHURN (Kelas 1):")
        print(f"      Recall:    {churn_split['recall']:.4f}")
        print(f"      Precision: {churn_split['precision']:.4f}")
        print(f"      F1-Score:  {churn_split['f1-score']:.4f}")
    if '0' in report_split:
        non_churn_split = report_split['0']
        print("\n     TIDAK CHURN (Kelas 0):")
        print(f"      Recall: {non_churn_split['recall']:.4f}")
        print(f"      Precision: {non_churn_split['precision']:.4f}")
        print(f"      F1-Score: {non_churn_split['f1-score']:.4f}")

    # --- 2. Cross-validation (fold averages) ---
    print("\n  HASIL CROSS-VALIDATION (Rata-rata 5-Fold):")
    print(f"    Akurasi Keseluruhan: {report_cv['avg_accuracy']:.4f}")

    # Averages for CHURN (class 1)
    print("\n     CHURN (Kelas 1):")
    print(f"      Recall (Churn=1):    {report_cv['avg_recall_churn']:.4f}")
    print(f"      Precision (Churn=1): {report_cv['avg_precision_churn']:.4f}")
    print(f"      F1-Score (Churn=1):  {report_cv['avg_f1_churn']:.4f}")

    # Averages for NON-CHURN (class 0)
    print("\n     TIDAK CHURN (Kelas 0):")
    print(f"      Recall (Non-Churn=0):    {report_cv['avg_recall_non_churn']:.4f}")
    print(f"      Precision (Non-Churn=0): {report_cv['avg_precision_non_churn']:.4f}")
    print(f"      F1-Score (Non-Churn=0):  {report_cv['avg_f1_non_churn']:.4f}")

    print("--------------------------------------------------------")

In [None]:
print("\n--- Ringkasan DataFrame Fokus Menggunakan CV ---")

# Fixed column set: an empty result list still yields a DataFrame that has the
# sort column, so sort_values below cannot raise KeyError.
summary_columns = [
    "Model",
    "Recall_CV (Churn=1)",
    "Precision_CV (Churn=1)",
    "Recall_CV (Non-Churn=0)",
    "Akurasi_CV",
]

comparison_data = []
for model_name in models.keys():

    # Skip models without complete CV results; indexing them would raise KeyError.
    if model_name not in results_cv or 'avg_recall_non_churn' not in results_cv[model_name]:
        continue

    comparison_data.append({
        "Model": model_name,
        "Recall_CV (Churn=1)": results_cv[model_name]['avg_recall_churn'],
        "Precision_CV (Churn=1)": results_cv[model_name]['avg_precision_churn'],
        "Recall_CV (Non-Churn=0)": results_cv[model_name]['avg_recall_non_churn'],
        "Akurasi_CV": results_cv[model_name]['avg_accuracy'],
    })

comparison_df = pd.DataFrame(comparison_data, columns=summary_columns)

# Sort by the most important metric (churn recall), best model first.
print(comparison_df.sort_values(by="Recall_CV (Churn=1)", ascending=False).reset_index(drop=True))

In [None]:
# spark.stop()  # uncomment to shut down the SparkSession when finished