In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when

# Impor library MLlib
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# Hentikan SparkSession jika ada yang aktif
try:
    spark.stop()
except:
    pass

# Buat SparkSession baru
spark = SparkSession.builder \
    .appName("ChurnModeling") \
    .config("spark.driver.memory", "12g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

print("SparkSession dan library MLlib siap.")

SparkSession dan library MLlib siap.


# Load Data & Feature Selection

In [None]:
# 1. Muat data master_feature_table_3.parquet
data_path = "data/master_feature_table_3.parquet"
df = spark.read.parquet(data_path)
df.cache()

# 2. Daftar Fitur yang DIBUANG (Berdasarkan EDA & Korelasi)
cols_to_drop = [
    "msno",                     # ID
    "last_transaction_date",    # Format tanggal
    "last_expiry_date",         # Format tanggal
    
    # --- Berdasarkan Temuan EDA Correlation ---
    # Dibuang karena Redundant (Korelasi > 0.85)
    "count_auto_renew",
    #"total_transactions",       # Korelasi 0.91 dg count_auto_renew
    #"total_payment_plan_days",  # Korelasi 0.88 dg total_transactions
    "total_secs_last_90d",      # Korelasi 0.94 dg total_secs_last_30d
    "active_days_last_90d",     # Korelasi 0.94 dg active_days_last_30d
    
    # Dibuang karena Tidak Prediktif (Berdasarkan EDA)
    "membership_duration_days",
    "registered_via",
    #"city"
    #"lifetime_active_days",
    #"lifetime_unq_songs",
    
]

# 3. Terapkan Feature Selection
df_selected = df.drop(*cols_to_drop)

print("Feature selection selesai. Skema akhir untuk model:")
df_selected.printSchema()

Feature selection selesai. Skema akhir untuk model:
root
 |-- is_churn: integer (nullable = true)
 |-- city: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- total_transactions: long (nullable = true)
 |-- total_payment_plan_days: long (nullable = true)
 |-- avg_discount: double (nullable = true)
 |-- count_cancel: long (nullable = true)
 |-- days_since_last_activity: integer (nullable = true)
 |-- total_secs_last_30d: double (nullable = true)
 |-- active_days_last_30d: long (nullable = true)
 |-- activity_ratio_secs: double (nullable = true)
 |-- percent_complete_last_30d: double (nullable = true)
 |-- lifetime_active_days: long (nullable = true)
 |-- lifetime_unq_songs: long (nullable = true)



: 

: 

: 

# Define Features Type & Preprocessing Pipeline

In [None]:
# define tipe fitur& pipeline preprocessing
# 1. Tentukan fitur kategorikal dan numerik (dari sisa kolom)
#categorical_cols = ["age_group", "city", "registered_via"]
categorical_cols = ["age_group", "city"]
#categorical_cols = ["age_group"]

# Semua kolom lain selain 'is_churn' dan kategorikal adalah numerik
numerical_cols = [
    col for col in df_selected.columns 
    if col not in categorical_cols + ["is_churn"]
]

print(f"Fitur Kategorikal: {categorical_cols}")
print(f"Fitur Numerik: {numerical_cols}")

# --- TAHAPAN PIPELINE PREPROCESSING ---

# Tahap 1: StringIndexer (Hanya untuk 'age_group' karena 'city' & 'registered_via' sudah angka)
# Kita perlu mengubah "Unknown", "18-25" menjadi 0.0, 1.0, dst.
indexer = StringIndexer(
    inputCol="age_group", 
    outputCol="age_group_idx", 
    handleInvalid="keep" # Mengubah null/unknown menjadi indeks khusus
)

# Tahap 2: OneHotEncoder (Untuk SEMUA kategorikal)
# Mengubah [0.0, 1.0, 2.0] menjadi vector [1,0,0], [0,1,0], [0,0,1]
encoder = OneHotEncoder(
    #inputCols=["age_group_idx", "city", "registered_via"],
    inputCols=["age_group_idx", "city"],
    #outputCols=["age_group_vec", "city_vec", "registered_via_vec"]
    outputCols=["age_group_vec" , "city_vec"]
)

# Tahap 3: VectorAssembler (Hanya untuk fitur NUMERIK)
assembler_num = VectorAssembler(
    inputCols=numerical_cols, 
    outputCol="numerical_features"
)

# Tahap 4: StandardScaler (Untuk fitur numerik)
# Menyamakan skala semua fitur numerik (penting untuk Logistic Regression)
scaler = StandardScaler(
    inputCol="numerical_features", 
    outputCol="scaled_numerical_features"
)

# Tahap 5: VectorAssembler Final (Menggabungkan SEMUA fitur)
assembler_final = VectorAssembler(
    inputCols=[
        "age_group_vec", 
        "city_vec", 
        #"registered_via_vec", 
        "scaled_numerical_features"
    ],
    outputCol="features" # Ini adalah kolom akhir yang dibutuhkan model
)

# Gabungkan semua tahapan preprocessing menjadi satu pipeline
preprocessing_pipeline = Pipeline(
    stages=[
        indexer, 
        encoder, 
        assembler_num, 
        scaler, 
        assembler_final
    ]
)

Fitur Kategorikal: ['age_group', 'city']
Fitur Numerik: ['total_transactions', 'total_payment_plan_days', 'avg_discount', 'count_cancel', 'days_since_last_activity', 'total_secs_last_30d', 'active_days_last_30d', 'activity_ratio_secs', 'percent_complete_last_30d', 'lifetime_active_days']


: 

: 

: 

# Data Splitting & Oversampling (imbalance)

In [None]:
# Data Split & Oversampling (imbalance)
# 1. Bagi data menjadi set Latihan (80%) dan Uji (20%)
# stratifikasi berdasarkan 'is_churn' agar proporsinya sama
print("Membagi data menjadi 80% Latih, 20% Uji...")
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed=42)

train_data.cache()
test_data.cache()

train_count = train_data.count()
test_count = test_data.count()

print(f"Baris data Latih (sebelum oversampling): {train_data.count():,}")
print(f"Baris data Uji: {test_data.count():,}")

# 2. Lakukan Oversampling pada Data Latih (HANYA PADA TRAIN_DATA)
print("Melakukan oversampling pada data latih...")

# Hitung rasio imbalance
count_class_0 = train_data.filter(col("is_churn") == 0).count()
count_class_1 = train_data.filter(col("is_churn") == 1).count()
ratio = 10%

print(f"Rasio (0:1): {ratio:.2f} : 1")

# Pisahkan kelas
df_majority = train_data.filter(col("is_churn") == 0)
df_minority = train_data.filter(col("is_churn") == 1)

# Lakukan oversampling (duplikasi acak) pada kelas minoritas
df_minority_oversampled = df_minority.sample(
    withReplacement=True, 
    fraction=ratio, 
    seed=42
)

# Gabungkan kembali menjadi data latih yang seimbang
train_data_oversampled = df_majority.unionAll(df_minority_oversampled)

print(f"Baris data Latih (setelah oversampling): {train_data_oversampled.count():,}")
print("Verifikasi data latih baru:")
train_data_oversampled.groupBy("is_churn").count().show()

Membagi data menjadi 80% Latih, 20% Uji...
Baris data Latih (sebelum oversampling): 865,704
Baris data Uji: 216,486
Melakukan oversampling pada data latih...
Rasio (0:1): 9.93 : 1
Baris data Latih (setelah oversampling): 1,572,027
Verifikasi data latih baru:
+--------+------+
|is_churn| count|
+--------+------+
|       0|786515|
|       1|785512|
+--------+------+



: 

: 

: 

# Define Model & Train Pipeline

In [None]:
# define model & train pipeline
# 1. Definisikan 3 model
lr = LogisticRegression(featuresCol="features", labelCol="is_churn")
rf = RandomForestClassifier(featuresCol="features", labelCol="is_churn", seed=42)
gbt = GBTClassifier(featuresCol="features", labelCol="is_churn", seed=42)

# 2. Buat pipeline lengkap (Preprocessing + Model)
pipeline_lr = Pipeline(stages=[preprocessing_pipeline, lr])
pipeline_rf = Pipeline(stages=[preprocessing_pipeline, rf])
pipeline_gbt = Pipeline(stages=[preprocessing_pipeline, gbt])

# 3. Latih model
# Model dilatih pada data latih yang sudah SEIMBANG (Oversampled)
print("Melatih Logistic Regression...")
model_lr = pipeline_lr.fit(train_data_oversampled)

print("Melatih Random Forest...")
model_rf = pipeline_rf.fit(train_data_oversampled)

print("Melatih GBT...")
model_gbt = pipeline_gbt.fit(train_data_oversampled)

print("Semua model selesai dilatih.")

Melatih Logistic Regression...
Melatih Random Forest...
Melatih GBT...
Semua model selesai dilatih.


: 

: 

: 

# Model Evaluation on Test Data

In [None]:
# Evaluasi Model pada Data Test (evaluasi balik ke data yg ga seimbang u/ lihat performa nyata model)
# 1. Buat prediksi pada data UJI (yang tidak seimbang)
print("Membuat prediksi pada data uji (unseen & imbalanced)...")
pred_lr = model_lr.transform(test_data)
pred_rf = model_rf.transform(test_data)
pred_gbt = model_gbt.transform(test_data)

# 2. Definisikan Evaluator
# menggunakan dua metrik utama untuk data imbalance:
# AUC-ROC: Baik untuk mengukur performa keseluruhan
# AUC-PR: (AreaUnderPrecisionRecall) Sangat baik untuk kelas minoritas yang langka

evaluator_roc = BinaryClassificationEvaluator(
    labelCol="is_churn", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderROC"
)

evaluator_pr = BinaryClassificationEvaluator(
    labelCol="is_churn", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderPR"
)

# 3. Hitung dan Tampilkan Hasil
results = {}

print("\n--- Hasil Evaluasi Model ---")

# Logistic Regression
auc_roc_lr = evaluator_roc.evaluate(pred_lr)
auc_pr_lr = evaluator_pr.evaluate(pred_lr)
results['Logistic Regression'] = {'AUC-ROC': auc_roc_lr, 'AUC-PR': auc_pr_lr}
print(f"\nLogistic Regression:")
print(f"  AUC-ROC: {auc_roc_lr:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_lr:.4f}")

# Random Forest
auc_roc_rf = evaluator_roc.evaluate(pred_rf)
auc_pr_rf = evaluator_pr.evaluate(pred_rf)
results['Random Forest'] = {'AUC-ROC': auc_roc_rf, 'AUC-PR': auc_pr_rf}
print(f"\nRandom Forest:")
print(f"  AUC-ROC: {auc_roc_rf:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_rf:.4f}")

# GBT
auc_roc_gbt = evaluator_roc.evaluate(pred_gbt)
auc_pr_gbt = evaluator_pr.evaluate(pred_gbt)
results['GBT'] = {'AUC-ROC': auc_roc_gbt, 'AUC-PR': auc_pr_gbt}
print(f"\nGBT Classifier:")
print(f"  AUC-ROC: {auc_roc_gbt:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_gbt:.4f}")

Membuat prediksi pada data uji (unseen & imbalanced)...

--- Hasil Evaluasi Model ---

Logistic Regression:
  AUC-ROC: 0.9064
  AUC-PR (Fokus Churn): 0.5529

Random Forest:
  AUC-ROC: 0.9118
  AUC-PR (Fokus Churn): 0.6462

GBT Classifier:
  AUC-ROC: 0.9535
  AUC-PR (Fokus Churn): 0.7538


: 

: 

: 

# Confusion Matrix

In [None]:
# Confusion Matrix
from pyspark.mllib.evaluation import MulticlassMetrics

def print_confusion_matrix(predictions, model_name):
    # Mengubah prediksi menjadi RDD untuk MulticlassMetrics
    preds_and_labels = predictions.select("prediction", "is_churn").rdd.map(
        lambda r: (float(r.prediction), float(r.is_churn))
    )
    
    metrics = MulticlassMetrics(preds_and_labels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    
    print(f"\n--- Confusion Matrix untuk: {model_name} ---")
    print(confusion_matrix)
    
    Overall_Accuracy = metrics.accuracy
    print(f"  Overall Accuracy:   {Overall_Accuracy:.4f}")

    # TN, FP
    # FN, TP
    TN = confusion_matrix[0][0]
    FP = confusion_matrix[0][1]
    FN = confusion_matrix[1][0]
    TP = confusion_matrix[1][1]
    
    Recall_Churn = TP / (TP + FN)
    Precision_Churn = TP / (TP + FP)
    F1_Churn = 2 * (Precision_Churn * Recall_Churn) / (Precision_Churn + Recall_Churn)
    
    print(f"  Recall (Churn=1):    {Recall_Churn:.4f}")
    print(f"  Precision (Churn=1): {Precision_Churn:.4f}")
    print(f"  F1-Score (Churn=1):  {F1_Churn:.4f}")

# model terbaik (GBT)
print_confusion_matrix(pred_gbt, "GBT Classifier")

# Logistic Regression
print_confusion_matrix(pred_lr, "Logistic Regression")

# Random Forest
print_confusion_matrix(pred_rf, "Random Forest")





--- Confusion Matrix untuk: GBT Classifier ---
[[179551.  17096.]
 [  2674.  17165.]]
  Overall Accuracy:   0.9087
  Recall (Churn=1):    0.8652
  Precision (Churn=1): 0.5010
  F1-Score (Churn=1):  0.6346

--- Confusion Matrix untuk: Logistic Regression ---
[[164753.  31894.]
 [  3360.  16479.]]
  Overall Accuracy:   0.8372
  Recall (Churn=1):    0.8306
  Precision (Churn=1): 0.3407
  F1-Score (Churn=1):  0.4832

--- Confusion Matrix untuk: Random Forest ---
[[170256.  26391.]
 [  4240.  15599.]]
  Overall Accuracy:   0.8585
  Recall (Churn=1):    0.7863
  Precision (Churn=1): 0.3715
  F1-Score (Churn=1):  0.5046


: 

: 

: 

In [None]:
import pyspark.sql.functions as F

print("Mencari nilai unik untuk kolom 'city'...")

# 1. Pilih kolom 'city'
# 2. Ambil nilai unik dengan .distinct()
# 3. Urutkan hasilnya agar mudah dibaca
unique_cities_df = master_table_spark_3.select("city").distinct().orderBy("city")

# Tampilkan hasilnya
# Kita gunakan n=30, yang seharusnya cukup untuk ID kota
unique_cities_df.show(30)

# Hitung jumlah total nilai unik
count = unique_cities_df.count()
print(f"Total nilai unik (distinct) di kolom 'city': {count}")

Mencari nilai unik untuk kolom 'city'...


NameError: name 'master_table_spark_3' is not defined

: 

: 

In [None]:
#spark.stop()

: 

: 

: 