In [1]:
from pyspark.sql import SparkSession
import sys
import os

# Stop any SparkSession left over from a previous run of this cell before
# building a new one.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, so there is nothing to stop.
    pass
except Exception as stop_error:
    # Best-effort cleanup: report the failure instead of hiding it, then carry on.
    print(f"Warning: could not stop the previous SparkSession: {stop_error}")

# Create a new SparkSession
spark = (
    SparkSession.builder
    .appName("ChurnPrediction")
    .config("spark.driver.memory", "12g")
    .config("spark.driver.maxResultSize", "4g")
    .getOrCreate()
)


In [2]:
# Build a sample dataset from the master feature table.
MASTER_TABLE_PATH = "data/master_feature_table.parquet"
SAMPLE_FRACTION = 0.2  # share of rows to draw from the master table (20%)
SAMPLE_SEED = 42       # fixed seed passed to Spark's sampler

master_table_df = spark.read.parquet(MASTER_TABLE_PATH)

# Draw the sample without replacement and convert it to a pandas DataFrame.
# NOTE(review): per the PySpark docs, sample() does not guarantee exactly
# SAMPLE_FRACTION of the rows, so the sample size is approximate.
sample_pd_df = master_table_df.sample(
    withReplacement=False,
    fraction=SAMPLE_FRACTION,
    seed=SAMPLE_SEED,
).toPandas()

print(sample_pd_df.head())

                                           msno  is_churn  city  \
0  +3UpOSBV3eeSlYZtccBm9hBZJRbYrYvwYQZp+wyCjJY=         0     5   
1  +92zgLE/+9bBVXEZojEq6AVeFzY6Ty55bbJQPDCZz3g=         0     1   
2  +B9wZACowwEr6N/VAttXJRUBV/keTtA3dIkUk0sH7A0=         0     5   
3  +G6+OUQy1pnqrUy1lLn7vcBW8i5p7BMOGQU1cqI5/Xk=         0     5   
4  +MBCDIV792laoEFbjT1beIph7QWaZ7uYm+o+q/SESbE=         0     6   

            age_group  registered_via  membership_duration_days  \
0      46-90 (Senior)               7                      1470   
1             Unknown               4                        54   
2  36-45 (Paruh Baya)               7                      1185   
3  36-45 (Paruh Baya)               3                       187   
4      46-90 (Senior)               7                        90   

   total_transactions  total_payment_plan_days  avg_discount  \
0                  41                     1230           0.0   
1                   2                       60           0.0   
2 

In [3]:
# Print a header, then the sampled DataFrame's structure summary
# (column names, non-null counts and dtypes).
print("Informasi Struktur DataFrame")
sample_pd_df.info()


Informasi Struktur DataFrame
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 216621 entries, 0 to 216620
Data columns (total 22 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   msno                      216621 non-null  object 
 1   is_churn                  216621 non-null  int32  
 2   city                      216621 non-null  int32  
 3   age_group                 192553 non-null  object 
 4   registered_via            216621 non-null  int32  
 5   membership_duration_days  216621 non-null  int32  
 6   total_transactions        216621 non-null  int64  
 7   total_payment_plan_days   216621 non-null  int64  
 8   avg_discount              216621 non-null  float64
 9   count_auto_renew          216621 non-null  int64  
 10  count_cancel              216621 non-null  int64  
 11  last_transaction_date     216621 non-null  int32  
 12  last_expiry_date          216621 non-null  int32  
 13  avg_num_25     

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows a negative mean for avg_discount —
# confirm that negative discounts are expected in the source data.
print(sample_pd_df.describe())

            is_churn           city  registered_via  membership_duration_days  \
count  216621.000000  216621.000000   216621.000000             216621.000000   
mean        0.091099       5.316728        6.093795               1141.317153   
std         0.287751       6.368815        2.847824               1101.594158   
min         0.000000       0.000000        0.000000                  0.000000   
25%         0.000000       1.000000        4.000000                291.000000   
50%         0.000000       1.000000        7.000000                783.000000   
75%         0.000000      11.000000        7.000000               1715.000000   
max         1.000000      22.000000       13.000000               4753.000000   

       total_transactions  total_payment_plan_days   avg_discount  \
count       216621.000000            216621.000000  216621.000000   
mean            16.298604               498.637704      -2.586135   
std              8.837168               249.792196      13.4101

In [5]:
# Menyiapkan data untuk pelatihan model
from sklearn.model_selection import train_test_split
import pandas as pd

# Column holding the label to predict.
target_col = 'is_churn'

# Feature columns fed to the models; adjust as needed.
feature_list = [
    'city', 'age_group', 'registered_via',
    'total_transactions', 'total_payment_plan_days', 'avg_discount',
    'count_auto_renew', 'count_cancel',
]

# 1. Separate the feature matrix (X) from the target vector (y).
X, y = sample_pd_df[feature_list], sample_pd_df[target_col]

# 2. Hold out 20% of the rows as a test set, stratified on the target.
#    This split is only needed when a model is fitted directly with .fit.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [6]:
# Buat pipeline preprocessing
from sklearn.preprocessing import StandardScaler, PowerTransformer, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns treated as categorical; every other entry of feature_list is treated as numeric.
categorical_features = ['city', 'age_group', 'registered_via']
numerical_features = [col for col in feature_list if col not in categorical_features]

# Numeric branch: power-transform, then standardize.
# NOTE(review): PowerTransformer standardizes its output by default, so the
# extra StandardScaler step may be redundant — confirm before removing it.
numeric_transformer = Pipeline([
    ('power', PowerTransformer(method='yeo-johnson')), # Reduces skewness
    ('scaler', StandardScaler())                       # Standardizes the scale
])

# Categorical branch: one-hot encode, with handle_unknown='ignore' so that
# categories not seen during fit do not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        # Apply numeric_transformer to all numerical_features
        ('num', numeric_transformer, numerical_features),
        
        # Apply categorical_transformer to all categorical_features
        ('cat', categorical_transformer, categorical_features)
    ],
    # Important: 'drop' prevents any other (unlisted) columns from being passed through,
    # ensuring only numeric values reach the model.
    # Adjust this to your needs.
    remainder='drop' 
)

In [7]:
# Model yang akan digunakan
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Candidate classifiers, keyed by the display name printed in the reports.
# Every model uses class_weight='balanced' and a fixed random_state.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000,
    ),
    "Decision Tree": DecisionTreeClassifier(
        class_weight='balanced',
        random_state=42,
    ),
    "Random Forest": RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    ),
}

PEMBAGIAN DATA MENGGUNAKAN SPLIT & CROSS VALIDATION

In [8]:
from sklearn.model_selection import StratifiedKFold, cross_validate 
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score
import numpy as np 
from sklearn.metrics import classification_report

# Cross-validation strategy: 5 shuffled, stratified folds with a fixed seed.
# Stratification is required because the churn classes are imbalanced.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Metrics to collect; recall, precision and F1 are reported for each class.
scoring = {
    'accuracy': 'accuracy',

    # Churn (class 1)
    'recall_churn': make_scorer(recall_score, pos_label=1),
    'precision_churn': make_scorer(precision_score, pos_label=1),
    'f1_churn': make_scorer(f1_score, pos_label=1),

    # Non-churn (class 0)
    'recall_non_churn': make_scorer(recall_score, pos_label=0),
    'precision_non_churn': make_scorer(precision_score, pos_label=0),
    'f1_non_churn': make_scorer(f1_score, pos_label=0),
}

results_cv = {}     # model name -> fold-averaged scores from cross_validate
results_split = {}  # model name -> classification_report dict from the single split

for model_name, model in models.items():
    print(f"\n--- Melatih dan Menguji: {model_name} ---")

    # Preprocessing and estimator chained so both are fitted together.
    full_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model),
    ])

    # ---------------------------------------------------------------
    # Part 1: single split — fit on the training split, predict on the
    # test split, and keep the full per-class report as a dict.
    # ---------------------------------------------------------------
    full_pipeline.fit(X_train, y_train)
    y_pred = full_pipeline.predict(X_test)
    results_split[model_name] = classification_report(
        y_test, y_pred, output_dict=True
    )

    # ---------------------------------------------------------------
    # Part 2: cross-validation over the whole sample (X, y), not just
    # the training split.
    # ---------------------------------------------------------------
    cv_scores = cross_validate(
        full_pipeline,
        X,
        y,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        return_train_score=False,
    )

    # Average each metric across the folds.
    avg_scores = {
        'avg_accuracy': cv_scores['test_accuracy'].mean(),

        # Churn (class 1)
        'avg_recall_churn': cv_scores['test_recall_churn'].mean(),
        'avg_precision_churn': cv_scores['test_precision_churn'].mean(),
        'avg_f1_churn': cv_scores['test_f1_churn'].mean(),

        # Non-churn (class 0)
        'avg_recall_non_churn': cv_scores['test_recall_non_churn'].mean(),
        'avg_precision_non_churn': cv_scores['test_precision_non_churn'].mean(),
        'avg_f1_non_churn': cv_scores['test_f1_non_churn'].mean(),
    }

    results_cv[model_name] = avg_scores


--- Melatih dan Menguji: Logistic Regression ---

--- Melatih dan Menguji: Decision Tree ---

--- Melatih dan Menguji: Random Forest ---


In [9]:
import pandas as pd
import numpy as np

# Layout of the per-class sections of the single-split report:
# (classification_report key, section header, line prefixes for recall / precision / f1-score).
split_sections = (
    ('1', "\n     CHURN (Kelas 1):",
     ("      Recall:    ", "      Precision: ", "      F1-Score:  ")),
    ('0', "\n     TIDAK CHURN (Kelas 0):",
     ("      Recall: ", "      Precision: ", "      F1-Score: ")),
)
split_metric_keys = ('recall', 'precision', 'f1-score')

# Layout of the per-class sections of the cross-validation report:
# (section header, ((line prefix, results_cv key), ...)).
cv_sections = (
    ("\n     CHURN (Kelas 1):", (
        ("      Recall (Churn=1):    ", 'avg_recall_churn'),
        ("      Precision (Churn=1): ", 'avg_precision_churn'),
        ("      F1-Score (Churn=1):  ", 'avg_f1_churn'),
    )),
    ("\n     TIDAK CHURN (Kelas 0):", (
        ("      Recall (Non-Churn=0):    ", 'avg_recall_non_churn'),
        ("      Precision (Non-Churn=0): ", 'avg_precision_non_churn'),
        ("      F1-Score (Non-Churn=0):  ", 'avg_f1_non_churn'),
    )),
)

for model_name in models:
    report_split = results_split.get(model_name)
    report_cv = results_cv.get(model_name)

    # Both result sets are needed; otherwise flag the model and move on.
    if not (report_split and report_cv):
        print(f"\n--- Model: {model_name} (Hasil tidak lengkap) ---")
        continue

    print(f"\n--- Model: **{model_name}** ---")

    # --- 1. Single-split (fit/predict) results ---
    print("  HASIL SPLIT TUNGGAL (80/20):")
    print(f"    Akurasi Keseluruhan: {report_split['accuracy']:.4f}")
    for class_key, header, prefixes in split_sections:
        # A class only appears in the report if classification_report produced it.
        if class_key not in report_split:
            continue
        print(header)
        for prefix, metric_key in zip(prefixes, split_metric_keys):
            print(f"{prefix}{report_split[class_key][metric_key]:.4f}")

    # --- 2. Cross-validation results (fold averages) ---
    print("\n  HASIL CROSS-VALIDATION (Rata-rata 5-Fold):")
    print(f"    Akurasi Keseluruhan: {report_cv['avg_accuracy']:.4f}")
    for header, score_lines in cv_sections:
        print(header)
        for prefix, score_key in score_lines:
            print(f"{prefix}{report_cv[score_key]:.4f}")

    print("--------------------------------------------------------")


--- Model: **Logistic Regression** ---
  HASIL SPLIT TUNGGAL (80/20):
    Akurasi Keseluruhan: 0.8173

     CHURN (Kelas 1):
      Recall:    0.9078
      Precision: 0.3218
      F1-Score:  0.4751

     TIDAK CHURN (Kelas 0):
      Recall: 0.8082
      Precision: 0.9887
      F1-Score: 0.8894

  HASIL CROSS-VALIDATION (Rata-rata 5-Fold):
    Akurasi Keseluruhan: 0.8170

     CHURN (Kelas 1):
      Recall (Churn=1):    0.9059
      Precision (Churn=1): 0.3212
      F1-Score (Churn=1):  0.4742

     TIDAK CHURN (Kelas 0):
      Recall (Non-Churn=0):    0.8081
      Precision (Non-Churn=0): 0.9885
      F1-Score (Non-Churn=0):  0.8892
--------------------------------------------------------

--- Model: **Decision Tree** ---
  HASIL SPLIT TUNGGAL (80/20):
    Akurasi Keseluruhan: 0.8853

     CHURN (Kelas 1):
      Recall:    0.6760
      Precision: 0.4196
      F1-Score:  0.5178

     TIDAK CHURN (Kelas 0):
      Recall: 0.9063
      Precision: 0.9654
      F1-Score: 0.9349

  HASIL CROS

In [10]:
print("\n--- Ringkasan DataFrame Fokus Menggunakan CV ---")

# Summary column name -> key in each model's results_cv entry.
summary_columns = {
    "Recall_CV (Churn=1)": 'avg_recall_churn',
    "Precision_CV (Churn=1)": 'avg_precision_churn',
    "Recall_CV (Non-Churn=0)": 'avg_recall_non_churn',
    "Akurasi_CV": 'avg_accuracy',
}

comparison_data = []
for model_name in models.keys():
    model_scores = results_cv.get(model_name)

    # Skip models whose CV results are missing or lack any key read below,
    # so building the row can never raise KeyError.
    if not model_scores or any(key not in model_scores for key in summary_columns.values()):
        continue

    comparison_data.append({
        "Model": model_name,
        **{column: model_scores[key] for column, key in summary_columns.items()},
    })

# Passing the columns explicitly keeps the frame (and the sort below) valid
# even when every model was skipped and comparison_data is empty.
comparison_df = pd.DataFrame(comparison_data, columns=["Model", *summary_columns])

# Sort by the most important metric (recall on the churn class).
print(comparison_df.sort_values(by="Recall_CV (Churn=1)", ascending=False).reset_index(drop=True))


--- Ringkasan DataFrame Fokus Menggunakan CV ---
                 Model  Recall_CV (Churn=1)  Precision_CV (Churn=1)  \
0  Logistic Regression             0.905899                0.321181   
1        Decision Tree             0.697324                0.420681   
2        Random Forest             0.647106                0.460295   

   Recall_CV (Non-Churn=0)  Akurasi_CV  
0                 0.808093    0.817003  
1                 0.903742    0.884937  
2                 0.923946    0.898726  


In [11]:
# Uncomment the line below to stop the Spark session once it is no longer needed.
#spark.stop()