In [4]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when

# Impor library MLlib
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
)
from pyspark.ml.classification import (
    LogisticRegression, RandomForestClassifier, GBTClassifier
)
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
import pandas as pd

# Hentikan SparkSession jika ada yang aktif
try:
    spark.stop()
except:
    pass

# Buat SparkSession baru
spark = SparkSession.builder \
    .appName("ChurnModeling") \
    .config("spark.driver.memory", "12g") \
    .config("spark.driver.maxResultSize", "4g") \
    .getOrCreate()

print("SparkSession dan library MLlib siap.")

SparkSession dan library MLlib siap.


# Load Data & Feature Selection

In [None]:
# 1. Muat data master_feature_table_3.parquet
data_path = "data/master_feature_table_3.parquet"
df = spark.read.parquet(data_path)
df.cache()

# 2. Daftar Fitur yang DIBUANG (Berdasarkan EDA & Korelasi)
cols_to_drop = [
    "msno",                     # ID
    "last_transaction_date",    # Format tanggal
    "last_expiry_date",         # Format tanggal
    
    # --- Berdasarkan Temuan EDA Correlation ---
    # Dibuang karena Redundant (Korelasi > 0.85)
    "total_transactions",       # Korelasi 0.91 dg count_auto_renew
    "total_payment_plan_days",  # Korelasi 0.88 dg total_transactions
    "total_secs_last_90d",      # Korelasi 0.94 dg total_secs_last_30d
    "active_days_last_90d",     # Korelasi 0.94 dg active_days_last_30d
    
    # Dibuang karena Tidak Prediktif (Berdasarkan EDA)
    "membership_duration_days",
    "lifetime_active_days"
]

# 3. Terapkan Feature Selection
df_selected = df.drop(*cols_to_drop)

print("Feature selection selesai. Skema akhir untuk model:")
df_selected.printSchema()

Feature selection selesai. Skema akhir untuk model:
root
 |-- is_churn: integer (nullable = true)
 |-- city: integer (nullable = true)
 |-- age_group: string (nullable = true)
 |-- registered_via: integer (nullable = true)
 |-- avg_discount: double (nullable = true)
 |-- count_auto_renew: long (nullable = true)
 |-- count_cancel: long (nullable = true)
 |-- days_since_last_activity: integer (nullable = true)
 |-- total_secs_last_30d: double (nullable = true)
 |-- active_days_last_30d: long (nullable = true)
 |-- activity_ratio_secs: double (nullable = true)
 |-- percent_complete_last_30d: double (nullable = true)
 |-- lifetime_unq_songs: long (nullable = true)



# Define Features Type & Preprocessing Pipeline

In [6]:
# define tipe fitur& pipeline preprocessing
# 1. Tentukan fitur kategorikal dan numerik (dari sisa kolom)
categorical_cols = ["age_group", "city", "registered_via"]

# Semua kolom lain selain 'is_churn' dan kategorikal adalah numerik
numerical_cols = [
    col for col in df_selected.columns 
    if col not in categorical_cols + ["is_churn"]
]

print(f"Fitur Kategorikal: {categorical_cols}")
print(f"Fitur Numerik: {numerical_cols}")

# --- TAHAPAN PIPELINE PREPROCESSING ---

# Tahap 1: StringIndexer (Hanya untuk 'age_group' karena 'city' & 'registered_via' sudah angka)
# Kita perlu mengubah "Unknown", "18-25" menjadi 0.0, 1.0, dst.
indexer = StringIndexer(
    inputCol="age_group", 
    outputCol="age_group_idx", 
    handleInvalid="keep" # Mengubah null/unknown menjadi indeks khusus
)

# Tahap 2: OneHotEncoder (Untuk SEMUA kategorikal)
# Mengubah [0.0, 1.0, 2.0] menjadi vector [1,0,0], [0,1,0], [0,0,1]
encoder = OneHotEncoder(
    inputCols=["age_group_idx", "city", "registered_via"],
    outputCols=["age_group_vec", "city_vec", "registered_via_vec"]
)

# Tahap 3: VectorAssembler (Hanya untuk fitur NUMERIK)
assembler_num = VectorAssembler(
    inputCols=numerical_cols, 
    outputCol="numerical_features"
)

# Tahap 4: StandardScaler (Untuk fitur numerik)
# Menyamakan skala semua fitur numerik (penting untuk Logistic Regression)
scaler = StandardScaler(
    inputCol="numerical_features", 
    outputCol="scaled_numerical_features"
)

# Tahap 5: VectorAssembler Final (Menggabungkan SEMUA fitur)
assembler_final = VectorAssembler(
    inputCols=[
        "age_group_vec", 
        "city_vec", 
        "registered_via_vec", 
        "scaled_numerical_features"
    ],
    outputCol="features" # Ini adalah kolom akhir yang dibutuhkan model
)

# Gabungkan semua tahapan preprocessing menjadi satu pipeline
preprocessing_pipeline = Pipeline(
    stages=[
        indexer, 
        encoder, 
        assembler_num, 
        scaler, 
        assembler_final
    ]
)

Fitur Kategorikal: ['age_group', 'city', 'registered_via']
Fitur Numerik: ['avg_discount', 'count_auto_renew', 'count_cancel', 'days_since_last_activity', 'total_secs_last_30d', 'active_days_last_30d', 'activity_ratio_secs', 'percent_complete_last_30d', 'lifetime_unq_songs']


# Data Splitting & Oversampling (imbalance)

In [7]:
# Data Split & Oversampling (imbalance)
# 1. Bagi data menjadi set Latihan (80%) dan Uji (20%)
# stratifikasi berdasarkan 'is_churn' agar proporsinya sama
print("Membagi data menjadi 80% Latih, 20% Uji...")
train_data, test_data = df_selected.randomSplit([0.8, 0.2], seed=42)

print(f"Baris data Latih (sebelum oversampling): {train_data.count():,}")
print(f"Baris data Uji: {test_data.count():,}")

# 2. Lakukan Oversampling pada Data Latih (HANYA PADA TRAIN_DATA)
print("Melakukan oversampling pada data latih...")

# Hitung rasio imbalance
count_class_0 = train_data.filter(col("is_churn") == 0).count()
count_class_1 = train_data.filter(col("is_churn") == 1).count()
ratio = count_class_0 / count_class_1

print(f"Rasio (0:1): {ratio:.2f} : 1")

# Pisahkan kelas
df_majority = train_data.filter(col("is_churn") == 0)
df_minority = train_data.filter(col("is_churn") == 1)

# Lakukan oversampling (duplikasi acak) pada kelas minoritas
df_minority_oversampled = df_minority.sample(
    withReplacement=True, 
    fraction=ratio, 
    seed=42
)

# Gabungkan kembali menjadi data latih yang seimbang
train_data_oversampled = df_majority.unionAll(df_minority_oversampled)

print(f"Baris data Latih (setelah oversampling): {train_data_oversampled.count():,}")
print("Verifikasi data latih baru:")
train_data_oversampled.groupBy("is_churn").count().show()

Membagi data menjadi 80% Latih, 20% Uji...
Baris data Latih (sebelum oversampling): 865,704
Baris data Uji: 216,486
Melakukan oversampling pada data latih...
Rasio (0:1): 9.93 : 1
Baris data Latih (setelah oversampling): 1,572,027
Verifikasi data latih baru:
+--------+------+
|is_churn| count|
+--------+------+
|       0|786515|
|       1|785512|
+--------+------+



# Define Model & Train Pipeline

In [8]:
# define model & train pipeline
# 1. Definisikan 3 model
lr = LogisticRegression(featuresCol="features", labelCol="is_churn")
rf = RandomForestClassifier(featuresCol="features", labelCol="is_churn", seed=42)
gbt = GBTClassifier(featuresCol="features", labelCol="is_churn", seed=42)

# 2. Buat pipeline lengkap (Preprocessing + Model)
pipeline_lr = Pipeline(stages=[preprocessing_pipeline, lr])
pipeline_rf = Pipeline(stages=[preprocessing_pipeline, rf])
pipeline_gbt = Pipeline(stages=[preprocessing_pipeline, gbt])

# 3. Latih model
# Model dilatih pada data latih yang sudah SEIMBANG (Oversampled)
print("Melatih Logistic Regression...")
model_lr = pipeline_lr.fit(train_data_oversampled)

print("Melatih Random Forest...")
model_rf = pipeline_rf.fit(train_data_oversampled)

print("Melatih GBT...")
model_gbt = pipeline_gbt.fit(train_data_oversampled)

print("Semua model selesai dilatih.")

Melatih Logistic Regression...
Melatih Random Forest...
Melatih GBT...
Semua model selesai dilatih.


# Model Evaluation on Test Data

In [9]:
# Evaluasi Model pada Data Test (evaluasi balik ke data yg ga seimbang u/ lihat performa nyata model)
# 1. Buat prediksi pada data UJI (yang tidak seimbang)
print("Membuat prediksi pada data uji (unseen & imbalanced)...")
pred_lr = model_lr.transform(test_data)
pred_rf = model_rf.transform(test_data)
pred_gbt = model_gbt.transform(test_data)

# 2. Definisikan Evaluator
# menggunakan dua metrik utama untuk data imbalance:
# AUC-ROC: Baik untuk mengukur performa keseluruhan
# AUC-PR: (AreaUnderPrecisionRecall) Sangat baik untuk kelas minoritas yang langka

evaluator_roc = BinaryClassificationEvaluator(
    labelCol="is_churn", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderROC"
)

evaluator_pr = BinaryClassificationEvaluator(
    labelCol="is_churn", 
    rawPredictionCol="rawPrediction", 
    metricName="areaUnderPR"
)

# 3. Hitung dan Tampilkan Hasil
results = {}

print("\n--- Hasil Evaluasi Model ---")

# Logistic Regression
auc_roc_lr = evaluator_roc.evaluate(pred_lr)
auc_pr_lr = evaluator_pr.evaluate(pred_lr)
results['Logistic Regression'] = {'AUC-ROC': auc_roc_lr, 'AUC-PR': auc_pr_lr}
print(f"\nLogistic Regression:")
print(f"  AUC-ROC: {auc_roc_lr:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_lr:.4f}")

# Random Forest
auc_roc_rf = evaluator_roc.evaluate(pred_rf)
auc_pr_rf = evaluator_pr.evaluate(pred_rf)
results['Random Forest'] = {'AUC-ROC': auc_roc_rf, 'AUC-PR': auc_pr_rf}
print(f"\nRandom Forest:")
print(f"  AUC-ROC: {auc_roc_rf:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_rf:.4f}")

# GBT
auc_roc_gbt = evaluator_roc.evaluate(pred_gbt)
auc_pr_gbt = evaluator_pr.evaluate(pred_gbt)
results['GBT'] = {'AUC-ROC': auc_roc_gbt, 'AUC-PR': auc_pr_gbt}
print(f"\nGBT Classifier:")
print(f"  AUC-ROC: {auc_roc_gbt:.4f}")
print(f"  AUC-PR (Fokus Churn): {auc_pr_gbt:.4f}")

Membuat prediksi pada data uji (unseen & imbalanced)...

--- Hasil Evaluasi Model ---

Logistic Regression:
  AUC-ROC: 0.9239
  AUC-PR (Fokus Churn): 0.5024

Random Forest:
  AUC-ROC: 0.9464
  AUC-PR (Fokus Churn): 0.6996

GBT Classifier:
  AUC-ROC: 0.9679
  AUC-PR (Fokus Churn): 0.7760


# Confusion Matrix

In [10]:
# Confusion Matrix
from pyspark.mllib.evaluation import MulticlassMetrics

def print_confusion_matrix(predictions, model_name):
    # Mengubah prediksi menjadi RDD untuk MulticlassMetrics
    preds_and_labels = predictions.select("prediction", "is_churn").rdd.map(
        lambda r: (float(r.prediction), float(r.is_churn))
    )
    
    metrics = MulticlassMetrics(preds_and_labels)
    confusion_matrix = metrics.confusionMatrix().toArray()
    
    print(f"\n--- Confusion Matrix untuk: {model_name} ---")
    print(confusion_matrix)
    
    # TN, FP
    # FN, TP
    TN = confusion_matrix[0][0]
    FP = confusion_matrix[0][1]
    FN = confusion_matrix[1][0]
    TP = confusion_matrix[1][1]
    
    Recall_Churn = TP / (TP + FN)
    Precision_Churn = TP / (TP + FP)
    F1_Churn = 2 * (Precision_Churn * Recall_Churn) / (Precision_Churn + Recall_Churn)
    
    print(f"  Recall (Churn=1):    {Recall_Churn:.4f}")
    print(f"  Precision (Churn=1): {Precision_Churn:.4f}")
    print(f"  F1-Score (Churn=1):  {F1_Churn:.4f}")

# Jalankan untuk model terbaik (misal, GBT)
print_confusion_matrix(pred_gbt, "GBT Classifier")

# Jalankan untuk Logistic Regression
print_confusion_matrix(pred_lr, "Logistic Regression")



Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 447.0 failed 1 times, most recent failure: Lost task 0.0 in stage 447.0 (TID 9756) (LAPTOP-HJ0CVK6G executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed). Consider setting 'spark.sql.execution.pyspark.udf.faulthandler.enabled' or'spark.python.worker.faulthandler.enabled' configuration to 'true' for the better Python traceback.
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:599)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:945)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:925)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:532)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.mutable.Growable.addAll(Growable.scala:61)
	at scala.collection.mutable.Growable.addAll$(Growable.scala:57)
	at scala.collection.mutable.ArrayBuilder.addAll(ArrayBuilder.scala:75)
	at scala.collection.IterableOnceOps.toArray(IterableOnce.scala:1505)
	at scala.collection.IterableOnceOps.toArray$(IterableOnce.scala:1498)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:189)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2524)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: java.io.IOException: Connection reset by peer
	at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
	at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:137)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:81)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:58)
	at java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:542)
	at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:855)
	at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:767)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:258)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:292)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:279)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:381)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:933)
	... 22 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:2935)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2935)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2927)
	at scala.collection.immutable.List.foreach(List.scala:334)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2927)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1295)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1295)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3207)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3141)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3130)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1009)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2484)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2505)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2524)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:189)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:578)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:1623)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed). Consider setting 'spark.sql.execution.pyspark.udf.faulthandler.enabled' or'spark.python.worker.faulthandler.enabled' configuration to 'true' for the better Python traceback.
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:621)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:599)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:945)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:925)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:532)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.mutable.Growable.addAll(Growable.scala:61)
	at scala.collection.mutable.Growable.addAll$(Growable.scala:57)
	at scala.collection.mutable.ArrayBuilder.addAll(ArrayBuilder.scala:75)
	at scala.collection.IterableOnceOps.toArray(IterableOnce.scala:1505)
	at scala.collection.IterableOnceOps.toArray$(IterableOnce.scala:1498)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$.$anonfun$runJob$1(PythonRDD.scala:189)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2524)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:647)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:80)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:77)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:650)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.io.IOException: Connection reset by peer
	at java.base/sun.nio.ch.SocketDispatcher.write0(Native Method)
	at java.base/sun.nio.ch.SocketDispatcher.write(SocketDispatcher.java:54)
	at java.base/sun.nio.ch.IOUtil.writeFromNativeBuffer(IOUtil.java:137)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:81)
	at java.base/sun.nio.ch.IOUtil.write(IOUtil.java:58)
	at java.base/sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:542)
	at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.writeAdditionalInputToPythonWorker(PythonRunner.scala:855)
	at org.apache.spark.api.python.BasePythonRunner$ReaderInputStream.read(PythonRunner.scala:767)
	at java.base/java.io.BufferedInputStream.fill(BufferedInputStream.java:258)
	at java.base/java.io.BufferedInputStream.implRead(BufferedInputStream.java:292)
	at java.base/java.io.BufferedInputStream.read(BufferedInputStream.java:279)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:381)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:933)
	... 22 more


# Punya peli masih an cv

In [23]:
from sklearn.model_selection import StratifiedKFold, cross_validate 
from sklearn.metrics import make_scorer, recall_score, precision_score, f1_score
import numpy as np 
from sklearn.metrics import classification_report

# Tentukan strategi Cross-Validation (disarankan K=5 atau K=10)
# StratifiedKFold WAJIB karena data churn tidak seimbang
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Definisikan metrik yang akan diukur (fokus pada kelas '1'/churn)
scoring = {
    'accuracy': 'accuracy', 
    # Metrik untuk CHURN (Kelas 1)
    'recall_churn': make_scorer(recall_score, pos_label=1),
    'precision_churn': make_scorer(precision_score, pos_label=1),
    'f1_churn': make_scorer(f1_score, pos_label=1),

    # Metrik untuk TIDAK CHURN (Kelas 0) 
    'recall_non_churn': make_scorer(recall_score, pos_label=0),
    'precision_non_churn': make_scorer(precision_score, pos_label=0),
    'f1_non_churn': make_scorer(f1_score, pos_label=0)
}

results_cv = {}    # Hasil dari cross_validate
results_split = {} # Hasil dari single fit/predict

for model_name, model in models.items():
    print(f"\n--- Melatih dan Menguji: {model_name} ---")

    full_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # =======================================================
    # BAGIAN 1: SINGLE-SPLIT (Dibutuhkan untuk melihat report_dict lengkap)
    # =======================================================
    full_pipeline.fit(X_train, y_train) # Latih model untuk single-split
    y_pred = full_pipeline.predict(X_test)
    
    # Simpan hasil single-split (untuk visualisasi report lengkap)
    results_split[model_name] = classification_report(y_test, y_pred, output_dict=True)
    
    # =======================================================
    # BAGIAN 2: CROSS-VALIDATION (Untuk mendapatkan skor paling andal)
    # =======================================================
    cv_scores = cross_validate(
        full_pipeline, 
        X, y, # Menggunakan SEMUA data sampel
        cv=cv, 
        scoring=scoring, 
        return_train_score=False, 
        n_jobs=-1
    )
    
    # Rata-ratakan skor dari 5 fold
    # Pastikan 'accuracy' sudah ditambahkan ke dictionary 'scoring' Anda
    avg_scores = {
        'avg_accuracy': np.mean(cv_scores['test_accuracy']),
        
        # CHURN (Kelas 1)
        'avg_recall_churn': np.mean(cv_scores['test_recall_churn']),
        'avg_precision_churn': np.mean(cv_scores['test_precision_churn']),
        'avg_f1_churn': np.mean(cv_scores['test_f1_churn']),

        # TIDAK CHURN (Kelas 0) 
        'avg_recall_non_churn': np.mean(cv_scores['test_recall_non_churn']),
        'avg_precision_non_churn': np.mean(cv_scores['test_precision_non_churn']),
        'avg_f1_non_churn': np.mean(cv_scores['test_f1_non_churn']),
    }

    results_cv[model_name] = avg_scores


--- Melatih dan Menguji: Logistic Regression ---

--- Melatih dan Menguji: Decision Tree ---

--- Melatih dan Menguji: Random Forest ---


In [31]:
import pandas as pd
import numpy as np

for model_name in models.keys():
    report_split = results_split.get(model_name)
    report_cv = results_cv.get(model_name)
    
    if not report_split or not report_cv:
        print(f"\n--- Model: {model_name} (Hasil tidak lengkap) ---")
        continue

    print(f"\n--- Model: **{model_name}** ---")
    # --- 1. Hasil dari Single Split (fit/predict) ---
    print("  HASIL SPLIT TUNGGAL (80/20):")
    print(f"    Akurasi Keseluruhan: {report_split['accuracy']:.4f}")
    
    if '1' in report_split:
        print("\n     CHURN (Kelas 1):")
        print(f"      Recall:    {report_split['1']['recall']:.4f}")
        print(f"      Precision: {report_split['1']['precision']:.4f}")
        print(f"      F1-Score:  {report_split['1']['f1-score']:.4f}")
    if '0' in report_split:
        print("\n     TIDAK CHURN (Kelas 0):")
        print(f"      Recall: {report_split['0']['recall']:.4f}")
        print(f"      Precision: {report_split['0']['precision']:.4f}")
        print(f"      F1-Score: {report_split['0']['f1-score']:.4f}")
        
    # --- 2. Hasil dari Cross-Validation (Rata-rata) ---
    print("\n  HASIL CROSS-VALIDATION (Rata-rata 5-Fold):")
    print(f"    Akurasi Keseluruhan: {report_cv['avg_accuracy']:.4f}")
    
    # Hasil Rata-rata CHURN (Kelas 1)
    print("\n     CHURN (Kelas 1):")
    print(f"      Recall (Churn=1):    {report_cv['avg_recall_churn']:.4f}")
    print(f"      Precision (Churn=1): {report_cv['avg_precision_churn']:.4f}")
    print(f"      F1-Score (Churn=1):  {report_cv['avg_f1_churn']:.4f}")
    
    # Hasil Rata-rata NON-CHURN (Kelas 0) <--- TAMBAHAN BARU
    print("\n     TIDAK CHURN (Kelas 0):")
    print(f"      Recall (Non-Churn=0):    {report_cv['avg_recall_non_churn']:.4f}")
    print(f"      Precision (Non-Churn=0): {report_cv['avg_precision_non_churn']:.4f}")
    print(f"      F1-Score (Non-Churn=0):  {report_cv['avg_f1_non_churn']:.4f}")
    
    print("--------------------------------------------------------")


--- Model: **Logistic Regression** ---
  HASIL SPLIT TUNGGAL (80/20):
    Akurasi Keseluruhan: 0.8173

     CHURN (Kelas 1):
      Recall:    0.9078
      Precision: 0.3218
      F1-Score:  0.4751

     TIDAK CHURN (Kelas 0):
      Recall: 0.8082
      Precision: 0.9887
      F1-Score: 0.8894

  HASIL CROSS-VALIDATION (Rata-rata 5-Fold):
    Akurasi Keseluruhan: 0.8170

     CHURN (Kelas 1):
      Recall (Churn=1):    0.9059
      Precision (Churn=1): 0.3212
      F1-Score (Churn=1):  0.4742

     TIDAK CHURN (Kelas 0):
      Recall (Non-Churn=0):    0.8081
      Precision (Non-Churn=0): 0.9885
      F1-Score (Non-Churn=0):  0.8892
--------------------------------------------------------

--- Model: **Decision Tree** ---
  HASIL SPLIT TUNGGAL (80/20):
    Akurasi Keseluruhan: 0.8853

     CHURN (Kelas 1):
      Recall:    0.6760
      Precision: 0.4196
      F1-Score:  0.5178

     TIDAK CHURN (Kelas 0):
      Recall: 0.9063
      Precision: 0.9654
      F1-Score: 0.9349

  HASIL CROS

In [None]:
print("\n--- Ringkasan DataFrame Fokus Menggunakan CV ---")
comparison_data = []
for model_name in models.keys():
    
    # Pengecekan data karena DataFrame akan gagal jika KeyError muncul
    if model_name not in results_cv or 'avg_recall_non_churn' not in results_cv[model_name]:
        continue

    comparison_data.append({
        "Model": model_name,
        "Recall_CV (Churn=1)": results_cv[model_name]['avg_recall_churn'],
        "Precision_CV (Churn=1)": results_cv[model_name]['avg_precision_churn'],
        "Recall_CV (Non-Churn=0)": results_cv[model_name]['avg_recall_non_churn'],
        "Akurasi_CV": results_cv[model_name]['avg_accuracy'],
    })

comparison_df = pd.DataFrame(comparison_data)

# Sortir berdasarkan metrik yang paling penting (Recall Churn)
print(comparison_df.sort_values(by="Recall_CV (Churn=1)", ascending=False).reset_index(drop=True))


--- Ringkasan DataFrame (Fokus pada Kedua Kelas CV) ---
                 Model  Recall_CV (Churn=1)  Precision_CV (Churn=1)  \
0  Logistic Regression             0.905899                0.321181   
1        Decision Tree             0.697324                0.420681   
2        Random Forest             0.647106                0.460295   

   Recall_CV (Non-Churn=0)  Akurasi_CV  
0                 0.808093    0.817003  
1                 0.903742    0.884937  
2                 0.923946    0.898726  


In [None]:
#spark.stop()