<a href="https://colab.research.google.com/github/flakesss/TUBES_AI/blob/hatta/model_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import joblib
import os
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, auc, precision_recall_curve, average_precision_score
)
from sklearn.ensemble import GradientBoostingClassifier

warnings.filterwarnings('ignore')

os.makedirs('models', exist_ok=True)
os.makedirs('visualizations', exist_ok=True)

In [18]:
try:
    df = pd.read_excel("/content/Telco Customer Churn.xlsx")
    print("Data berhasil dimuat dari file Excel")
except Exception as e:
    print(f"Error saat memuat Excel: {e}")
    try:
        df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
        print("Data berhasil dimuat dari file CSV")
    except Exception as e2:
        print(f"Error saat memuat CSV: {e2}")
        try:
            files = [f for f in os.listdir() if f.endswith('.csv') or f.endswith('.xlsx')]
            if files:
                if files[0].endswith('.csv'):
                    df = pd.read_csv(files[0])
                else:
                    df = pd.read_excel(files[0])
                print(f"Data berhasil dimuat dari {files[0]}")
            else:
                raise Exception("Tidak ada file CSV atau Excel ditemukan")
        except Exception as e3:
            print(f"Error saat mencoba opsi lain: {e3}")
            raise Exception("Tidak dapat memuat data. Pastikan file telah diupload dengan benar.")


Data berhasil dimuat dari file Excel


In [3]:

print("\n2. Memeriksa data awal...")
print(f"Jumlah baris dan kolom: {df.shape}")
print("\nLima baris pertama:")
print(df.head())

if df.shape[1] == 1 and ',' in str(df.iloc[0, 0]):
    print("\nDeteksi masalah format: Data ada dalam satu kolom. Parsing data...")
    header = df.columns[0].split(',')
    parsed_rows = []
    for i in range(len(df)):
        values = df.iloc[i, 0].split(',')
        if len(values) == len(header):
            parsed_rows.append(values)
    df = pd.DataFrame(parsed_rows, columns=header)
    print("Data berhasil di-parse.")


2. Memeriksa data awal...
Jumlah baris dan kolom: (7043, 21)

Lima baris pertama:
   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService  \
0  7590-VHVEG  Female              0     Yes         No       1           No   
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection  \
0  No phone service             DSL             No  ...               No   
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ... 

In [4]:

print("\n3. Membersihkan dan memproses data...")

if 'customerID' in df.columns:
    df = df.drop('customerID', axis=1)
    print("- Kolom customerID dihapus")

if 'TotalCharges' in df.columns:
    df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
    print("- TotalCharges dikonversi ke numerik")

missing_data = df.isnull().sum()
print("\nMissing values:")
print(missing_data[missing_data > 0])

if 'TotalCharges' in df.columns and df['TotalCharges'].isnull().sum() > 0:
    df['TotalCharges'] = df['TotalCharges'].fillna(df.groupby('tenure')['TotalCharges'].transform('median'))
    df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())
    print("- Missing values pada TotalCharges ditangani")

if df.duplicated().sum() > 0:
    df = df.drop_duplicates()
    print(f"- {df.duplicated().sum()} data duplikat dihapus")
else:
    print("- Tidak ada data duplikat")

if 'Churn' in df.columns and df['Churn'].dtype == 'object':
    df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})
    print("- Target 'Churn' dikonversi ke nilai biner 1/0")

if 'SeniorCitizen' in df.columns and df['SeniorCitizen'].dtype == 'object':
    df['SeniorCitizen'] = df['SeniorCitizen'].map({'Yes': 1, 'No': 0})
    print("- SeniorCitizen dikonversi ke nilai biner 1/0")



3. Membersihkan dan memproses data...
- Kolom customerID dihapus
- TotalCharges dikonversi ke numerik

Missing values:
TotalCharges    11
dtype: int64
- Missing values pada TotalCharges ditangani
- 0 data duplikat dihapus
- Target 'Churn' dikonversi ke nilai biner 1/0


In [17]:

print("\n4. Analisis Eksploratori Data...")

plt.figure(figsize=(8, 6))
sns.countplot(x='Churn', data=df, palette='Set1')
plt.title('Distribusi Target (Churn)')
plt.savefig('visualizations/churn_distribution.png')
plt.close()
print("- Visualisasi distribusi churn disimpan")

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
if 'Contract' in categorical_cols:
    plt.figure(figsize=(10, 6))
    sns.countplot(x='Contract', hue='Churn', data=df, palette='Set2')
    plt.title('Distribusi Churn berdasarkan Contract')
    plt.savefig('visualizations/churn_by_contract.png')
    plt.close()
    print("- Visualisasi distribusi churn berdasarkan contract disimpan")

numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != 'Churn']
if 'tenure' in numeric_cols and 'MonthlyCharges' in numeric_cols:
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    sns.boxplot(x='Churn', y='tenure', data=df, ax=axes[0])
    axes[0].set_title('Tenure by Churn Status')
    sns.boxplot(x='Churn', y='MonthlyCharges', data=df, ax=axes[1])
    axes[1].set_title('Monthly Charges by Churn Status')
    plt.tight_layout()
    plt.savefig('visualizations/numeric_features_by_churn.png')
    plt.close()
    print("- Visualisasi fitur numerik berdasarkan churn disimpan")


4. Analisis Eksploratori Data...
- Visualisasi distribusi churn disimpan
- Visualisasi distribusi churn berdasarkan contract disimpan
- Visualisasi fitur numerik berdasarkan churn disimpan


In [6]:

print("\n5. Seleksi fitur dan split data...")

important_features = [
    'Contract', 'InternetService', 'PaymentMethod', 'TotalCharges', 'tenure',
    'OnlineSecurity', 'StreamingTV', 'PaperlessBilling', 'StreamingMovies', 'MultipleLines'
]

missing_features = [feature for feature in important_features if feature not in df.columns]
if missing_features:
    print(f"- Perhatian: Fitur berikut tidak ditemukan dalam dataset: {missing_features}")
    important_features = [feature for feature in important_features if feature in df.columns]

X = df[important_features]
y = df['Churn']

print(f"- Menggunakan {len(important_features)} fitur penting: {important_features}")



5. Seleksi fitur dan split data...
- Menggunakan 10 fitur penting: ['Contract', 'InternetService', 'PaymentMethod', 'TotalCharges', 'tenure', 'OnlineSecurity', 'StreamingTV', 'PaperlessBilling', 'StreamingMovies', 'MultipleLines']


In [7]:

print("\n6. Split data untuk training dan testing...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"- Data training: {X_train.shape[0]} samples")
print(f"- Data testing: {X_test.shape[0]} samples")



6. Split data untuk training dan testing...
- Data training: 5616 samples
- Data testing: 1405 samples


In [8]:

print("\n7. Menyiapkan preprocessing pipeline...")


binary_cols = ['PaperlessBilling']
ordinal_cols = ['Contract', 'InternetService']
nominal_cols = [col for col in important_features if col not in binary_cols + ordinal_cols + ['tenure', 'TotalCharges', 'MonthlyCharges']]
numeric_cols = [col for col in important_features if col in ['tenure', 'TotalCharges', 'MonthlyCharges']]

ordinal_mapping = [
    ['Month-to-month', 'One year', 'Two year'],
    ['No', 'DSL', 'Fiber optic']
]

binary_transformer = OneHotEncoder(drop='if_binary')
ordinal_transformer = OrdinalEncoder(categories=ordinal_mapping)
nominal_transformer = OneHotEncoder(drop='first')
numeric_transformer = StandardScaler()

preprocessor = ColumnTransformer([
    ('binary', binary_transformer, binary_cols),
    ('ordinal', ordinal_transformer, ordinal_cols),
    ('nominal', nominal_transformer, nominal_cols),
    ('numeric', numeric_transformer, numeric_cols)
], remainder='passthrough')

print(f"- Binary columns: {binary_cols}")
print(f"- Ordinal columns: {ordinal_cols}")
print(f"- Nominal columns: {nominal_cols}")
print(f"- Numeric columns: {numeric_cols}")



7. Menyiapkan preprocessing pipeline...
- Binary columns: ['PaperlessBilling']
- Ordinal columns: ['Contract', 'InternetService']
- Nominal columns: ['PaymentMethod', 'OnlineSecurity', 'StreamingTV', 'StreamingMovies', 'MultipleLines']
- Numeric columns: ['TotalCharges', 'tenure']


In [16]:

print("\n8. Training model Gradient Boosting...")

baseline_gb = GradientBoostingClassifier(random_state=42)
baseline_model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', baseline_gb)
])

baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)
print("- Model baseline berhasil dilatih")

print("\nHasil evaluasi model baseline:")
baseline_accuracy = accuracy_score(y_test, y_pred_baseline)
baseline_precision = precision_score(y_test, y_pred_baseline)
baseline_recall = recall_score(y_test, y_pred_baseline)
baseline_f1 = f1_score(y_test, y_pred_baseline)

print(f"- Accuracy: {baseline_accuracy:.4f}")
print(f"- Precision: {baseline_precision:.4f}")
print(f"- Recall: {baseline_recall:.4f}")
print(f"- F1 Score: {baseline_f1:.4f}")

cm_baseline = confusion_matrix(y_test, y_pred_baseline)
plt.figure(figsize=(8, 6))
sns.heatmap(cm_baseline, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Model Baseline')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('visualizations/baseline_confusion_matrix.png')
plt.close()
print("- Confusion matrix baseline disimpan")


8. Training model Gradient Boosting...
- Model baseline berhasil dilatih

Hasil evaluasi model baseline:
- Accuracy: 0.8050
- Precision: 0.6678
- Recall: 0.5242
- F1 Score: 0.5873
- Confusion matrix baseline disimpan


In [10]:

print("\n9. Parameter tuning dengan fokus mengurangi false positive...")

param_grid = {
    'classifier__learning_rate': [0.05, 0.1, 0.2],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [3, 4, 5],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__subsample': [0.8, 1.0],
    'classifier__max_features': [None, 'sqrt', 'log2']
}


cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


grid_search = GridSearchCV(
    baseline_model,
    param_grid=param_grid,
    cv=cv,
    scoring='precision',
    n_jobs=-1,
    verbose=1
)


print("- Memulai GridSearchCV (ini akan membutuhkan waktu)...")
grid_search.fit(X_train, y_train)
print("- GridSearchCV selesai!")

print(f"\nParameter terbaik:")
for param, value in grid_search.best_params_.items():
    print(f"- {param}: {value}")
print(f"- Best score (precision): {grid_search.best_score_:.4f}")


9. Parameter tuning dengan fokus mengurangi false positive...
- Memulai GridSearchCV (ini akan membutuhkan waktu)...
Fitting 5 folds for each of 1458 candidates, totalling 7290 fits
- GridSearchCV selesai!

Parameter terbaik:
- classifier__learning_rate: 0.05
- classifier__max_depth: 3
- classifier__max_features: sqrt
- classifier__min_samples_leaf: 1
- classifier__min_samples_split: 5
- classifier__n_estimators: 100
- classifier__subsample: 0.8
- Best score (precision): 0.6795


In [14]:

print("\n10. Evaluasi model final...")

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)

print("\nHasil evaluasi model final:")
print(f"- Accuracy: {accuracy:.4f}")
print(f"- Precision: {precision:.4f} (Fokus utama untuk mengurangi false positive)")
print(f"- Recall: {recall:.4f}")
print(f"- F1 Score: {f1:.4f}")
print(f"- ROC AUC: {roc_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Model Final')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('visualizations/final_confusion_matrix.png')
plt.close()
print("- Confusion matrix final disimpan")

fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.savefig('visualizations/roc_curve.png')
plt.close()
print("- ROC curve disimpan")

precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
avg_precision = average_precision_score(y_test, y_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall_curve, precision_curve, color='blue', lw=2, label=f'PR curve (AP = {avg_precision:.2f})')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.legend(loc="lower left")
plt.savefig('visualizations/precision_recall_curve.png')
plt.close()
print("- Precision-Recall curve disimpan")


10. Evaluasi model final...

Hasil evaluasi model final:
- Accuracy: 0.7993
- Precision: 0.6585 (Fokus utama untuk mengurangi false positive)
- Recall: 0.5027
- F1 Score: 0.5701
- ROC AUC: 0.8413

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1033
           1       0.66      0.50      0.57       372

    accuracy                           0.80      1405
   macro avg       0.75      0.70      0.72      1405
weighted avg       0.79      0.80      0.79      1405

- Confusion matrix final disimpan
- ROC curve disimpan
- Precision-Recall curve disimpan


In [15]:

print("\n11. Perbandingan dengan model baseline...")
print(f"                      Baseline  vs  Model Final")
print(f"Accuracy:             {baseline_accuracy:.4f}    {accuracy:.4f}")
print(f"Precision:            {baseline_precision:.4f}    {precision:.4f}")
print(f"Recall:               {baseline_recall:.4f}    {recall:.4f}")
print(f"F1 Score:             {baseline_f1:.4f}    {f1:.4f}")

improvement_precision = ((precision - baseline_precision) / baseline_precision) * 100
print(f"\nPeningkatan precision: {improvement_precision:.2f}% (mengurangi false positive)")

print("\n12. Menyimpan model final...")
joblib.dump(best_model, 'models/gradient_boosting_churn_model.pkl')
print("- Model disimpan sebagai: models/gradient_boosting_churn_model.pkl")

print("\n13. Membuat fungsi prediksi untuk data baru...")

def predict_churn(data, threshold=0.5):

    required_features = important_features
    missing_cols = [col for col in required_features if col not in data.columns]

    if missing_cols:
        raise ValueError(f"Data input tidak memiliki kolom yang diperlukan: {missing_cols}")

    # Prediksi
    proba = best_model.predict_proba(data[required_features])[:, 1]
    predictions = (proba >= threshold).astype(int)

    # Hasil
    result = data.copy()
    result['churn_probability'] = proba
    result['churn_prediction'] = predictions

    return result

sample_data = X_test.iloc[:5].copy()
sample_result = predict_churn(sample_data)

print("\nContoh prediksi untuk 5 data pertama:")
print(sample_result[['churn_probability', 'churn_prediction']])

print("\nModel Gradient Boosting untuk prediksi churn telah berhasil diimplementasikan!")


11. Perbandingan dengan model baseline...
                      Baseline  vs  Model Final
Accuracy:             0.8050    0.7993
Precision:            0.6678    0.6585
Recall:               0.5242    0.5027
F1 Score:             0.5873    0.5701

Peningkatan precision: -1.40% (mengurangi false positive)

12. Menyimpan model final...
- Model disimpan sebagai: models/gradient_boosting_churn_model.pkl

13. Membuat fungsi prediksi untuk data baru...

Contoh prediksi untuk 5 data pertama:
      churn_probability  churn_prediction
5627           0.692841                 1
6126           0.439016                 0
2361           0.461931                 0
2201           0.089295                 0
832            0.058647                 0

Model Gradient Boosting untuk prediksi churn telah berhasil diimplementasikan!
