<a href="https://colab.research.google.com/github/flakesss/TUBES_AI/blob/hatta/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.metrics import classification_report, roc_curve, auc
import xgboost as xgb
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

In [7]:
os.makedirs('models', exist_ok=True)
os.makedirs('visualizations', exist_ok=True)

In [8]:
data_path = '/content/Telco Customer Churn.xlsx'

In [9]:
try:
    telco_data = pd.read_excel(data_path)
    print(f"Data berhasil dimuat dari {data_path}")
except Exception as e:
    print(f"Error saat memuat file Excel: {e}")
    try:
        csv_path = 'Telco Customer Churn.csv'
        telco_data = pd.read_csv(csv_path)
        print(f"Data berhasil dimuat dari {csv_path}")
    except Exception as e2:
        print(f"Error saat memuat file CSV: {e2}")
        raise Exception("Tidak dapat memuat data. Pastikan file data tersedia.")


Data berhasil dimuat dari /content/Telco Customer Churn.xlsx


In [10]:
print("\nInformasi data:")
print(f"Jumlah baris: {telco_data.shape[0]}")
print(f"Jumlah kolom: {telco_data.shape[1]}")
print("\nLima baris pertama:")
print(telco_data.head())


Informasi data:
Jumlah baris: 7043
Jumlah kolom: 1

Lima baris pertama:
  customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0  7590-VHVEG,Female,0,Yes,No,1,No,No phone servi...                                                                                                                                                                                                                 
1  5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,No,Y...                                                                                                                                                                                                                 
2  3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,Yes,N...                                                                                         

In [11]:
if telco_data.shape[1] == 1 and ',' in str(telco_data.iloc[0, 0]):
    print("\nDeteksi masalah format: Data ada dalam satu kolom. Parsing data...")

    header = telco_data.columns[0].split(',')

    parsed_rows = []
    for i in range(len(telco_data)):
        values = telco_data.iloc[i, 0].split(',')
        if len(values) == len(header):
            parsed_rows.append(values)

    telco_data = pd.DataFrame(parsed_rows, columns=header)
    print("Data berhasil di-parse.")


Deteksi masalah format: Data ada dalam satu kolom. Parsing data...
Data berhasil di-parse.


In [13]:
# 3. Preprocessing data
print("\n2. Preprocessing data...")

# Hapus kolom yang tidak berpengaruh
print("Menghapus kolom yang tidak berpengaruh...")
# customerID tidak relevan untuk prediksi
columns_to_drop = ['customerID']
telco_data = telco_data.drop(columns=columns_to_drop, errors='ignore')

# Konversi TotalCharges ke numerik
if 'TotalCharges' in telco_data.columns:
    telco_data['TotalCharges'] = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')

# Cek missing values
missing_data = telco_data.isnull().sum()
print(f"\nMissing values per kolom:")
print(missing_data[missing_data > 0])

# Handle missing values
telco_data.fillna(telco_data.median(), inplace=True)

# Konversi target 'Churn' ke biner
print("\nDistribusi target 'Churn':")
if 'Churn' in telco_data.columns:
    print(telco_data['Churn'].value_counts())
    # Konversi 'Yes'/'No' ke 1/0
    if telco_data['Churn'].dtype == 'object':
        telco_data['Churn'] = telco_data['Churn'].map({'Yes': 1, 'No': 0})
    print("Target 'Churn' dikonversi ke nilai biner 1/0")


2. Preprocessing data...
Menghapus kolom yang tidak berpengaruh...

Missing values per kolom:
TotalCharges    11
dtype: int64


TypeError: Cannot convert [['Female' 'Male' 'Male' ... 'Female' 'Male' 'Male']
 ['0' '0' '0' ... '0' '1' '0']
 ['Yes' 'No' 'No' ... 'Yes' 'Yes' 'No']
 ...
 ['Yes' 'No' 'Yes' ... 'Yes' 'Yes' 'Yes']
 ['Electronic check' 'Mailed check' 'Mailed check' ... 'Electronic check'
  'Mailed check' 'Bank transfer (automatic)']
 ['29.85' '56.95' '53.85' ... '29.6' '74.4' '105.65']] to numeric

In [16]:
columns_to_drop = ['customerID']
telco_data = telco_data.drop(columns=columns_to_drop, errors='ignore')

if 'TotalCharges' in telco_data.columns:
    telco_data['TotalCharges'] = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')

missing_data = telco_data.isnull().sum()
print(f"\nMissing values per kolom:")
print(missing_data[missing_data > 0])

numerical_cols = telco_data.select_dtypes(include=np.number).columns
telco_data[numerical_cols] = telco_data[numerical_cols].fillna(telco_data[numerical_cols].median())

print("\nDistribusi target 'Churn':")
if 'Churn' in telco_data.columns:
    print(telco_data['Churn'].value_counts())
    if telco_data['Churn'].dtype == 'object':
        telco_data['Churn'] = telco_data['Churn'].map({'Yes': 1, 'No': 0})
    print("Target 'Churn' dikonversi ke nilai biner 1/0")


2. Preprocessing data...
Menghapus kolom yang tidak berpengaruh...

Missing values per kolom:
TotalCharges    11
dtype: int64

Distribusi target 'Churn':
Churn
No     5174
Yes    1869
Name: count, dtype: int64
Target 'Churn' dikonversi ke nilai biner 1/0


In [17]:
categorical_cols = telco_data.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = telco_data.select_dtypes(include=['int64', 'float64']).columns.tolist()

if 'Churn' in numerical_cols:
    numerical_cols.remove('Churn')

print(f"Kolom kategorikal: {categorical_cols}")
print(f"Kolom numerik: {numerical_cols}")

# Menyiapkan X dan y
X = telco_data.drop('Churn', axis=1)
y = telco_data['Churn']



3. Feature selection dan engineering...
Kolom kategorikal: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges']
Kolom numerik: ['TotalCharges']


In [18]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val
)

print(f"Jumlah data training: {X_train.shape[0]} ({X_train.shape[0]/len(telco_data):.1%} dari total)")
print(f"Jumlah data validasi: {X_val.shape[0]} ({X_val.shape[0]/len(telco_data):.1%} dari total)")
print(f"Jumlah data testing: {X_test.shape[0]} ({X_test.shape[0]/len(telco_data):.1%} dari total)")


4. Membagi data menjadi train, validation, dan test...
Jumlah data training: 4225 (60.0% dari total)
Jumlah data validasi: 1409 (20.0% dari total)
Jumlah data testing: 1409 (20.0% dari total)


In [19]:
numeric_transformer = StandardScaler()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


5. Membuat preprocessing pipeline...


In [20]:

xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])

pipeline.fit(X_train, y_train)

y_pred_val = pipeline.predict(X_val)
y_pred_prob_val = pipeline.predict_proba(X_val)[:, 1]

print("\nEvaluasi model dasar pada data validasi:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_val):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_val):.4f}")
print(f"F1-score: {f1_score(y_val, y_pred_val):.4f}")
print(f"ROC AUC: {roc_auc_score(y_val, y_pred_prob_val):.4f}")

cm = confusion_matrix(y_val, y_pred_val)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Model Dasar (Validasi)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('visualizations/base_model_confusion_matrix.png')
plt.close()

fpr, tpr, _ = roc_curve(y_val, y_pred_prob_val)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Model Dasar (Validasi)')
plt.legend(loc="lower right")
plt.savefig('visualizations/base_model_roc_curve.png')
plt.close()


6. Melatih model dasar XGBoost...

Evaluasi model dasar pada data validasi:
Accuracy: 0.7970
Precision: 0.6507
Recall: 0.5080
F1-score: 0.5706
ROC AUC: 0.8353


In [21]:
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 4, 5, 6],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.8, 0.9, 1.0],
    'classifier__gamma': [0, 0.1, 0.2],
    'classifier__min_child_weight': [1, 3, 5]
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_val, y_train_val)

print("\nParameter optimal:")
print(grid_search.best_params_)
print(f"\nBest score: {grid_search.best_score_:.4f}")


7. Melakukan hyperparameter tuning...
Fitting 3 folds for each of 3888 candidates, totalling 11664 fits

Parameter optimal:
{'classifier__colsample_bytree': 0.9, 'classifier__gamma': 0, 'classifier__learning_rate': 0.05, 'classifier__max_depth': 3, 'classifier__min_child_weight': 3, 'classifier__n_estimators': 100, 'classifier__subsample': 0.8}

Best score: 0.8493


In [22]:


best_model = grid_search.best_estimator_

best_model.fit(X_train_val, y_train_val)

y_pred_test = best_model.predict(X_test)
y_pred_prob_test = best_model.predict_proba(X_test)[:, 1]

print("\nEvaluasi model final pada data test:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_test):.4f}")
print(f"F1-score: {f1_score(y_test, y_pred_test):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_prob_test):.4f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred_test))

cm = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Model Final (Test)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.savefig('visualizations/final_model_confusion_matrix.png')
plt.close()

fpr, tpr, _ = roc_curve(y_test, y_pred_prob_test)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Model Final (Test)')
plt.legend(loc="lower right")
plt.savefig('visualizations/final_model_roc_curve.png')
plt.close()


Evaluasi model final pada data test:
Accuracy: 0.8020
Precision: 0.6678
Recall: 0.5053
F1-score: 0.5753
ROC AUC: 0.8463

Classification report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [23]:
xgb_model = best_model.named_steps['classifier']

preprocessor = best_model.named_steps['preprocessor']
cat_features = preprocessor.transformers_[1][1].get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_features)

importance = xgb_model.feature_importances_
indices = np.argsort(importance)[::-1]

plt.figure(figsize=(12, 8))
plt.title('Feature Importance')
plt.bar(range(len(importance)), importance[indices])
plt.xticks(range(min(20, len(importance))), [feature_names[i] for i in indices[:20]], rotation=90)
plt.tight_layout()
plt.savefig('visualizations/feature_importance.png')
plt.close()

feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

print("\nTop 10 fitur paling penting:")
print(feature_importance_df.head(10))

feature_importance_df.to_csv('visualizations/feature_importance.csv', index=False)

joblib.dump(best_model, 'models/telco_churn_xgboost.pkl')
print("Model disimpan sebagai: models/telco_churn_xgboost.pkl")

print("\nPelatihan model selesai!")
print("Visualisasi disimpan di folder 'visualizations'")


Top 10 fitur paling penting:
                            Feature  Importance
108         Contract_Month-to-month    0.338209
90                OnlineSecurity_No    0.063728
99                   TechSupport_No    0.053123
88      InternetService_Fiber optic    0.047890
115  PaymentMethod_Electronic check    0.026109
10                         tenure_1    0.024283
0                      TotalCharges    0.023844
111             PaperlessBilling_No    0.021087
107             StreamingMovies_Yes    0.020215
110               Contract_Two year    0.017823
Model disimpan sebagai: models/telco_churn_xgboost.pkl

Pelatihan model selesai!
Visualisasi disimpan di folder 'visualizations'
