<a href="https://colab.research.google.com/github/gulendamarici/Netflix-project/blob/main/Churnanalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install xgboost lightgbm imbalanced-learn shap lime --quiet


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m8.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for lime (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from imblearn.over_sampling import SMOTE, ADASYN

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

import shap
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")


In [3]:
# Path to the Telco churn CSV; update it to match your own file location
# (the file must exist at this path before the cell is run).
file_path = "/content/sample_data/WA_Fn-UseC_-Telco-Customer-Churn.csv"

df = pd.read_csv(file_path)

print("İlk 5 satır:")
display(df.head())

print("\nVeri seti bilgisi:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line to the output.
df.info()

print("\nChurn dağılımı:")
print(df['Churn'].value_counts(normalize=True))


FileNotFoundError: [Errno 2] No such file or directory: '/content/sample_data/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [None]:
def preprocess_telco_data(df):
    """Clean the Telco churn frame and build a model-ready feature matrix.

    The input frame is not modified; all work happens on a private copy.

    Steps:
      1. drop ``customerID`` if present,
      2. coerce ``TotalCharges`` to numeric (unparseable values become NaN),
      3. drop every row that contains a missing value,
      4. encode ``Churn`` as No=0 / Yes=1,
      5. add ``Tenure_Group``, per-service ``*_bin`` flags, ``Services_Count``
         and three ratio features,
      6. one-hot encode the object/category columns (first level dropped).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. ``Churn``, ``tenure``, ``MonthlyCharges`` and
        ``TotalCharges`` are required; service columns are optional.

    Returns
    -------
    X_encoded : pandas.DataFrame
        Feature matrix after one-hot encoding (dummy columns are integers).
    y : pandas.Series
        Encoded target (0/1).
    numeric_cols : list of str
        int64/float64 feature columns as they were *before* one-hot encoding;
        the notebook uses this list to decide which columns to scale.
    """
    df = df.copy()

    # 1) Identifier column carries no predictive signal.
    #    In-place operations below are safe: `df` is our own copy.
    if 'customerID' in df.columns:
        df.drop('customerID', axis=1, inplace=True)

    # 2) TotalCharges -> numeric; anything that cannot be parsed becomes NaN.
    if 'TotalCharges' in df.columns:
        df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

    # 3) Drop rows with missing values (notably the NaNs created in step 2).
    df.dropna(inplace=True)

    # 4) Target variable: Churn (No=0, Yes=1).
    df['Churn'] = df['Churn'].map({'No': 0, 'Yes': 1})

    # 5) Tenure_Group: bucket tenure into (0-12], (12-48], (48-100].
    bins = [0, 12, 48, 100]
    labels = ['Yeni', 'Orta', 'Sadık']
    df['Tenure_Group'] = pd.cut(
        df['tenure'],
        bins=bins,
        labels=labels,
        right=True,
        include_lowest=True
    )

    # 6) Services_Count: number of services the customer subscribes to.
    service_cols = [
        'PhoneService', 'MultipleLines',
        'InternetService', 'OnlineSecurity', 'OnlineBackup',
        'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies'
    ]

    # Tolerate frames in which some of the service columns are absent.
    service_cols = [c for c in service_cols if c in df.columns]

    bin_cols = []
    for col in service_cols:
        new_col = col + '_bin'
        if col == 'InternetService':
            # In the Telco churn data this column holds the connection type
            # (e.g. 'DSL', 'Fiber optic') or 'No' -- never 'Yes' -- so a
            # "== 'Yes'" test would flag every customer as having no internet.
            has_service = df[col] != 'No'
        else:
            has_service = df[col] == 'Yes'
        # int64 keeps these flags inside `numeric_cols` exactly as before.
        df[new_col] = has_service.astype('int64')
        bin_cols.append(new_col)

    df['Services_Count'] = df[bin_cols].sum(axis=1)

    # 7) Ratio features; a zero denominator is replaced by 1 to avoid
    #    division by zero.
    # Monthly charge per subscribed service.
    df['Monthly_per_Service'] = df['MonthlyCharges'] / df['Services_Count'].replace(0, 1)

    # Monthly charge relative to tenure.
    df['Charge_per_Tenure'] = df['MonthlyCharges'] / df['tenure'].replace(0, 1)

    # Average amount paid per month of tenure.
    df['Avg_Monthly_Total'] = df['TotalCharges'] / df['tenure'].replace(0, 1)

    # 8) Split features and target.
    target = 'Churn'
    y = df[target]
    X = df.drop(target, axis=1)

    # 9) Detect numeric and categorical columns.
    numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    # 10) One-hot encoding. An explicit integer dtype keeps the matrix purely
    #     numeric: recent pandas versions default to bool dummies, and a frame
    #     mixing bool and float columns turns into an object array on `.values`.
    X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True, dtype=int)

    return X_encoded, y, numeric_cols


In [None]:
# Build the feature matrix, the target vector and the list of columns to scale.
X, y, numeric_cols = preprocess_telco_data(df)

# Short sanity report: shapes and the share of the positive class.
for label, value in (
    ("X shape:", X.shape),
    ("y shape:", y.shape),
    ("Pozitif sınıf oranı (Churn=1):", y.mean()),
):
    print(label, value)

print("\nİlk 5 satır:")
display(X.head())

# Columns that the scaling cell will standardise.
print("\nSayısal sütunlar (scale edeceğimiz kolonlar):", numeric_cols, sep="\n")


In [None]:
# Stratified hold-out split: 20% of the rows go to the test set and the churn
# ratio is preserved in both parts. (train_test_split and StandardScaler are
# already imported in the imports cell at the top of the notebook.)
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Churn oranı (train):", y_train.mean())


In [None]:
# Standardise the numeric columns only. The scaler is fitted on the training
# split and then applied unchanged to both splits, so no test-set statistics
# leak into the features.
scaler = StandardScaler()
scaler.fit(X_train[numeric_cols])

X_train_scaled = X_train.copy()
X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])

X_test_scaled = X_test.copy()
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

print("Scale edilmiş X_train örnek:")
display(X_train_scaled.head())


In [None]:
# Oversample the training split only (the test split stays untouched).
# SMOTE and ADASYN are already imported in the imports cell at the top.
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)

X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_scaled, y_train)

# Class counts before and after each resampling strategy.
for caption, target in (
    ("Orijinal veri:", y_train),
    ("\nSMOTE sonrası:", y_train_smote),
    ("\nADASYN sonrası:", y_train_adasyn),
):
    print(caption, target.value_counts())


In [None]:
def evaluate_model(model, X_test, y_test, name="Model"):
    """Score a fitted classifier on held-out data, print a report, return it.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_test, y_test : features and true labels of the evaluation set.
    name : str
        Label used in the printed header and stored under ``"Model"``.

    Returns
    -------
    dict
        Keys ``Model``, ``Accuracy``, ``Precision``, ``Recall``, ``F1``, ``AUC``.
        AUC is computed from the second column of ``predict_proba``.
    """
    predicted = model.predict(X_test)

    scores = {
        "Model": name,
        "Accuracy": accuracy_score(y_test, predicted),
        "Precision": precision_score(y_test, predicted),
        "Recall": recall_score(y_test, predicted),
        "F1": f1_score(y_test, predicted),
        # Column 1 of predict_proba is used as the ranking score for AUC.
        "AUC": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
    }

    print(f"\n{name} Skorları:")
    print("-" * 40)
    for key in ("Accuracy", "Precision", "Recall", "F1", "AUC"):
        # The printed report labels F1 as "F1 Score"; the dict key stays "F1".
        heading = "F1 Score" if key == "F1" else key
        print(f"{heading}:", scores[key])

    return scores


In [None]:
# Score dictionaries of every model are accumulated here across the cells.
results = []

# Same logistic regression, fitted on three variants of the training data.
lr_runs = [
    ("LR - Original", X_train_scaled, y_train),
    ("LR - SMOTE", X_train_smote, y_train_smote),
    ("LR - ADASYN", X_train_adasyn, y_train_adasyn),
]

lr_fitted = []
for run_name, X_fit, y_fit in lr_runs:
    clf = LogisticRegression(max_iter=1000, random_state=42)
    clf.fit(X_fit, y_fit)
    results.append(evaluate_model(clf, X_test_scaled, y_test, run_name))
    lr_fitted.append(clf)

# Keep the individual models under their usual names for later cells.
lr, lr_sm, lr_ad = lr_fitted


In [None]:
# Random forest (200 trees) on the original, SMOTE and ADASYN training sets.
rf_runs = [
    ("RF - Original", X_train_scaled, y_train),
    ("RF - SMOTE", X_train_smote, y_train_smote),
    ("RF - ADASYN", X_train_adasyn, y_train_adasyn),
]

rf_fitted = []
for run_name, X_fit, y_fit in rf_runs:
    forest = RandomForestClassifier(n_estimators=200, random_state=42)
    forest.fit(X_fit, y_fit)
    results.append(evaluate_model(forest, X_test_scaled, y_test, run_name))
    rf_fitted.append(forest)

# Expose the fitted models under the same names as before.
rf, rf_sm, rf_ad = rf_fitted


In [None]:
# Baseline XGBoost with hand-picked hyper-parameters, trained on the
# original (non-resampled) training data.
xgb_settings = {
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 5,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "eval_metric": "logloss",
    "random_state": 42,
}

xgb = XGBClassifier(**xgb_settings)
xgb.fit(X_train_scaled, y_train)

xgb_scores = evaluate_model(xgb, X_test_scaled, y_test, "XGBoost - Original")
results.append(xgb_scores)


In [None]:
# Comparison table of all scores collected so far (pandas is already imported
# as `pd` in the imports cell at the top).
results_df = pd.DataFrame(data=results)
results_df


In [None]:
# LightGBM models: one shared parameter set, three training-data variants.
lgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    random_state=42,
)

lgb_runs = [
    ("LightGBM - Original", X_train_scaled, y_train),
    ("LightGBM - SMOTE", X_train_smote, y_train_smote),
    ("LightGBM - ADASYN", X_train_adasyn, y_train_adasyn),
]

lgb_fitted = []
for run_name, X_fit, y_fit in lgb_runs:
    booster = LGBMClassifier(**lgb_params)
    booster.fit(X_fit, y_fit)
    results.append(evaluate_model(booster, X_test_scaled, y_test, run_name))
    lgb_fitted.append(booster)

# Later cells refer to these models by name (e.g. `lgb_sm` for SHAP).
lgb, lgb_sm, lgb_ad = lgb_fitted


In [None]:
# RBF-kernel SVM on the original training data. probability=True is required
# because evaluate_model calls predict_proba to compute the AUC.
svm = SVC(kernel='rbf', probability=True, random_state=42)
svm.fit(X_train_scaled, y_train)

svm_scores = evaluate_model(svm, X_test_scaled, y_test, "SVM")
results.append(svm_scores)


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Base estimator for the search. `use_label_encoder` is intentionally not
# passed: it is deprecated and ignored by current XGBoost releases (it only
# produces an "unused parameter" warning), and the baseline XGBoost cell
# does not set it either.
xgb_base = XGBClassifier(
    eval_metric="logloss",
    random_state=42
)

# Candidate values sampled by the randomized search.
param_grid = {
    "n_estimators": [200, 300, 400, 500],
    "max_depth": [3, 4, 5, 6, 7],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.7, 0.8, 0.9, 1.0],
    "colsample_bytree": [0.7, 0.8, 0.9, 1.0],
    "gamma": [0, 1, 3, 5]
}

# 20 random configurations, 3-fold CV each, ranked by recall on the
# positive (churn) class.
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_base,
    param_distributions=param_grid,
    n_iter=20,
    scoring="recall",
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

random_search_xgb.fit(X_train_scaled, y_train)

print("Best Params:", random_search_xgb.best_params_)
# best_estimator_ is the winning configuration returned by the search.
best_xgb = random_search_xgb.best_estimator_

results.append(evaluate_model(best_xgb, X_test_scaled, y_test, "XGBoost - Tuned"))


In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Randomized hyper-parameter search for the random forest, optimising recall.
rf_base = RandomForestClassifier(random_state=42)

rf_param_grid = dict(
    n_estimators=[200, 300, 400, 500],
    max_depth=[None, 10, 15, 20, 25],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

# 20 sampled configurations, each scored with 3-fold cross-validation.
rf_search_options = dict(
    n_iter=20,
    scoring='recall',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)
rf_random_search = RandomizedSearchCV(
    estimator=rf_base,
    param_distributions=rf_param_grid,
    **rf_search_options
)
rf_random_search.fit(X_train_scaled, y_train)

print("RF Best Params:", rf_random_search.best_params_)
best_rf = rf_random_search.best_estimator_

best_rf_scores = evaluate_model(best_rf, X_test_scaled, y_test, "RandomForest - Tuned")
results.append(best_rf_scores)


In [None]:
# Rebuild the comparison table with all models and rank it by recall,
# best first.
results_df = pd.DataFrame(data=results)
results_df.sort_values(by="Recall", ascending=False)


In [None]:
!pip install lime --quiet


In [None]:
from lime.lime_tabular import LimeTabularExplainer

# Load SHAP's JavaScript so its interactive plots can render in the notebook
# (shap itself is already imported in the imports cell at the top).
shap.initjs()

# Column names of the encoded feature matrix, used to label SHAP/LIME output.
feature_names = list(X.columns)
print("SHAP ve LIME başarıyla yüklendi.")

In [None]:
# SHAP values for the tuned XGBoost model.
explainer_xgb = shap.TreeExplainer(best_xgb)

# Work on a plain NumPy array rather than the DataFrame. The explicit float
# dtype matters: when the frame mixes float columns with bool one-hot columns,
# `.values` yields an object-dtype array, which numeric code cannot consume
# reliably. Casting keeps every value (True/False become 1.0/0.0).
X_test_array = X_test_scaled.to_numpy(dtype=float)

shap_values_xgb = explainer_xgb(X_test_array)

# Global feature-importance plot.
shap.summary_plot(shap_values_xgb, X_test_array, feature_names=feature_names)


In [None]:
# Row of the test set to explain; change it to inspect another customer.
i = 0

# Per-feature contribution breakdown for that single prediction.
sample_explanation = shap_values_xgb[i]
shap.plots.waterfall(sample_explanation)


In [None]:
# SHAP values for the LightGBM model trained on SMOTE-resampled data.
explainer_lgb = shap.TreeExplainer(lgb_sm)

# Same test rows as a float array. The explicit dtype avoids the object-dtype
# array that `.values` produces when float and bool columns are mixed.
X_test_array = X_test_scaled.to_numpy(dtype=float)

shap_values_lgb = explainer_lgb(X_test_array)

# Global feature-importance plot.
shap.summary_plot(shap_values_lgb, X_test_array, feature_names=feature_names)

# Local explanation for a single test row.
# NOTE(review): waterfall expects a single-output explanation; confirm the
# installed shap version returns one output for this binary LightGBM model.
i = 0
shap.plots.waterfall(shap_values_lgb[i])


In [None]:
from lime.lime_tabular import LimeTabularExplainer

# LIME explainer built from the scaled training data. The data is passed as a
# float array: `.values` on a frame that mixes float and bool columns yields
# an object-dtype array, which is unsuitable for LIME's numeric statistics.
lime_explainer = LimeTabularExplainer(
    training_data=X_train_scaled.to_numpy(dtype=float),
    feature_names=feature_names,
    class_names=["No Churn", "Churn"],
    mode="classification"
)


In [None]:
i = 0  # change this to explain a different test row

# A single row taken from a frame with mixed column dtypes is an object-dtype
# Series; cast it to float so LIME and the model receive numeric input.
exp_lr = lime_explainer.explain_instance(
    X_test_scaled.iloc[i].to_numpy(dtype=float),
    lr_ad.predict_proba,
    num_features=10
)

# Render the explanation inside the notebook.
exp_lr.show_in_notebook()

# The same explanation as a list of (feature, weight) pairs.
exp_lr.as_list()
