Import Libraries

In [None]:
# data processing
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("whitegrid")

# cross validation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#ML algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm

import xgboost as xgb

# import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, f1_score,recall_score, RocCurveDisplay,precision_recall_curve, auc, classification_report

# model optimization
from sklearn.model_selection import GridSearchCV


Data Loading

In [None]:
# Relative path of the raw input file.
csv = r"../raw/raw.csv"

# Read the file into the working frame; the bare name on the last line
# renders the frame as the cell output.
df = pd.read_csv(filepath_or_buffer=csv)
df

Data Cleansing

In [None]:
# Summarise the frame's columns, non-null counts and dtypes.
df.info()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# A notebook cell only renders its LAST expression, so the per-column
# missing-value counts are shown explicitly with the notebook's built-in
# `display`; previously they were computed and silently discarded.
display(df.isna().sum())

# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Show the duplicated rows before they are removed. A bare expression in the
# middle of a cell is not rendered, so the notebook's built-in `display` is
# called explicitly.
display(df[df.duplicated()])

# Drop duplicates and reset index
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# The customer identifier is removed from the frame.
df = df.drop(columns="customerID")

# TotalCharges is parsed as numeric; values that cannot be parsed become NaN
# and are then replaced by the median of the successfully parsed values.
total_charges = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges
df["TotalCharges"] = df["TotalCharges"].fillna(total_charges.median())

Exploratory Data Analysis

In [None]:
# Numeric columns are collected from the frame's current dtypes.
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()

# Columns treated as categorical (order is kept as originally listed).
cat_cols = [
    'gender', 'Partner', 'Dependents',
    'PhoneService', 'MultipleLines', 'InternetService',
    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
    'TechSupport', 'StreamingTV', 'StreamingMovies',
    'Contract', 'PaperlessBilling', 'PaymentMethod',
]

# Cast the listed columns to pandas' category dtype.
df[cat_cols] = df[cat_cols].astype('category')

In [None]:
# A cell renders only its last expression, so the column index is shown
# explicitly with the notebook's built-in `display`; previously it was
# evaluated and discarded.
display(df.columns)

# Summary statistics for the category-typed columns only.
df.describe(include='category')


In [None]:
# Replace every column in `cat_cols` with indicator columns, dropping the
# first level of each one.
df = pd.get_dummies(
    data=df,
    columns=cat_cols,
    drop_first=True,
)

In [None]:
# Share of each Churn value in the dataset, drawn as a pie chart.
churn_counts = df['Churn'].value_counts()
pie_graph = churn_counts.plot(kind='pie', autopct='%1.1f%%')

# Force every text element on the axes to black.
for text_item in pie_graph.texts:
    text_item.set_color("black")

pie_graph.set_ylabel("")
pie_graph.set_title("Churn Distribution")
plt.show()


In [None]:
# One boxplot row per numeric column.
# squeeze=False makes `plt.subplots` always return a 2-D array of Axes.
# Without it a single numeric column yields a bare Axes object and the
# indexing below raises a TypeError.
fig, ax = plt.subplots(
    ncols=1,
    nrows=len(num_cols),
    figsize=(12, 4*len(num_cols)),
    squeeze=False,
)

for i, col in enumerate(num_cols):
    sns.boxplot(data=df, x=col, ax=ax[i, 0])

plt.tight_layout()
plt.show()

In [None]:
# Count, per numeric column, the values whose absolute z-score exceeds the
# threshold below.
z_thresh = 3

for col in num_cols:
    values = df[col]
    # Standardise the column with its own mean and standard deviation.
    z = (values - values.mean()) / values.std()
    outliers = z.abs().gt(z_thresh).sum()
    print(col, "outliers:", outliers)


In [None]:
# Distribution of tenure in 30 bins, used to pick sensible bin edges.
tenure_ax = df['tenure'].hist(bins=30, edgecolor='black')
tenure_ax.set_xlabel('Tenure Period')
tenure_ax.set_ylabel('Count')
tenure_ax.set_title('Distribution of Tenure')
plt.show()

# Descriptive statistics for the same column (rendered as the cell output).
df['tenure'].describe()


In [None]:
# Distribution of MonthlyCharges in 30 bins, used to pick sensible bin edges.
monthly_ax = df['MonthlyCharges'].hist(bins=30, edgecolor='black')
monthly_ax.set_xlabel('MonthlyCharges')
monthly_ax.set_ylabel('Count')
monthly_ax.set_title('Distribution of MonthlyCharges')
plt.show()

# Descriptive statistics for the same column (rendered as the cell output).
df['MonthlyCharges'].describe()


In [None]:
# Distribution of TotalCharges in 30 bins, used to pick sensible bin edges.
total_ax = df['TotalCharges'].hist(bins=30, edgecolor='black')
total_ax.set_xlabel('TotalCharges')
total_ax.set_ylabel('Count')
total_ax.set_title('Distribution of TotalCharges')
plt.show()

# Descriptive statistics for the same column (rendered as the cell output).
df['TotalCharges'].describe()


Data Preprocessing (2)

In [None]:
# Encode the label as 1 ("Yes") / 0 ("No") after trimming whitespace.
# Any other value maps to NaN.
df["Churn_num"] = df["Churn"].astype(str).str.strip().map({"Yes": 1, "No": 0})

# Report how many labels could not be mapped. This count was previously
# computed mid-cell and never shown, so unmapped labels went unnoticed.
print("Unmapped Churn values:", df["Churn_num"].isna().sum())

# Every numeric / boolean column, including Churn_num itself, so the target
# appears in the correlation matrix.
feature_cols = df.select_dtypes(include=["number", "bool"]).columns.tolist()

# Correlation heat map
corr = df[feature_cols].corr()
f, ax = plt.subplots(figsize=(18, 16))
sns.heatmap(corr, annot=True, fmt=".2f", cmap='YlGnBu', ax=ax)
plt.show()

In [None]:
# Single-column view of the correlation matrix: every feature's correlation
# with Churn_num, with the target's own row removed, highest first.
churn_fig, churn_ax = plt.subplots(figsize=(6, 6))

churn_corr = corr[["Churn_num"]]
churn_corr = churn_corr.drop("Churn_num")
churn_corr = churn_corr.sort_values("Churn_num", ascending=False)

sns.heatmap(
    churn_corr,
    annot=True,
    fmt=".2f",
    cmap="YlGnBu",
    linewidths=0.2,
    linecolor="white",
    ax=churn_ax,
)
churn_ax.set_title("Correlation with Churn")
churn_fig.tight_layout()
plt.show()


In [None]:
# Keep the features whose absolute correlation with Churn_num exceeds 0.25,
# then list them from most positive to most negative.
strong_mask = churn_corr['Churn_num'].abs() > 0.25
high_corr = churn_corr[strong_mask]
high_corr.sort_values(by='Churn_num', ascending=False)

Split Train-Test Data

In [None]:
# Features are every column except the raw label and its numeric encoding.
# `DataFrame.drop` already returns a new frame, so no explicit copy of `df`
# is needed first.
X_raw = df.drop(columns=['Churn_num', 'Churn'])

X = X_raw
y = df['Churn_num'].values

# Hold out 20% for testing. `stratify=y` keeps the churn / non-churn ratio
# the same in both partitions and matches the stratified split used in the
# feature-importance section of this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print('Train - x:', X_train.shape, 'y:', y_train.shape)
print('Test - x:', X_test.shape, 'y:', y_test.shape)



Feature scaling (standardization) - scales the features so that they have zero mean and unit variance

In [None]:
# Minimum and maximum of every numeric column, ordered by the minimum, to
# show how different the column ranges are.
num_df = df[num_cols]

range_table = pd.DataFrame(
    {
        "column": num_df.columns,
        "min": num_df.min().to_numpy(),
        "max": num_df.max().to_numpy(),
    }
)
summary = range_table.sort_values(by="min")

summary


In [None]:
# Learn the scaling statistics from the training partition only, then apply
# the same transformation to both partitions.
# NOTE(review): the cells visible after this one fit their models on
# X_train / X_test rather than on the scaled arrays - confirm this is intended.
ss = StandardScaler().fit(X_train)

X_train_ss = ss.transform(X_train)
X_test_ss = ss.transform(X_test)

In [None]:
# Candidate classifiers keyed by display name. Insertion order is the order
# in which they are later trained and reported.
list_of_models = {}
list_of_models['logistic_regression'] = LogisticRegression(
    random_state=42, max_iter=10000
)
list_of_models['decision_tree'] = DecisionTreeClassifier(
    max_depth=5, random_state=42
)
list_of_models['Random_forest'] = RandomForestClassifier(
    n_estimators=150, max_depth=4, random_state=42
)
list_of_models['GBDT'] = GradientBoostingClassifier(
    n_estimators=150, max_depth=4, random_state=42
)
list_of_models["XGBoost"] = xgb.XGBClassifier(
    n_estimators=200, max_depth=5, random_state=42
)

Model Building

In [None]:
def _report_split(split_title, y_true, y_pred, macro_f1, macro_recall=None):
    """Print the macro scores and draw the confusion matrix for one data split.

    Parameters
    ----------
    split_title : str
        Heading printed before the scores.
    y_true, y_pred : array-like
        True and predicted labels for the split.
    macro_f1 : float
        Macro-averaged F1 already computed for the split.
    macro_recall : float, optional
        Macro-averaged recall; printed only when supplied.
    """
    print(split_title)
    print("\nMacro F1 Score:", macro_f1)
    if macro_recall is not None:
        print("\nMacro Recall:", macro_recall)

    print("\nConfusion Matrix:")
    cm = metrics.confusion_matrix(y_true, y_pred)
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = [False, True])
    cm_display.plot()
    plt.show()


# Per-model scores, kept both as parallel lists and as rows of the summary
# table built at the end of the cell.
f1_train_scores = []
f1_test_scores = []
recall_test_scores = []
results = []

# Each model is fitted and scored exactly once. Previously a second loop
# re-fitted every model only to recompute the same scores for the table.
# NOTE(review): models are fitted on X_train as-is; the standardised arrays
# X_train_ss / X_test_ss are not used here - confirm this is intended.
for name, model in list_of_models.items():
    print("\nFor Model:", name)
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    f1_train = f1_score(y_train, y_train_pred, average='macro')
    f1_test = f1_score(y_test, y_test_pred, average='macro')
    recall_test_score = recall_score(y_test, y_test_pred, average='macro')

    _report_split("\nFor Training Set:", y_train, y_train_pred, f1_train)
    # The test recall was previously computed but never printed.
    _report_split("For Test Set:", y_test, y_test_pred, f1_test, recall_test_score)

    f1_train_scores.append(f1_train)
    f1_test_scores.append(f1_test)
    recall_test_scores.append(recall_test_score)

    results.append({
        "Model": name,
        "F1 Train (macro)": f1_train,
        "F1 Test (macro)": f1_test,
        "Recall Test (macro)": recall_test_score
    })

# Summary table, best test macro-F1 first.
results_df = pd.DataFrame(results).sort_values(by="F1 Test (macro)", ascending=False)
results_df

Results: Logistic Regression has the best test macro-F1 (0.761), and its test macro-recall is also the highest.
Overfitting is likely: compare the train and test macro-F1 scores in the table above.


Evaluate Logistic Regression properly:

In [None]:
# Stand-alone logistic regression fitted on the training partition.
# `fit` returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(max_iter=2000).fit(X_train, y_train)
log_reg

In [None]:
# Hard class predictions for both partitions.
y_train_pred, y_test_pred = (
    log_reg.predict(features) for features in (X_train, X_test)
)

# Second column of predict_proba: probability of churn=1.
test_probabilities = log_reg.predict_proba(X_test)
y_test_prob = test_probabilities[:, 1]


In [None]:
# Confusion matrix of the logistic regression on the test partition.
cm_report = confusion_matrix(y_test, y_test_pred)

# `ConfusionMatrixDisplay` is not imported as a bare name in this notebook
# (only the `metrics` module is), so it is reached through `metrics` to avoid
# a NameError - the same access pattern the model-building cell uses.
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_report, display_labels=[0, 1])
disp.plot()
plt.show()

In [None]:
# --- ROC: area under the curve, then the curve itself --------------------
roc_auc = roc_auc_score(y_test, y_test_prob)
print("ROC-AUC:", roc_auc)

roc_display = RocCurveDisplay.from_predictions(y_test, y_test_prob)
roc_display.ax_.set_title("ROC Curve (Test)")
plt.show()

# --- Precision-recall: area under the curve, then the curve itself -------
precision, recall, thresholds = precision_recall_curve(y_test, y_test_prob)
pr_auc = auc(recall, precision)
print("PR-AUC:", pr_auc)

pr_fig, pr_ax = plt.subplots()
pr_ax.plot(recall, precision)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curve (Test)")
plt.show()


Hyperparameter tuning

In [None]:
# Search space: regularisation strength, solver and class weighting, with
# the L2 penalty throughout.
param_grid = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["lbfgs", "liblinear"],
    class_weight=[None, "balanced"],
)

# 5-fold cross-validated search scored by macro-F1, using every available core.
base_log_reg = LogisticRegression(max_iter=5000)
grid = GridSearchCV(
    estimator=base_log_reg,
    param_grid=param_grid,
    scoring="f1_macro",
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# Estimator with the best-scoring parameter combination.
best_log_reg = grid.best_estimator_


In [None]:
# Score the tuned estimator on the held-out partition.
y_test_pred = best_log_reg.predict(X_test)
y_test_prob = best_log_reg.predict_proba(X_test)[:, 1]

test_f1_macro = f1_score(y_test, y_test_pred, average="macro")
test_roc_auc = roc_auc_score(y_test, y_test_prob)

print("Test F1 Macro:", test_f1_macro)
print("Test ROC-AUC:", test_roc_auc)
print(classification_report(y_test, y_test_pred))


Feature importance

In [None]:
# Full feature matrix and label, also used by the later cells that train and
# save the final model.
X = df.drop(columns=["Churn", "Churn_num"])
y = df["Churn_num"]

# No new train/test split is made here. The previous version re-split with
# different settings and overwrote X_train / X_test / y_train / y_test with a
# partition `best_log_reg` was never fitted on, while the new partition was
# not used by this cell at all.

# Guard the pairing below: one coefficient per feature column.
assert len(X.columns) == best_log_reg.coef_.shape[1], (
    "feature columns and model coefficients are misaligned"
)

coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": best_log_reg.coef_[0]
})

# Rank by magnitude.
# NOTE(review): the model was fitted on X_train rather than the standardised
# arrays, so coefficient magnitudes depend on each feature's scale - confirm
# before reading this ranking as importance.
coef_df["AbsCoeff"] = coef_df["Coefficient"].abs()
coef_df = coef_df.sort_values("AbsCoeff", ascending=False)

coef_df.head(15)


Train Model

In [None]:
# Rebuild the estimator from the best grid-search parameters and fit it on
# the full dataset (X, y) instead of the training partition only.
final_model = LogisticRegression(
    max_iter=5000,
    **grid.best_params_,
)
final_model.fit(X, y)


In [None]:
import os
import joblib

# Output directory for the serialised artefacts. It is created when missing,
# because `joblib.dump` fails if the target directory does not exist.
model_dir = "../../models"
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted estimator and the feature column order it was fitted with.
joblib.dump(final_model, os.path.join(model_dir, "logistic_churn_model.pkl"))
joblib.dump(X.columns.tolist(), os.path.join(model_dir, "model_features.pkl"))
