**Customer Churn Prediction Model**

**1.** **Importing the Dependencies**

In [None]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold, cross_validate
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
%matplotlib inline

In [None]:
# Colab-only step: opens the browser file picker and uploads the chosen files into the
# runtime; `uploaded` holds whatever files.upload() returns for them.
from google.colab import files
uploaded = files.upload()

**2.** **Data Loading and Understanding**

In [None]:
# Load the csv data to pandas dataframe
df = pd.read_csv("/content/Telco-Customer-Churn.csv")
# Only the last expression of a cell is displayed, so the shape must be printed explicitly
print(df.shape)
df.head()

In [None]:
# Do not truncate the column list when a DataFrame is displayed
pd.set_option('display.max_columns', None)
# Summary of the columns (dtypes, non-null counts)
df.info()

In [None]:
# Drop the customerID column, as it is not required for modelling
df = df.drop("customerID", axis = 1)
df.head(2)

In [None]:
# Print the distinct values of every column except the numerical ones

numerical_features_list = ["tenure", "MonthlyCharges", "TotalCharges"]

for col in (c for c in df.columns if c not in numerical_features_list):
  print(col, df[col].unique())
  print("-" * 50)

# Number of null entries per column
print(df.isnull().sum())

In [None]:
# Handle missing values in TotalCharges:
# entries that are a single blank (" ") become "0.0", then the whole column is cast to float
df["TotalCharges"] = df["TotalCharges"].replace(" ", "0.0").astype(float)
df.info()

In [None]:
# Check the class distribution of the target column (row count per Churn value)
print(df["Churn"].value_counts())

**Insights:**
1. Customer ID removed as it is not required for modelling.
2. `isnull()` reports no null values in the dataset; however, some TotalCharges entries are blank strings.
3. The blank values in the TotalCharges column were replaced with 0.
4. Class imbalance identified in the target.

**3. Exploratory Data Analysis (EDA)**

**Numerical Features - Analysis**

Understanding the distribution of the numerical features

In [None]:
def plot_histogram(df, column_name):
  """Plot a histogram with a KDE curve for one column and mark its mean and median.

  Args:
    df: DataFrame that contains the column.
    column_name: Name of the column to plot.
  """
  values = df[column_name]

  plt.figure(figsize=(5, 3))
  sns.histplot(values, kde = True)
  plt.title(f"Distribution of {column_name}")

  # vertical reference lines: dashed red for the mean, solid green for the median
  reference_lines = (
    (values.mean(), "red", "--", "Mean"),
    (values.median(), "green", "-", "Median"),
  )
  for position, colour, dash, caption in reference_lines:
    plt.axvline(position, color = colour, linestyle = dash, label = caption)

  plt.legend()
  plt.show()

In [None]:
# Distribution of tenure, with mean/median markers
plot_histogram(df, "tenure")

In [None]:
# Distribution of MonthlyCharges, with mean/median markers
plot_histogram(df, "MonthlyCharges")

In [None]:
# Distribution of TotalCharges, with mean/median markers
plot_histogram(df, "TotalCharges")

***Box plot for numerical features***

In [None]:
def plot_boxplot(df, column_name):
  """Draw a box plot for one column of the DataFrame.

  Args:
    df: DataFrame that contains the column.
    column_name: Name of the column to plot; also used as the y-axis label.
  """
  plt.figure(figsize=(5, 3))
  sns.boxplot(df[column_name])
  plt.title(f"Box plot of { column_name }")
  plt.ylabel(column_name)
  # plt.show must be called; the bare attribute reference `plt.show` does nothing
  plt.show()

In [None]:
# Box plot of tenure
plot_boxplot(df, "tenure")

In [None]:
# Box plot of MonthlyCharges
plot_boxplot(df, "MonthlyCharges")

In [None]:
# Box plot of TotalCharges
plot_boxplot(df, "TotalCharges")

**Correlation HeatMap for numerical columns**

In [None]:
# Correlation matrix of the three numerical columns, drawn as an annotated heatmap
plt.figure(figsize=(8, 4))
sns.heatmap(
  df[["tenure", "MonthlyCharges", "TotalCharges"]].corr(),
  annot = True,   # write the coefficient inside every cell
  fmt = ".2f",
  cmap = "coolwarm",
)
plt.title("Correlation Heatmap")
plt.show()

Categorical features - Analysis

CountPlot for categorical columns

In [None]:
# SeniorCitizen is not an object-dtype column, so it is added by hand
# in front of the object-dtype columns
object_cols = ["SeniorCitizen", *df.select_dtypes(include = "object").columns]

# one count plot per categorical column
for col in object_cols:
  plt.figure(figsize=(5, 3))
  sns.countplot(data = df, x = col)
  plt.title(f"Count Plot of {col}")
  plt.show()


**4. Data Preprocessing**

Label encoding of target column

In [None]:
# Map the target explicitly to integers (Yes -> 1, No -> 0).
# Series.replace relied on silently downcasting the object column, which recent pandas
# deprecates; if "Churn" stayed an object column it would be label-encoded together with
# the features below. astype(int) fails loudly if any value other than "Yes"/"No" is
# present (for example when this cell is executed a second time).
df["Churn"] = df["Churn"].map({"Yes": 1, "No": 0}).astype(int)

Label encoding of categorical features

In [None]:
# Identify the columns with object data type; these are the ones label-encoded below
object_columns = df.select_dtypes(include = "object").columns

In [None]:
# Show which columns were selected for encoding
print(object_columns)

In [None]:
# One fitted LabelEncoder per categorical column, keyed by column name, so that the
# same mapping can be re-applied to new data at prediction time
encoders = {}

for column in object_columns:
  # fit on the column, remember the encoder, then overwrite the column with its integer codes
  label_encoder = LabelEncoder().fit(df[column])
  encoders[column] = label_encoder
  df[column] = label_encoder.transform(df[column])

# Persist the encoders to a pickle file
with open("encoders.pkl", "wb") as f:
  pickle.dump(encoders, f)

In [None]:
# Display the dictionary of fitted encoders (column name -> LabelEncoder)
encoders

**Training and test data split**

In [None]:
# Separate the target (Churn) from the feature columns
y = df["Churn"]
x = df.drop("Churn", axis = 1)

In [None]:
# Split training and test data (80 % / 20 %).
# The target is imbalanced, so stratify on it: both splits then keep the same
# churn / no-churn ratio as the full dataset.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
# Number of training labels
print(y_train.shape)

In [None]:
# Class distribution of the training labels before resampling
print(y_train.value_counts())

**Synthetic Minority Oversampling Technique (SMOTE)**

In [None]:
# Handle imbalance using SMOTE
# Only the training split is resampled; x_test / y_test are left untouched.
# NOTE(review): the categorical columns were label-encoded to integers above, and SMOTE
# presumably treats them as continuous values when generating samples — consider
# SMOTENC for mixed data; confirm against the imbalanced-learn documentation.
smote = SMOTE(random_state = 42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)

In [None]:
# Number of training labels after SMOTE
print(y_train_smote.shape)

In [None]:
# Class distribution of the training labels after SMOTE
print(y_train_smote.value_counts())

**5. Model Training**

Training with default hyper parameters

In [None]:
# Dictionary of candidate models (display name -> unfitted estimator).
# All use default hyper-parameters; random_state is fixed for reproducibility.
models = {
    "Decision Tree": DecisionTreeClassifier(random_state = 42),
    "Random Forest": RandomForestClassifier(random_state = 42),
    "XGBoost": XGBClassifier(random_state = 42)
}

In [None]:
# Dictionary to store cross validation results (model name -> array of fold accuracies)
cv_scores = {}

# Perform 5-fold cross validation for each model.
# SMOTE is a step of an imblearn Pipeline, so it is re-fit on the training part of every
# fold and the validation part contains only original rows. Cross-validating on data that
# was resampled before splitting lets synthetic points derived from validation rows leak
# into the training folds and inflates the scores.
for model_name, model in models.items():
  print(f"Training {model_name} with default parameters")
  pipeline = Pipeline(steps = [("smote", SMOTE(random_state = 42)), ("model", model)])
  scores = cross_val_score(pipeline, x_train, y_train, cv = 5, scoring = "accuracy")
  cv_scores[model_name] = scores
  print(f"{model_name} cross validation accuracy: {np.mean(scores):.2f}")
  print("-" * 70)

In [None]:
# Display the per-fold accuracies of every model
cv_scores

**Random Forest gives the highest accuracy compared to the other models with default parameters**

In [None]:
# Random Forest with default hyper-parameters and a fixed seed
rfc = RandomForestClassifier(random_state = 42)

In [None]:
# Train on the SMOTE-resampled training data
rfc.fit(x_train_smote, y_train_smote)

**6. Model Evaluation**

In [None]:
# Evaluate on the held-out test data; y_test_pred is reused by later comparison cells
y_test_pred = rfc.predict(x_test)

print(f"Accuracy Score:\n {accuracy_score(y_test, y_test_pred)}")
print(f"Confusion Matrix:\n {confusion_matrix(y_test, y_test_pred)}")
print(f"Classification Report:\n {classification_report(y_test, y_test_pred)}")

In [None]:
# Save the trained model together with the feature names it was trained on,
# bundled in a single pickle file
model_data = dict(model = rfc, features_names = list(x.columns))

with open("customer_churn_model.pkl", "wb") as f:
  pickle.dump(model_data, f)

**7. Load the saved model and build a predictive system**

In [None]:
# Load the saved bundle and unpack the model and the feature names

with open("customer_churn_model.pkl", "rb") as f:
  model_data = pickle.load(f)

# the file is only needed for reading; unpacking happens after it is closed
loaded_model, feature_names = model_data["model"], model_data["features_names"]

In [None]:
# Confirm the model was restored from the pickle
print(loaded_model)

In [None]:
# Feature names stored alongside the model
print(feature_names)

In [None]:
# A single customer record with raw (un-encoded) values
sample_input = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 1,
    'PhoneService': 'No',
    'MultipleLines': 'No phone service',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 29.85
}

input_data_df = pd.DataFrame([sample_input])

# NOTE: pickle.load executes arbitrary code; only load encoder files produced by this notebook
with open("encoders.pkl", "rb") as f:
  encoders = pickle.load(f)


# encode categorical features using the saved encoders
for column, encoder in encoders.items():
  input_data_df[column] = encoder.transform(input_data_df[column])

# present the columns in exactly the order stored with the model, so the
# prediction does not depend on the key order of sample_input
input_data_df = input_data_df[feature_names]

# make a prediction
prediction = loaded_model.predict(input_data_df)
pred_prob = loaded_model.predict_proba(input_data_df)

# print the prediction
print(prediction)

# results
print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
print(f"Prediction Probability: { pred_prob }")

In [None]:
# Display the encoders as re-loaded from encoders.pkl
encoders

**8. Model Improvements and Advanced Evaluation**

In [None]:
# Alternative to SMOTE: balance the training split with a RandomUnderSampler
# instead of generating synthetic samples. The test split is not touched.
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42)

x_train_rus, y_train_rus = rus.fit_resample(x_train, y_train)
print("After downsampling class distribution:", y_train_rus.value_counts())


In [None]:
# Random Forest trained on the downsampled data (fit returns the fitted estimator itself)
rfc_rus = RandomForestClassifier(random_state=42).fit(x_train_rus, y_train_rus)

# Test-set predictions; reused by the comparison cells further down
y_test_pred_rus = rfc_rus.predict(x_test)

# Test accuracy of the downsampled model next to the SMOTE-trained model (rfc from above)
print("Downsampled RF Accuracy:", accuracy_score(y_test, y_test_pred_rus))
print("SMOTE RF Accuracy:", accuracy_score(y_test, y_test_pred))

In [None]:
# Stratified K-Fold Cross Validation
from sklearn.model_selection import StratifiedKFold, cross_validate
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models_for_cv = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
}
# model name -> dict of per-fold train/test scores and timings returned by cross_validate
cv_results = {}

for name, model in models_for_cv.items():
    # SMOTE runs inside the pipeline, i.e. on the training part of each fold only.
    # Validating on folds cut from already-resampled data would leak synthetic
    # neighbours of the validation rows into training and overstate every metric.
    pipeline = Pipeline(steps=[("smote", SMOTE(random_state=42)), ("model", model)])
    results = cross_validate(pipeline, x_train, y_train, cv=skf,
                             scoring=['accuracy','precision','recall','f1'], return_train_score=True, n_jobs=-1)
    cv_results[name] = results
    print(f"{name} - Mean Train Acc: {results['train_accuracy'].mean():.3f}, Mean Test Acc: {results['test_accuracy'].mean():.3f}")
    print(f"{name} - Mean Train F1: {results['train_f1'].mean():.3f}, Mean Test F1: {results['test_f1'].mean():.3f}")

In [None]:
# Hyperparameter Tuning (GridSearchCV) for Random Forest and XGBoost
from sklearn.model_selection import GridSearchCV

# SMOTE is a pipeline step, so during the search every fold is resampled on its training
# part only and candidates are scored on original rows. Searching on data resampled
# before splitting would select hyper-parameters on leaked, over-optimistic folds.
# Parameters of the classifier step are addressed with the "model__" prefix.
rfc_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5]
}
rfc_pipeline = Pipeline(steps=[("smote", SMOTE(random_state=42)),
                               ("model", RandomForestClassifier(random_state=42))])
rfc_grid = GridSearchCV(rfc_pipeline, rfc_param_grid,
                        cv=skf, scoring='f1', n_jobs=-1, verbose=1)
rfc_grid.fit(x_train, y_train)
# strip the step prefix so the printed parameter names are the estimator's own
print("Best RFC params:", {key.split("__", 1)[1]: value for key, value in rfc_grid.best_params_.items()})
# The best pipeline was refit on all of x_train (SMOTE applied inside it). Keep only the
# fitted classifier so the cells below still receive a bare estimator.
best_rfc = rfc_grid.best_estimator_.named_steps["model"]

xgb_param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [3, 6],
    'model__learning_rate': [0.1, 0.01]
}
xgb_pipeline = Pipeline(steps=[("smote", SMOTE(random_state=42)),
                               ("model", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))])
xgb_grid = GridSearchCV(xgb_pipeline,
                        xgb_param_grid, cv=skf, scoring='f1', n_jobs=-1, verbose=1)
xgb_grid.fit(x_train, y_train)
print("Best XGB params:", {key.split("__", 1)[1]: value for key, value in xgb_grid.best_params_.items()})
best_xgb = xgb_grid.best_estimator_.named_steps["model"]

In [None]:
# Check Overfitting: compare train vs test performance for tuned models
from sklearn.metrics import classification_report, roc_auc_score, roc_curve

def evaluate_and_plot(model, X_train, y_train, X_test, y_test, name="Model"):
    """Print train and test classification reports and, when possible, a test ROC curve.

    Args:
        model: Fitted classifier exposing ``predict`` (and optionally ``predict_proba``).
        X_train, y_train: Training features and labels.
        X_test, y_test: Test features and labels.
        name: Label used in the printed header and in the plot title/legend.

    The ROC AUC and curve are produced only if the model has ``predict_proba``;
    the second probability column is used as the score.
    """
    # predict on both splits up front, then report
    reports = (
        ("Train", y_train, model.predict(X_train)),
        ("Test", y_test, model.predict(X_test)),
    )
    print(f"--- {name} ---")
    for split_label, truth, predicted in reports:
        print(f"{split_label} Classification Report:")
        print(classification_report(truth, predicted))

    # ROC AUC needs probability scores
    if not hasattr(model, "predict_proba"):
        return

    scores = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, scores)
    print(f"Test ROC AUC: {auc:.3f}")

    fpr, tpr = roc_curve(y_test, scores)[:2]
    plt.figure(figsize=(6,4))
    plt.plot(fpr, tpr, label=f'{name} (AUC = {auc:.3f})')
    plt.plot([0,1],[0,1],'--')  # diagonal reference line
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.title(f'ROC Curve - {name}')
    plt.legend()
    plt.show()


In [None]:
# Evaluate tuned models (train reports are computed on the SMOTE-resampled training data)
evaluate_and_plot(best_rfc, x_train_smote, y_train_smote, x_test, y_test, name="Tuned Random Forest")
evaluate_and_plot(best_xgb, x_train_smote, y_train_smote, x_test, y_test, name="Tuned XGBoost")

# Test-set predictions of the two tuned models (also used by the F1 comparison cell)
y_test_pred_best_rfc = best_rfc.predict(x_test)
y_test_pred_best_xgb = best_xgb.predict(x_test)

# Visual comparison: test accuracy of every model trained so far
model_scores = [
    (label, accuracy_score(y_test, predictions))
    for label, predictions in (
        ("SMOTE_RF", y_test_pred),
        ("Downsample_RF", y_test_pred_rus),
        ("Tuned_RF", y_test_pred_best_rfc),
        ("Tuned_XGB", y_test_pred_best_xgb),
    )
]

scores_df = pd.DataFrame(model_scores, columns=["Model","Accuracy"])
plt.figure(figsize=(8,4))
sns.barplot(data=scores_df, x="Model", y="Accuracy")
plt.title("Model Accuracy Comparison")
plt.show()

In [None]:
from sklearn.metrics import f1_score

# Test-set F1 score of the same four models, from the predictions computed above
f1_list = [
    (label, f1_score(y_test, predictions))
    for label, predictions in (
        ("SMOTE_RF", y_test_pred),
        ("Downsample_RF", y_test_pred_rus),
        ("Tuned_RF", y_test_pred_best_rfc),
        ("Tuned_XGB", y_test_pred_best_xgb),
    )
]
f1_df = pd.DataFrame(f1_list, columns=["Model","F1"])

plt.figure(figsize=(8,4))
sns.barplot(data=f1_df, x="Model", y="F1")
plt.title("Model F1 Score Comparison")
plt.show()


In [None]:
# Save the tuned Random Forest (best_rfc) as the final model, bundled with the
# feature names, using the same layout as customer_churn_model.pkl
model_data_tuned = {
    "model": best_rfc,
    "features_names": list(x.columns),
}
with open("customer_churn_model_tuned.pkl", "wb") as f:
    pickle.dump(model_data_tuned, f)
print("Tuned model saved to customer_churn_model_tuned.pkl")

**9 — Final Feature Importance & Insights**

In [None]:
# Feature importance (from tuned Random Forest), largest first
importances = best_rfc.feature_importances_
feat_imp = pd.Series(importances, index=x.columns)
feat_imp = feat_imp.sort_values(ascending=False)

plt.figure(figsize=(8,6))
sns.barplot(x=feat_imp.values, y=feat_imp.index)
plt.title("Feature Importance (Tuned Random Forest)")
plt.show()

# Business insights (brief): the five features with the highest importance
print("Top features influencing churn:", feat_imp.head(5).index.tolist())

**10. DashBoard**

In [None]:
# Interactive summary visuals (plotly versions of the two comparison charts)

# Model accuracy comparison; values are shown above the bars with three decimals
fig_acc = px.bar(
    scores_df, x="Model", y="Accuracy", color="Model",
    title="Model Accuracy Comparison", text="Accuracy",
).update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig_acc.show()

# F1 comparison, same layout
fig_f1 = px.bar(
    f1_df, x="Model", y="F1", color="Model",
    title="Model F1 Score Comparison", text="F1",
).update_traces(texttemplate='%{text:.3f}', textposition='outside')
fig_f1.show()

In [None]:
# Feature importance: horizontal bars for the first ten rows of feat_imp
feat_imp_df = pd.DataFrame({"Feature": feat_imp.index, "Importance": feat_imp.values})
fig_feat = px.bar(
    feat_imp_df.head(10),
    x="Importance",
    y="Feature",
    orientation="h",
    title="Top 10 Features Influencing Churn",
    color="Importance",
    color_continuous_scale="Blues",
).update_layout(yaxis={'categoryorder':'total ascending'})
fig_feat.show()

# Churn distribution as a pie chart over the values of the Churn column
fig_churn = px.pie(
    df,
    names="Churn",
    title="Customer Churn Distribution",
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig_churn.show()

In [None]:
# Correlation heatmap over all columns of df, including the target.
# The categorical columns hold the integer codes assigned by the label encoders above.
corr = df.corr()
fig_corr = px.imshow(corr, text_auto=".1f", color_continuous_scale="RdBu_r", title="Feature Correlation Heatmap")
fig_corr.show()

In [None]:
# Prediction summary: test rows with actual and predicted labels (tuned Random Forest)
# plus a Correct / Incorrect flag; assign returns a new frame, x_test is not modified
sample_results = x_test.assign(
    Actual=y_test.values,
    Predicted=best_rfc.predict(x_test),
)
is_correct = sample_results["Actual"] == sample_results["Predicted"]
sample_results["Result"] = np.where(is_correct, "Correct", "Incorrect")

# bar per outcome, with the counts written on the bars
fig_pred = px.histogram(
    sample_results, x="Result", color="Result",
    title="Prediction Results Summary", text_auto=True,
)
fig_pred.show()