<a href="https://colab.research.google.com/github/diya1094/Customer_Churn_Prediction/blob/main/Customer_Churn_Prediciton.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime so files under it can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the churn workbook from the mounted Drive into the working DataFrame `df`
import pandas as pd

file_path = '/content/drive/MyDrive/Celebal/WA_Fn-UseC_-Telco-Customer-Churn.csv.xlsx'
df = pd.read_excel(io=file_path)

In [None]:
# Plot styling applied to any figure drawn after this cell
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Title banner: rule, heading, rule
for banner_line in (
    "=" * 55,
    "TELCOMMUNICATION CUSTOMER CHURN PREDICTION ANALYSIS",
    "=" * 55,
):
    print(banner_line)

# Quick structural overview of the loaded frame: size, column names, null counts
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nMissing values:\n{df.isnull().sum()}")

TELCOMMUNICATION CUSTOMER CHURN PREDICTION ANALYSIS
Dataset shape: (7043, 21)
Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [None]:
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

# Convert binary categorical variables (Yes/No -> 1/0).
# A column that is already numeric was encoded by an earlier run of this cell;
# mapping it again would turn every value into NaN, so it is skipped. This keeps
# the cell safe to re-run.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
for col in binary_cols:
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = df[col].map({'Yes': 1, 'No': 0})

# Encode the target the same way, normalising whitespace and case first.
# Guarded for the same reason: astype(str) on an encoded column yields '1'/'0',
# which the yes/no mapping would convert to NaN.
if not pd.api.types.is_numeric_dtype(df['Churn']):
    df['Churn'] = df['Churn'].astype(str).str.strip().str.lower()
    print(df['Churn'].unique())  # shows the distinct labels before they are mapped
    df['Churn'] = df['Churn'].map({'yes': 1, 'no': 0})

['no' 'yes']


In [None]:
# Mean of the Churn column, reported as a fraction and as a percentage
churn_rate = df['Churn'].mean()
print(f"Overall Churn Rate: {churn_rate:.3f} ({churn_rate*100:.1f}%)")

# Per-Churn-value averages of the numeric account fields
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
churn_group_means = df.groupby('Churn')[numeric_cols].mean()
print(f"\nKey Statistics by Churn Status:")
print(churn_group_means.round(2))

Overall Churn Rate: 0.265 (26.5%)

Key Statistics by Churn Status:
       tenure  MonthlyCharges  TotalCharges
Churn                                      
0       37.57           61.27       2555.34
1       17.98           74.44       1531.80


In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so `df` keeps its un-encoded category values
df_model = df.copy()

# Encode categorical variables: each listed column is integer-encoded, with the
# encoder re-fitted per column (the per-column mappings are not retained).
le = LabelEncoder()
categorical_cols = ['gender','MultipleLines', 'InternetService', 'OnlineSecurity',
                   'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                   'StreamingMovies', 'Contract', 'PaymentMethod']

for col in categorical_cols:
    df_model[col] = le.fit_transform(df_model[col])

# Create new features
# tenure + 1 avoids division by zero for customers with tenure == 0
df_model['AvgChargePerMonth'] = df_model['TotalCharges'] / (df_model['tenure'] + 1)
df_model['IsNewCustomer'] = (df_model['tenure'] <= 12).astype(int)
# Create more powerful features
# include_lowest=True puts tenure == 0 into the first bucket; with the default
# (left-open first interval) those rows would receive NaN instead of group 0.
df_model['TenureGroup'] = pd.cut(
    df_model['tenure'],
    bins=[0, 12, 24, 48, 72],
    labels=[0, 1, 2, 3],
    include_lowest=True,
)
# TotalCharges + 1 avoids division by zero
df_model['ChargesRatio'] = df_model['MonthlyCharges'] / (df_model['TotalCharges'] + 1)
df_model['HighCharges'] = (df_model['MonthlyCharges'] > df_model['MonthlyCharges'].quantile(0.75)).astype(int)
df_model['NewCustomer'] = (df_model['tenure'] <= 6).astype(int)

# Number of add-on services each customer has.
# The count is taken from the un-encoded columns in `df`: comparing those string
# columns with the integer 1 never matches, which left this feature at 0 for
# every row. Assumes a subscribed service is recorded as "Yes" (any case or
# surrounding whitespace) - confirm against the raw data.
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies']
df_model['ServiceCount'] = sum([
    df[col].astype(str).str.strip().str.lower().eq('yes').astype(int)
    for col in service_cols if col in df_model.columns
])

print(f"Feature engineering completed! Dataset shape: {df_model.shape}")

Feature engineering completed! Dataset shape: (7043, 28)


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Remove the customer identifier before building the feature matrix
df_model_cleaned = df_model.drop(columns=['customerID'])

# Dummy-encode any remaining non-numeric columns, dropping the first level of each
df_encoded = pd.get_dummies(df_model_cleaned, drop_first=True)

# Target vector and feature matrix
y = df_encoded['Churn']
X = df_encoded.drop(columns='Churn')

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Imputation, step 1: numeric medians taken from the training split only
X_train = X_train.fillna(X_train.median(numeric_only=True))
X_test = X_test.fillna(X_train.median(numeric_only=True))

# Imputation, step 2: anything still missing becomes 0
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

# Standardise the continuous columns; the scaler is fitted on the training split
numeric_features = ['tenure', 'MonthlyCharges', 'TotalCharges', 'AvgChargePerMonth']
scaler = StandardScaler()

X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
X_train_scaled[numeric_features] = scaler.fit_transform(X_train[numeric_features])
X_test_scaled[numeric_features] = scaler.transform(X_test[numeric_features])

# Candidate classifiers, all seeded for repeatability
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit each candidate and score it on the hold-out split
results = {}
print("Training models...")

for name, model in models.items():
    print(f"\nTraining {name}...")

    # Logistic Regression gets the scaled matrices; the other models get the unscaled ones
    if name == 'Logistic Regression':
        fit_features, eval_features = X_train_scaled, X_test_scaled
    else:
        fit_features, eval_features = X_train, X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)
    y_pred_proba = model.predict_proba(eval_features)[:, 1]

    # Hold-out metrics; AUC uses the positive-class probability
    accuracy = accuracy_score(y_test, y_pred)
    auc_roc = roc_auc_score(y_test, y_pred_proba)

    results[name] = {
        'Accuracy': accuracy,
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred),
        'AUC-ROC': auc_roc,
        'Model': model,
    }

    print(f"Accuracy: {accuracy:.4f}, AUC-ROC: {auc_roc:.4f}")

Training set: (5634, 28), Test set: (1409, 28)
Training models...

Training Logistic Regression...
Accuracy: 0.7935, AUC-ROC: 0.8420

Training Random Forest...
Accuracy: 0.7800, AUC-ROC: 0.8124

Training Gradient Boosting...
Accuracy: 0.8006, AUC-ROC: 0.8436


In [None]:
# Comparison of models
# Building the frame from `results` puts the fitted estimators alongside the
# metrics, so after the transpose every column is object dtype. Cast the metric
# columns to float once 'Model' is dropped so that round() actually rounds and
# idxmax() compares numbers.
comparison_df = pd.DataFrame(results).T.drop('Model', axis=1).astype(float)
print(comparison_df.round(4))

# Find best model based on AUC-ROC
best_model_name = comparison_df['AUC-ROC'].idxmax()
best_model = results[best_model_name]['Model']
print(f"\nBest Model: {best_model_name}")
print(f"Best AUC-ROC Score: {comparison_df.loc[best_model_name, 'AUC-ROC']:.4f}")

                     Accuracy Precision    Recall  F1-Score   AUC-ROC
Logistic Regression  0.793471  0.638796  0.510695  0.567608  0.842024
Random Forest        0.779986  0.606667  0.486631  0.540059  0.812355
Gradient Boosting    0.800568   0.66787  0.494652  0.568356   0.84356

Best Model: Gradient Boosting
Best AUC-ROC Score: 0.8436


In [None]:
# Analysis of best model

# Logistic Regression was fitted on the scaled matrix, so it must be scored on
# the scaled test set; every other model uses the unscaled one.
X_eval_best = X_test_scaled if best_model_name == 'Logistic Regression' else X_test
y_pred_best = best_model.predict(X_eval_best)
y_pred_proba_best = best_model.predict_proba(X_eval_best)[:, 1]

# Confusion matrix and per-class report on the hold-out split
cm = confusion_matrix(y_test, y_pred_best)
print("Confusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_pred_best))

# Importance ranking, only for estimators that expose feature_importances_
if hasattr(best_model, 'feature_importances_'):
    importance_table = pd.DataFrame({
        'Feature': X.columns,
        'Importance': best_model.feature_importances_,
    })
    feature_importance = importance_table.sort_values('Importance', ascending=False)

    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

Confusion Matrix:
[[943  92]
 [189 185]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.67      0.49      0.57       374

    accuracy                           0.80      1409
   macro avg       0.75      0.70      0.72      1409
weighted avg       0.79      0.80      0.79      1409


Top 10 Most Important Features:
              Feature  Importance
14           Contract    0.401488
21       ChargesRatio    0.185226
17     MonthlyCharges    0.121584
8      OnlineSecurity    0.082215
11        TechSupport    0.053549
18       TotalCharges    0.035039
19  AvgChargePerMonth    0.028116
16      PaymentMethod    0.023269
4              tenure    0.018711
9        OnlineBackup    0.010922


In [None]:
# Hyperparameter tuning: search grids keyed by model name
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200],
        'max_depth': [10, 15, None],
        'min_samples_split': [2, 5],
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.15],
        'max_depth': [3, 5],
    },
    'Logistic Regression': {
        'C': [0.1, 1, 10],
        'penalty': ['l1', 'l2'],
        'solver': ['liblinear'],
    },
}

if best_model_name not in param_grids:
    # No grid defined for this model: keep the already-fitted estimator
    best_model_tuned = best_model
else:
    print(f"Tuning hyperparameters for {best_model_name}...")

    # Logistic Regression is tuned on the scaled training matrix, others on the unscaled one
    X_tune = X_train_scaled if best_model_name == 'Logistic Regression' else X_train
    y_tune = y_train

    # 5-fold grid search scored by ROC AUC, using all available cores
    grid_search = GridSearchCV(
        estimator=models[best_model_name],
        param_grid=param_grids[best_model_name],
        scoring='roc_auc',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_tune, y_tune)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

    best_model_tuned = grid_search.best_estimator_

Tuning hyperparameters for Gradient Boosting...
Best parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}
Best cross-validation score: 0.8449


In [None]:
# Score the tuned model on the hold-out split.
# Logistic Regression needs the scaled test matrix; other models use the unscaled one.
X_eval_final = X_test_scaled if best_model_name == 'Logistic Regression' else X_test
y_pred_final = best_model_tuned.predict(X_eval_final)
y_pred_proba_final = best_model_tuned.predict_proba(X_eval_final)[:, 1]

# Final hold-out metrics; AUC is computed from the positive-class probability
final_auc = roc_auc_score(y_test, y_pred_proba_final)
final_accuracy = accuracy_score(y_test, y_pred_final)
final_precision = precision_score(y_test, y_pred_final)
final_recall = recall_score(y_test, y_pred_final)
final_f1 = f1_score(y_test, y_pred_final)

print("FINAL MODEL PERFORMANCE:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1-Score: {final_f1:.4f}")
print(f"AUC-ROC: {final_auc:.4f}")

FINAL MODEL PERFORMANCE:
Accuracy: 0.8020
Precision: 0.6792
Recall: 0.4813
F1-Score: 0.5634
AUC-ROC: 0.8461


In [None]:
print("\n" + "="*60)
print("ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*60)

# Results summary.
# The final_* metrics are produced by the preceding final-prediction cell; this
# cell only reports them. The earlier copy-pasted recomputation (including a
# branch for an 'SVM' model that is never trained) yielded the same values and
# has been removed. Values are cast to plain floats so the printed dict shows
# numbers rather than np.float64(...) wrappers.
results_summary = {
    'best_model': best_model_name,
    'final_accuracy': float(final_accuracy),
    'final_precision': float(final_precision),
    'final_recall': float(final_recall),
    'final_f1': float(final_f1),
    'final_auc': float(final_auc),
    'churn_rate': float(churn_rate)
}

print(f"\nRESULTS SUMMARY: {results_summary}")

print("\nFINAL MODEL EVALUATION")
print("-" * 30)

print("FINAL MODEL PERFORMANCE:")
print(f"Accuracy: {final_accuracy:.4f}")
print(f"Precision: {final_precision:.4f}")
print(f"Recall: {final_recall:.4f}")
print(f"F1-Score: {final_f1:.4f}")
print(f"AUC-ROC: {final_auc:.4f}")


ANALYSIS COMPLETED SUCCESSFULLY!

RESULTS SUMMARY: {'best_model': 'Gradient Boosting', 'final_accuracy': 0.8019872249822569, 'final_precision': 0.6792452830188679, 'final_recall': 0.48128342245989303, 'final_f1': 0.5633802816901409, 'final_auc': np.float64(0.8460564726549381), 'churn_rate': np.float64(0.2653698707936959)}

FINAL MODEL EVALUATION
------------------------------
FINAL MODEL PERFORMANCE:
Accuracy: 0.8020
Precision: 0.6792
Recall: 0.4813
F1-Score: 0.5634
AUC-ROC: 0.8461


In [None]:
print("KEY FINDINGS:")
print(f"1. Customer churn rate: {churn_rate*100:.1f}%")

# Rank features using the tuned model itself. The earlier `feature_importance`
# table was built from the untuned estimator (and only exists if that cell ran
# with a tree-based model), so it may not describe the model reported here.
if hasattr(best_model_tuned, 'feature_importances_'):
    tuned_importance = pd.Series(
        best_model_tuned.feature_importances_,
        index=X_train.columns,  # models exposing importances are fitted on X_train
    ).sort_values(ascending=False)
    top_features = tuned_importance.head(5).index.tolist()
    print("2. Top factors influencing churn:")
    for i, feature in enumerate(top_features, 1):
        print(f"   {i}. {feature}")

print("\nBUSINESS RECOMMENDATIONS:")
print("1. Focus retention efforts on month-to-month contract customers")
print("2. Offer incentives to customers with high monthly charges")
print("3. Improve customer experience in first 12 months")
print("4. Promote longer-term contracts with discounts")
print("5. Target customers with fiber optic internet for retention campaigns")

KEY FINDINGS:
1. Customer churn rate: 26.5%
2. Top factors influencing churn:
   1. Contract
   2. ChargesRatio
   3. MonthlyCharges
   4. OnlineSecurity
   5. TechSupport

BUSINESS RECOMMENDATIONS:
1. Focus retention efforts on month-to-month contract customers
2. Offer incentives to customers with high monthly charges
3. Improve customer experience in first 12 months
4. Promote longer-term contracts with discounts
5. Target customers with fiber optic internet for retention campaigns
