# Telco Customer Churn Prediction

## Task 1: Exploratory Data Analysis (EDA)
This notebook covers the step-by-step process of analyzing the Telco Customer Churn dataset, preprocessing the data, building classification models, and evaluating them.

### 1.1 Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, ParameterGrid
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Set visualization style
sns.set_style('whitegrid')
%matplotlib inline

# Reproducibility for NN training
tf.keras.utils.set_random_seed(42)

import warnings
warnings.filterwarnings('ignore')


### 1.2 Load Dataset

In [None]:
# Read the raw Telco churn CSV from the working directory, report its
# (rows, columns) shape and preview the first five records.
df = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-Telco-Customer-Churn.csv')
print(f"Dataset Shape: {df.shape}")
df.head(5)

### 1.3 Data Cleaning & Inspection

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# TotalCharges arrives as an object column; parse it as numbers and let any
# value that cannot be parsed become NaN instead of raising.
total_charges_numeric = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges_numeric

# Report how many nulls each column holds after the conversion.
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)

In [None]:
# Drop only the rows whose TotalCharges could not be parsed. Restricting the
# check to that column (subset=...) matches the stated intent and prevents a
# null in some unrelated column from silently removing extra customers.
# 'customerID' is an identifier, not a feature; errors='ignore' makes the drop
# a no-op when the column is already gone, so the cell can be re-run safely.
# The result is rebound instead of mutated in place to keep the cell idempotent.
df = (
    df
    .dropna(subset=['TotalCharges'])
    .drop(columns=['customerID'], errors='ignore')
)

print("New Shape after cleaning:", df.shape)

In [None]:
# Persist the cleaned frame (row index omitted) so it can be reused elsewhere.
cleaned_csv_path = "cleaned_telco_churn.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved as 'cleaned_telco_churn.csv'")


### 1.4 Visualization

In [None]:
# Class balance of the target: one bar per Churn value.
churn_fig, churn_ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='Churn', palette='viridis', ax=churn_ax)
churn_ax.set_title('Distribution of Churn')
plt.show()

In [None]:
# Histogram with KDE overlay for each numeric feature, drawn side by side.
# Each entry: (column, bar colour, panel title).
hist_specs = [
    ('tenure', 'skyblue', 'Tenure Distribution'),
    ('MonthlyCharges', 'salmon', 'Monthly Charges Distribution'),
    ('TotalCharges', 'green', 'Total Charges Distribution'),
]

fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
for panel, (column, colour, heading) in zip(axes, hist_specs):
    sns.histplot(df[column], kde=True, ax=panel, color=colour)
    panel.set_title(heading)

plt.tight_layout()
plt.show()

In [None]:
# Lower-case every column name; all cells below refer to columns in lower case.
df.columns = df.columns.str.lower()
print(df.columns)

In [None]:
# Customer counts per contract type, split by churn outcome.
contract_fig, contract_ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='contract', hue='churn', palette='pastel', ax=contract_ax)
contract_ax.set_title('Churn Rate by Contract Type')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns, with the target encoded
# as 1 ('Yes') / 0 (anything else) so it takes part in the matrix.
churn_flag = (df['churn'] == 'Yes').astype('int64')
df_corr = df.assign(churn=churn_flag)  # new frame; df itself is left untouched
numeric_df = df_corr.select_dtypes(include='number')

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=corr_ax)
corr_ax.set_title('Correlation Matrix')
plt.show()

## Task 2: Model Implementation

### 2.1 Data Preprocessing
- Encoding Categorical Variables
- Feature Scaling
- Train-Test Split

In [None]:
# 1. Separate the features from the binary target (1 = 'Yes', 0 = otherwise).
X = df.drop(columns='churn')
y = (df['churn'] == 'Yes').astype(int)

# 2. One-hot encode the categorical columns; drop_first removes the redundant
#    dummy to avoid perfect multicollinearity. The whole frame is then cast to
#    float so it is dtype-homogeneous: dummies may come back as booleans, and a
#    mixed bool/float frame converts to an object array, which Keras rejects.
X = pd.get_dummies(X, drop_first=True).astype(float)

# 3. Train-test split, stratified on the target. This happens BEFORE scaling
#    so that no statistic of the test rows leaks into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Work on explicit copies so the column assignments below never write into X.
X_train, X_test = X_train.copy(), X_test.copy()

# 4. Scale the continuous features. The scaler learns mean/std from the
#    training rows only and the same transform is then applied to the test rows.
num_cols = ['tenure', 'monthlycharges', 'totalcharges']
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

print(f"Training Shape: {X_train.shape}")
print(f"Testing Shape: {X_test.shape}")
X_train.head()

In [None]:
# Balance the classes of the training split with SMOTE.
# Only X_train / y_train are resampled; the test split is not touched here.
sm = SMOTE(random_state=42)
resampled_features, resampled_target = sm.fit_resample(X_train, y_train)
X_train_sm, y_train_sm = resampled_features, resampled_target

# Class counts before and after resampling.
print("Before SMOTE:", y_train.value_counts())
print("\nAfter SMOTE:", y_train_sm.value_counts())


### 2.2 Decision Tree Classifier
- Implementation
- Hyperparameter Tuning (GridSearchCV)

In [None]:
# Init Model
dt = DecisionTreeClassifier(random_state=42)

# SMOTE is placed inside the pipeline so that, within each cross-validation
# split, it is fitted on the training folds only. Running the grid search on
# data that was oversampled beforehand puts synthetic points (interpolated from
# rows of the other folds) into every validation fold and inflates the CV score.
dt_pipeline = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('dt', dt),
])

# Hyperparameter grid; the 'dt__' prefix routes each entry to the tree step.
param_grid = {
    'dt__max_depth': [3, 5, 7, 10, None],
    'dt__min_samples_split': [2, 5, 10],
    'dt__criterion': ['gini', 'entropy'],
}

# Grid search on the real (non-resampled) training split.
grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# After the refit, the pipeline's tree step has been trained on the SMOTE-
# balanced full training split; expose it as a plain DecisionTreeClassifier so
# later cells (plot_tree, predict_proba) can keep using best_dt directly.
best_dt = grid_search.best_estimator_.named_steps['dt']
best_dt_params = {
    name.split('__', 1)[1]: value
    for name, value in grid_search.best_params_.items()
}
print("Parameters which are best suited for the Decision Tree:", best_dt_params)

# Predictions on the untouched test split
y_pred_dt = best_dt.predict(X_test)

# Evaluation
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

In [None]:
# Draw the tuned decision tree with readable feature and class labels.
plt.figure(figsize=(25,20))
tree.plot_tree(
    best_dt,
    # Pass a plain list: some scikit-learn releases validate this argument
    # strictly and reject a pandas Index.
    feature_names=list(X_train.columns),
    class_names=["No Churn", "Churn"],
    filled=True,
    rounded=True,
    fontsize=10
)
plt.show()

### 2.3 Neural Network Classifier
- Implementation using TensorFlow/Keras
- Model Architecture: Input -> Dense(ReLU) -> Dropout -> Dense(ReLU) -> Dropout -> Output(Sigmoid)

In [None]:
def build_nn_model(input_dim, hidden_units_1, hidden_units_2, dropout_rate, learning_rate):
    """Build and compile a two-hidden-layer binary classifier.

    Args:
        input_dim: Number of input features per sample.
        hidden_units_1: Width of the first ReLU hidden layer.
        hidden_units_2: Width of the second ReLU hidden layer.
        dropout_rate: Dropout rate applied after each hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, using
        binary cross-entropy loss and tracking accuracy.
    """
    layer_stack = [
        tf.keras.Input(shape=(input_dim,)),
        Dense(hidden_units_1, activation='relu'),
        Dropout(dropout_rate),
        Dense(hidden_units_2, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layer_stack)

    adam = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    network.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Manual grid search for the neural network: every parameter combination is
# scored by stratified 5-fold cross-validation on the real training split.
# Keep the grid compact to make CV practical in notebook runtime.
nn_param_grid = {
    'hidden_units_1': [32, 64],
    'hidden_units_2': [16],
    'dropout_rate': [0.1, 0.2],
    'learning_rate': [0.001, 0.0005],
    'batch_size': [32],
    'epochs': [50]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = []
best_params = None
best_cv_accuracy = -np.inf

for params in ParameterGrid(nn_param_grid):
    fold_accuracies = []

    for train_idx, val_idx in cv.split(X_train, y_train):
        # One model is built per fold and per parameter set; without this
        # reset Keras keeps the state of every earlier model alive and memory
        # use grows for the whole search.
        tf.keras.backend.clear_session()

        X_fold_train, X_fold_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_fold_train, y_fold_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # Apply SMOTE inside each fold to avoid data leakage
        X_fold_train_sm, y_fold_train_sm = SMOTE(random_state=42).fit_resample(X_fold_train, y_fold_train)

        # Hand Keras homogeneous float32 arrays; a frame that mixes boolean
        # dummy columns with float columns would convert to an object array.
        X_fold_train_arr = np.asarray(X_fold_train_sm, dtype='float32')
        y_fold_train_arr = np.asarray(y_fold_train_sm, dtype='float32')
        X_fold_val_arr = np.asarray(X_fold_val, dtype='float32')
        y_fold_val_arr = np.asarray(y_fold_val, dtype='float32')

        fold_model = build_nn_model(
            input_dim=X_train.shape[1],
            hidden_units_1=params['hidden_units_1'],
            hidden_units_2=params['hidden_units_2'],
            dropout_rate=params['dropout_rate'],
            learning_rate=params['learning_rate']
        )

        # NOTE(review): early stopping monitors the same fold that is scored
        # below, so the fold accuracy is slightly optimistic. It is used here
        # only to rank parameter sets against each other.
        early_stop = EarlyStopping(
            monitor='val_loss',
            mode='min',
            patience=8,
            restore_best_weights=True,
            verbose=0
        )

        fold_model.fit(
            X_fold_train_arr,
            y_fold_train_arr,
            validation_data=(X_fold_val_arr, y_fold_val_arr),
            epochs=params['epochs'],
            batch_size=params['batch_size'],
            callbacks=[early_stop],
            verbose=0
        )

        # Threshold the sigmoid output at 0.5 to obtain class labels.
        y_fold_pred = (fold_model.predict(X_fold_val_arr, verbose=0).ravel() > 0.5).astype(int)
        fold_accuracies.append(accuracy_score(y_fold_val, y_fold_pred))

    mean_acc = float(np.mean(fold_accuracies))
    std_acc = float(np.std(fold_accuracies))

    cv_results.append({
        'params': params,
        'mean_cv_accuracy': mean_acc,
        'std_cv_accuracy': std_acc
    })

    print(f"Params: {params} | Mean CV Accuracy: {mean_acc:.4f} (+/- {std_acc:.4f})")

    # Keep the parameter set with the highest mean fold accuracy.
    if mean_acc > best_cv_accuracy:
        best_cv_accuracy = mean_acc
        best_params = params

print("\nBest NN Hyperparameters:", best_params)
print(f"Best CV Accuracy: {best_cv_accuracy:.4f}")

# Train the final model with the best hyperparameters.
#
# A stratified 10% hold-out of REAL training rows drives early stopping, and
# SMOTE is applied to the remaining 90% only. Keras' validation_split takes the
# LAST rows of the arrays it is given; on SMOTE output those are the appended
# synthetic minority samples, so the validation set would be single-class and
# artificial, and val_loss would be a misleading early-stopping signal.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)
X_fit_sm, y_fit_sm = SMOTE(random_state=42).fit_resample(X_fit, y_fit)

final_early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

model = build_nn_model(
    input_dim=X_train.shape[1],
    hidden_units_1=best_params['hidden_units_1'],
    hidden_units_2=best_params['hidden_units_2'],
    dropout_rate=best_params['dropout_rate'],
    learning_rate=best_params['learning_rate']
)

# Homogeneous float32 arrays avoid object-dtype conversion problems when the
# feature frame mixes boolean dummy columns with float columns.
history = model.fit(
    np.asarray(X_fit_sm, dtype='float32'),
    np.asarray(y_fit_sm, dtype='float32'),
    validation_data=(
        np.asarray(X_val, dtype='float32'),
        np.asarray(y_val, dtype='float32'),
    ),
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    callbacks=[final_early_stop],
    verbose=1
)

model.summary()

# Grid-search results, best mean CV accuracy first.
cv_results_df = pd.DataFrame(cv_results).sort_values(by='mean_cv_accuracy', ascending=False).reset_index(drop=True)
cv_results_df.head()


In [None]:
# Training curves of the final network: accuracy on the left, loss on the right.
# Each entry: (train key, validation key, train label, validation label,
#              panel title, y-axis label).
history_panels = [
    ('accuracy', 'val_accuracy', 'Train Accuracy', 'Val Accuracy',
     'Neural Network Accuracy History', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Val Loss',
     'Neural Network Loss History', 'Loss'),
]

plt.figure(figsize=(14, 5))
for position, (train_key, val_key, train_label, val_label, heading, y_label) in enumerate(history_panels, start=1):
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(heading)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()


In [None]:
# Score the final network on the held-out test split.
nn_test_output = model.predict(X_test, verbose=0)
y_prob_nn = nn_test_output.ravel()            # flatten to one probability per row
y_pred_nn = np.where(y_prob_nn > 0.5, 1, 0)   # 0.5 decision threshold

print(f"Best CV Accuracy (NN): {best_cv_accuracy:.4f}")
print(f"Neural Network Test Accuracy: {accuracy_score(y_test, y_pred_nn):.4f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred_nn))

# Write the trained network to disk in the native Keras format.
best_model_path = 'best_nn_model.keras'
model.save(best_model_path)
print(f"Best neural network saved to: {best_model_path}")


In [None]:
# Confusion matrices on the test split, one figure per model.
cm_dt = confusion_matrix(y_test, y_pred_dt)
cm_nn = confusion_matrix(y_test, y_pred_nn)

# Each entry: (matrix, colour map, figure title).
confusion_panels = [
    (cm_dt, 'Blues', "Confusion Matrix - Decision Tree"),
    (cm_nn, 'Greens', "Confusion Matrix - Neural Network"),
]

for matrix, colour_map, heading in confusion_panels:
    plt.figure(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=colour_map)
    plt.title(heading)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()


### 2.4 Model Comparison (ROC-AUC)

In [None]:
# ROC curves and AUC for both models on the test split.

# Decision tree: probability of the positive class (second column).
dt_class_probabilities = best_dt.predict_proba(X_test)
y_prob_dt = dt_class_probabilities[:, 1]
fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)
auc_dt = auc(fpr_dt, tpr_dt)

# Neural network: reuse the probabilities from the evaluation cell when they
# exist, otherwise compute them now.
if 'y_prob_nn' not in globals():
    y_prob_nn = model.predict(X_test, verbose=0).ravel()
fpr_nn, tpr_nn, _ = roc_curve(y_test, y_prob_nn)
auc_nn = auc(fpr_nn, tpr_nn)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr_dt, tpr_dt, label=f'Decision Tree (AUC = {auc_dt:.2f})')
roc_ax.plot(fpr_nn, tpr_nn, label=f'Neural Network (AUC = {auc_nn:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curve Comparison')
roc_ax.legend()
plt.show()


In [None]:
# Gather the test-set metrics of both models for the comparison chart.
# precision_score / recall_score / f1_score are imported in the notebook's
# top import cell together with the other sklearn.metrics helpers.

# Decision Tree Metrics
dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_precision = precision_score(y_test, y_pred_dt)
dt_recall = recall_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_auc = auc_dt

# Neural Network Metrics
nn_accuracy = accuracy_score(y_test, y_pred_nn)
nn_precision = precision_score(y_test, y_pred_nn)
nn_recall = recall_score(y_test, y_pred_nn)
nn_f1 = f1_score(y_test, y_pred_nn)
nn_auc = auc_nn

# Parallel lists for plotting: position i of every list refers to metrics[i].
metrics = ["Accuracy", "Precision", "Recall", "F1-Score", "AUC"]
dt_scores = [dt_accuracy, dt_precision, dt_recall, dt_f1, dt_auc]
nn_scores = [nn_accuracy, nn_precision, nn_recall, nn_f1, nn_auc]

dt_scores, nn_scores


In [None]:
# Grouped bar chart: one pair of bars (Decision Tree, Neural Network) per metric.
x = np.arange(len(metrics))  # one group position per metric
width = 0.35                 # width of a single bar
half_width = width / 2

comparison_fig, comparison_ax = plt.subplots(figsize=(12, 6))
comparison_ax.bar(x - half_width, dt_scores, width, label='Decision Tree')
comparison_ax.bar(x + half_width, nn_scores, width, label='Neural Network')

comparison_ax.set_xticks(x)
comparison_ax.set_xticklabels(metrics)
comparison_ax.set_ylabel("Score")
comparison_ax.set_title("Model Performance Comparison (DT vs NN)")
comparison_ax.set_ylim(0, 1)
comparison_ax.legend()
comparison_ax.grid(axis='y', linestyle='--', alpha=0.6)

plt.show()
