# CUSTOMER CHURN PREDICTION MODEL 


In [None]:
pip install xgboost


## 1. Importing Libraries


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import pickle
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE 
%matplotlib inline


## 2. Loading the Dataset

In [None]:
df=pd.read_csv('Telco-Customer-Churn.csv')

In [None]:
df.head()

In [None]:
df.tail()

## 3. EDA

In [None]:
df.shape

In [None]:
df.size

In [None]:
df.dtypes

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.duplicated().sum()

In [None]:
df = df.drop(columns=['customerID'])


In [None]:
df.head()

In [None]:
df['TotalCharges'].dtype

In [None]:
#conversion of totalcharges to numeric
df['TotalCharges']=pd.to_numeric(df['TotalCharges'],errors='coerce')


In [None]:
df.isnull().sum()

In [None]:
# Impute the TotalCharges values that became NaN during numeric coercion
# with the column median. Assign the result back instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment, which emits a FutureWarning in pandas 2.x and leaves `df`
# unmodified once Copy-on-Write is enabled.
df['TotalCharges'] = df['TotalCharges'].fillna(df['TotalCharges'].median())

In [None]:
df['TotalCharges'].dtype

In [None]:
# Columns treated as categorical (label-encoded later), grouped by theme.
# The order is kept stable because later cells iterate over this list.
categorical_features = [
    # customer demographics
    "gender", "SeniorCitizen", "Partner", "Dependents",
    # phone and internet subscriptions
    "PhoneService", "MultipleLines", "InternetService",
    # internet add-on services
    "OnlineSecurity", "OnlineBackup", "DeviceProtection", "TechSupport",
    "StreamingTV", "StreamingMovies",
    # contract and billing
    "Contract", "PaperlessBilling", "PaymentMethod",
]

# Continuous columns used for the distribution / correlation analysis.
numerical_features = ["tenure", "MonthlyCharges", "TotalCharges"]

### Checking class distribution of  target column

In [None]:
print (df['Churn'].value_counts())
#imbalanced data

### Numerical Features - Analysis

In [None]:
df.skew(numeric_only=True)

In [None]:
df.corr(numeric_only=True)

Feature Distribution

In [None]:
df[numerical_features].describe()

In [None]:
df[numerical_features].hist(bins=30,figsize=(10,7))

In [None]:
fig,ax=plt.subplots(1,3,figsize=(14,4))
df[df.Churn=='No'][numerical_features].hist(bins=30,color="blue",alpha=0.5,ax=ax)
df[df.Churn=='Yes'][numerical_features].hist(bins=30,color="red",alpha=0.5,ax=ax)

In [None]:

def plot_histogram(df, column_name):
    """Show a histogram (with KDE curve) of one column, marking mean and median.

    Args:
        df: DataFrame holding the data to plot.
        column_name: Name of the column in ``df`` to plot.

    Side effects:
        Prints a progress message and renders a matplotlib figure.
    """
    print(f"Plotting histogram for column: {column_name}")

    values = df[column_name]

    plt.figure(figsize=(5, 3))
    sns.histplot(values, kde=True)
    plt.title(f"Distribution of {column_name}")

    # (x position, colour, line style, legend label) for each reference line.
    reference_lines = (
        (values.mean(), 'red', '--', 'Mean'),
        (values.median(), 'green', '-', 'Median'),
    )
    for position, colour, dash, caption in reference_lines:
        plt.axvline(position, color=colour, linestyle=dash, label=caption)

    plt.legend()
    plt.show()



In [None]:
plot_histogram(df, "tenure")

In [None]:
plot_histogram(df, "MonthlyCharges")

In [None]:
plot_histogram(df, "TotalCharges")

### Box plot for numerical features

In [None]:

def plot_boxplot(df, column_name):
    """Show a vertical box plot of one column.

    Args:
        df: DataFrame holding the data to plot.
        column_name: Name of the column in ``df`` to plot.

    Side effects:
        Prints a progress message and renders a matplotlib figure.
    """
    # The message previously said "histogram", which mislabelled the output.
    print(f"Plotting box plot for column: {column_name}")

    plt.figure(figsize=(5, 3))
    sns.boxplot(y=df[column_name])
    plt.title(f"Box plot  of {column_name}")

    plt.ylabel(column_name)
    # plt.show must be *called*; the bare attribute reference was a no-op and
    # the figure only appeared thanks to the notebook's inline backend.
    plt.show()


In [None]:
plot_boxplot(df,'tenure')

In [None]:
plot_boxplot(df,'TotalCharges')

In [None]:
plot_boxplot(df,'MonthlyCharges')

### Correlation Heatmap for numerical columns 

In [None]:
plt.figure(figsize=(8,4))
sns.heatmap(df[['tenure','MonthlyCharges','TotalCharges']].corr(),annot=True,cmap='coolwarm',fmt=".2f")
plt.title("Correlation Heatmap")
plt.show()

### Categorical Features - Analysis

In [None]:
df.info()

In [None]:
for col in categorical_features:
    plt.figure(figsize=(5,3))
    sns.countplot(x=df[col])
    plt.title(f"Count plot of {col}")
    plt.show()

# 4. Data Preprocessing

In [None]:
df.head(3)

### Label Encoding for categorical features

In [None]:
unique_values = df['Contract'].unique()

print(unique_values)

In [None]:
# Encode on a copy so the original frame keeps its string categories.
df_encoded = df.copy()

# One fitted LabelEncoder per categorical column, keyed by column name, so the
# same integer codes can be reproduced later.
label_encoders = {}

for col in categorical_features:
    if col == "Churn":
        continue  # the target column is encoded separately
    le = LabelEncoder()
    le.fit(df_encoded[col])
    df_encoded[col] = le.transform(df_encoded[col])
    label_encoders[col] = le




In [None]:
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

In [None]:
df_encoded.head()

In [None]:
# Encode target column separately (if it's categorical)
df_encoded["Churn"] = df["Churn"].map({"Yes": 1, "No": 0})  # Adjust mapping as needed

In [None]:
df_encoded.head()

In [None]:
# Inspect the encoded frame: the former object-type columns should now be
# numeric. (The raw `df` still holds the un-encoded strings, and info() prints
# its report itself and returns None, so wrapping it in print() only added a
# stray "None" line.)
df_encoded.info()


In [None]:
# Convert the target variable from "Yes"/"No" to 1/0.
# NOTE(review): an earlier cell already assigns
# df_encoded["Churn"] = df["Churn"].map({"Yes": 1, "No": 0}), so when the cells
# run in order this column contains no "Yes"/"No" strings and the replace below
# leaves it unchanged.
df_encoded['Churn'] = df_encoded['Churn'].replace({"Yes": 1, "No": 0})

### Splitting the training and testing data

In [None]:
#splitting the features and target
X = df_encoded.drop(columns=["Churn"])
y = df_encoded["Churn"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
print(y_train.shape)

In [None]:
 print(y_train.value_counts())

### Synthetic Minority OverSampling Technique (SMOTE)

In [None]:
#we apply smote after splitting
#smote doesnt work with missing values
# Most columns are integer label codes of nominal categories. Plain SMOTE
# interpolates between neighbours, which would invent fractional "categories"
# (e.g. a Contract of 0.37). SMOTENC interpolates only the continuous columns
# and picks the most frequent category among the neighbours for the
# categorical ones.
from imblearn.over_sampling import SMOTENC

# Positional indices are accepted by every SMOTENC release.
categorical_positions = [X_train.columns.get_loc(col) for col in categorical_features]
smote = SMOTENC(categorical_features=categorical_positions, random_state=42)


In [None]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
df_encoded.head()

# 5. Model Training

### Training with default hyperparameters

In [None]:
models = {
    "Decision Tree": DecisionTreeClassifier(max_depth=5, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42)
}


In [None]:

# imblearn's Pipeline (unlike sklearn's) accepts samplers as steps and applies
# them only while fitting, never when predicting/scoring.
from imblearn.pipeline import Pipeline as ImbPipeline

cv_scores = {}       # model name -> array of per-fold accuracies
trained_models = {}  # model name -> estimator fitted on the resampled training set

for model_name, model in models.items():
    print(f"Training {model_name} with optimized parameters")

    # Train & store the final model on the SMOTE-balanced training data.
    model.fit(X_train_resampled, y_train_resampled)
    trained_models[model_name] = model

    # Cross-validation.
    # Oversampling must happen INSIDE each fold. Cross-validating the already
    # resampled data puts synthetic points (interpolated from training rows)
    # into the validation folds, which inflates the score. Here each fold
    # resamples only its own training part and is scored on untouched rows.
    # cross_val_score clones the pipeline, so `model` and `smote` stay as they are.
    cv_pipeline = ImbPipeline(steps=[("smote", smote), ("model", model)])
    scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring="accuracy")
    cv_scores[model_name] = scores
    print(f"{model_name} cross-validation accuracy: {np.mean(scores):.4f}")
    print("-" * 70)


## MODEL SELECTION

In [None]:
cv_scores

XGBoost gives the highest cross-validation accuracy of the three models with these baseline parameters.

In [None]:
from xgboost import XGBClassifier




xgb = XGBClassifier(
    random_state=42, 
    use_label_encoder=False, 
    eval_metric="logloss",
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1]),  # Balances the classes
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200
)
xgb.fit(X_train, y_train)


In [None]:
# Now predict on test data
y_pred = xgb.predict(X_test)

In [None]:
print(y_test.value_counts())

# 6. Model Evaluation

In [None]:
# Evaluate on the held-out test data.
# NOTE(review): this scores trained_models["XGBoost"] (fitted on the
# SMOTE-resampled data), whereas the following save cell pickles `xgb` (fitted
# on the original split with scale_pos_weight). The metrics printed here
# therefore do not describe the pickled model; confirm which one is intended.
best_model = trained_models["XGBoost"]
y_test_pred = best_model.predict(X_test)

print("Accuracy Score :\n", accuracy_score(y_test, y_test_pred))
# Output label fixed: it previously read "Confusion Maxtrix".
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))
print("Classification Report:\n", classification_report(y_test, y_test_pred))

In [None]:
#save the trained model as a pickle file
import pickle

# Save the trained model as a pickle file
model_data = {"model": xgb, "feature_names": X.columns.tolist()}
with open("customer_churn_model.pkl", "wb") as f:
    pickle.dump(model_data, f)  # Save model_data instead of just xgb

print("Model saved to 'customer_churn_model.pkl'.")


In [None]:
plt.figure(figsize=(5, 3))
sns.heatmap(confusion_matrix(y_test, y_test_pred), annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted Label")
plt.ylabel("Actual Label")
plt.title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train[numerical_features])  # Fit using training data


In [None]:
from xgboost import plot_importance
import matplotlib.pyplot as plt

plot_importance(best_model)
plt.show()


In [None]:
print("Total Features Before Feature Selection:", X_train.shape[1])


###### pip install shap

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# random_state is fixed on both the estimator and the search. Without it the
# 10 sampled candidates (and the row/column subsampling) change on every run,
# so "Best Parameters" would not be reproducible.
# `use_label_encoder` is no longer passed: it is deprecated in XGBoost and
# only triggers an "unused parameter" warning on current releases. The labels
# are already 0/1 integers.
xgb_tuned = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric="logloss", random_state=42),
    param_distributions=param_grid,
    cv=5,
    n_iter=10,
    scoring="accuracy",
    n_jobs=-1,
    random_state=42
)

# NOTE(review): the folds are cut from data that was oversampled beforehand,
# so the validation folds contain synthetic rows and the CV score is
# optimistic. Treat it as a ranking of candidates, not a performance estimate.
xgb_tuned.fit(X_train_resampled, y_train_resampled)
print("Best Parameters:", xgb_tuned.best_params_)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define the parameter grid based on the best values
param_grid = {
    'n_estimators': [200, 300, 400],  # Test around the best value
    'max_depth': [5, 7, 9],  # Check if increasing depth helps
    'learning_rate': [0.05, 0.1, 0.15],  # Test slightly lower/higher values
    'colsample_bytree': [0.7, 0.8, 0.9],  # Feature selection
    'subsample': [0.7, 0.8, 0.9]  # Sample selection
}

# Initialize the XGBClassifier
# scale_pos_weight = (#negatives / #positives) of the original training split,
# i.e. the class imbalance is handled by re-weighting the positive class.
xgb = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    scale_pos_weight=len(y_train[y_train==0]) / len(y_train[y_train==1])  # Handle class imbalance
)

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    verbose=2,
    n_jobs=-1  # Use all CPU cores
)

# Fit the grid search on the ORIGINAL (imbalanced) training split.
# The estimator already compensates for imbalance through scale_pos_weight.
# Fitting it on the SMOTE-balanced data as well would correct for the
# imbalance twice and over-weight the churn class. It would also put synthetic
# rows into the validation folds.
grid_search.fit(X_train, y_train)

# Print the best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score: {:.4f}".format(grid_search.best_score_))

# Use the best model (refit on the whole training split by GridSearchCV)
best_xgb = grid_search.best_estimator_

# Evaluate on the test set
y_pred = best_xgb.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



In [None]:
import shap

# Fit the model
xgb = XGBClassifier(use_label_encoder=False, eval_metric="logloss")
xgb.fit(X_train_resampled, y_train_resampled)

# Use SHAP Explainer
explainer = shap.Explainer(xgb)
shap_values = explainer(X_train_resampled)

# Plot SHAP summary
shap.summary_plot(shap_values, X_train_resampled)


In [None]:
# Rank features by mean absolute SHAP value (averaged over axis 0 of the
# SHAP value array computed in the previous cell).
shap_importance = pd.DataFrame({
    'feature': X_train_resampled.columns,
    'shap_value': np.abs(shap_values.values).mean(axis=0)
})

# Sort features by importance, most influential first
shap_importance = shap_importance.sort_values(by="shap_value", ascending=False)

# Keep the 14 highest-ranked features (the cut-off is the literal passed to head())
top_features = shap_importance.head(14)['feature'].tolist()
print("Top Features Based on SHAP:", top_features)

# Restrict both the resampled training data and the test data to those columns
X_train_shap = X_train_resampled[top_features]
X_test_shap = X_test[top_features]

# Refit the existing `xgb` object on the reduced feature set; after this call
# it is trained on the selected columns only.
xgb.fit(X_train_shap, y_train_resampled)

# Evaluate performance on the test split
y_pred = xgb.predict(X_test_shap)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy after SHAP Feature Selection:", accuracy)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# 1️⃣ Train the Baseline Model (Logistic Regression)
baseline_model = LogisticRegression(max_iter=500, random_state=42)
baseline_model.fit(X_train_resampled, y_train_resampled)  # Use resampled training data

# 2️⃣ Make Predictions
y_pred_baseline = baseline_model.predict(X_test)

# 3️⃣ Evaluate Performance
print("🔹 Baseline Model (Logistic Regression) Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_baseline):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_baseline))
print("Classification Report:\n", classification_report(y_test, y_pred_baseline))
print(f"AUC-ROC Score: {roc_auc_score(y_test, baseline_model.predict_proba(X_test)[:,1]):.4f}")

# 4️⃣ Compare with XGBoost Model
y_pred_xgb = best_model.predict(X_test)  # Best XGBoost model you trained

print("\n🔹 XGBoost Model Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_xgb):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Classification Report:\n", classification_report(y_test, y_pred_xgb))
print(f"AUC-ROC Score: {roc_auc_score(y_test, best_model.predict_proba(X_test)[:,1]):.4f}")

# 5️⃣ Visual Comparison Using ROC Curve
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve

fpr_base, tpr_base, _ = roc_curve(y_test, baseline_model.predict_proba(X_test)[:,1])
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, best_model.predict_proba(X_test)[:,1])

plt.figure(figsize=(6, 5))
plt.plot(fpr_base, tpr_base, label="Logistic Regression (Baseline)", linestyle="--")
plt.plot(fpr_xgb, tpr_xgb, label="XGBoost (Optimized)", linestyle="-")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import numpy as np

# Define Logistic Regression Model
logreg = LogisticRegression(max_iter=1000, random_state=42)

# Perform RFE (Choose how many features you want, e.g., 10)
rfe = RFE(estimator=logreg, n_features_to_select=10, step=1)
rfe.fit(X_train, y_train)

# Get Selected Features
selected_features_rfe = X_train.columns[rfe.support_]
print("Selected Features from RFE:", selected_features_rfe.tolist())


In [None]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler

# Standardize Data (LASSO is sensitive to scale)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

#Save the scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("Scaler successfully saved as 'scaler.pkl'")

# Fit LASSO Model
lasso = Lasso(alpha=0.01)  # Adjust alpha to control feature selection
lasso.fit(X_train_scaled, y_train)

# Get Selected Features
selected_features_lasso = X_train.columns[lasso.coef_ != 0]
print("Selected Features from LASSO:", selected_features_lasso.tolist())


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from xgboost import XGBClassifier

# Filter dataset with selected features (Change to selected_features_lasso if using LASSO)
X_train_selected = X_train[selected_features_lasso]
X_test_selected = X_test[selected_features_lasso]

# Train XGBoost on Selected Features
xgb_model = XGBClassifier(n_estimators=300, max_depth=7, learning_rate=0.1, colsample_bytree=0.8, subsample=0.8, random_state=42)
xgb_model.fit(X_train_selected, y_train)

# Predictions
y_pred = xgb_model.predict(X_test_selected)
y_pred_proba = xgb_model.predict_proba(X_test_selected)[:, 1]

# Performance Metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Display Results
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
print(f"AUC-ROC Score: {roc_auc:.4f}")


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200, 300],  
    'max_depth': [3, 5, 7],  
    'learning_rate': [0.01, 0.1, 0.2],  
    'subsample': [0.8, 1.0],  
    'colsample_bytree': [0.8, 1.0]  
}

# Initialize XGBoost model
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

# GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train_selected, y_train)

# Best parameters
best_params = grid_search.best_params_
print("\n🔹 Best Hyperparameters:", best_params)

# Train the best model
best_xgb = XGBClassifier(**best_params, random_state=42, eval_metric='logloss')
best_xgb.fit(X_train_selected, y_train)

# Predictions
y_pred = best_xgb.predict(X_test_selected)
y_pred_proba = best_xgb.predict_proba(X_test_selected)[:, 1]

# Performance Metrics
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

# Display Results
print("\n🔹 XGBoost Results after Hyperparameter Tuning:")
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)
print(f"AUC-ROC Score: {roc_auc:.4f}")


# MODEL PREDICTION

In [None]:
import pickle

# Names of the continuous features (unchanged from the EDA section).
numerical_features = ["tenure", "MonthlyCharges", "TotalCharges"]

# Load the saved artefact. The save cells store a DICT
# {"model": <estimator>, "feature_names": [...]}, not a bare estimator, so it
# has to be unpacked; calling predict_proba on the dict raised AttributeError.
# NOTE: pickle is only safe for files produced by this notebook; never
# unpickle files from untrusted sources.
with open("customer_churn_model.pkl", "rb") as file:
    loaded_model_data = pickle.load(file)

loaded_model = loaded_model_data["model"]
# Use the column list stored with the model rather than X_train.columns, so
# the input matches whichever model (all features or a selected subset) was
# pickled last.
training_columns = loaded_model_data["feature_names"]

# Create a new data sample with all required features.
# Categorical values must be the integer codes produced by the label encoders.
new_data = pd.DataFrame({
    'tenure': [12],
    'MonthlyCharges': [50],
    'TotalCharges': [600],
    'gender': [1],            # Example encoded value (e.g., Male=1, Female=0)
    'SeniorCitizen': [0],
    'Partner': [1],
    'Dependents': [0],
    'PhoneService': [1],
    'MultipleLines': [0],
    'InternetService': [2],   # Example encoded value
    'OnlineSecurity': [0],
    'OnlineBackup': [1],
    'DeviceProtection': [0],
    'TechSupport': [1],
    'StreamingTV': [0],
    'StreamingMovies': [1],
    'Contract': [1],
    'PaperlessBilling': [1],
    'PaymentMethod': [2]
})

# Select and order the columns exactly as the model saw them during training.
new_data = new_data.reindex(columns=training_columns, fill_value=0)

# No scaling is applied here on purpose. The pickled XGBoost models were
# fitted on the raw, unscaled feature values, and `scaler` was fitted on every
# training column, so transforming just the three numerical columns with it
# would raise a feature-mismatch error anyway.

# Get prediction probabilities using predict_proba
probabilities = loaded_model.predict_proba(new_data)

# The output probabilities are given per class; for binary classification,
# the first column is the probability for class 0 (No) and the second for class 1 (Yes)
print("Prediction Probabilities:", probabilities)
new_prediction = loaded_model.predict(new_data)
print("🔹 Churn Prediction:", "Yes" if new_prediction[0] == 1 else "No")

### MODEL SAVING

In [None]:
import pickle

# Save the best model and feature names
model_data = {
    "model": best_xgb, 
    "feature_names": X_train_selected.columns.tolist()
}

with open("customer_churn_model.pkl", "wb") as f:
    pickle.dump(model_data, f)

print("✅ Best XGBoost Model Saved to 'customer_churn_model.pkl'")


In [None]:
# Load the saved model
with open("customer_churn_model.pkl", "rb") as file:
    loaded_model_data = pickle.load(file)

best_model = loaded_model_data["model"]
feature_names = loaded_model_data["feature_names"]

# Ensure the test data has the same feature order
X_test_selected = X_test_selected[feature_names]

# Make predictions
y_pred = best_model.predict(X_test_selected)
y_pred_proba = best_model.predict_proba(X_test_selected)[:, 1]

# Display results
print("\n🔹 Predictions on Test Set:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
