# 📘 Customer Churn Prediction Model - Task 2
This notebook includes model training, evaluation, and business recommendation steps based on classification models.

## 1. Load and Prepare Data

In [3]:
!pip install xgboost

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Excel file
file_path = "Customer_Churn_Data_Large.xlsx"
excel_file = pd.ExcelFile(file_path)

# Read sheets
demographics_df = excel_file.parse('Customer_Demographics')
transactions_df = excel_file.parse('Transaction_History')
service_df = excel_file.parse('Customer_Service')
activity_df = excel_file.parse('Online_Activity')
churn_df = excel_file.parse('Churn_Status')

# Merge datasets
merged_df = churn_df.merge(demographics_df, on='CustomerID', how='left')
merged_df = merged_df.merge(activity_df, on='CustomerID', how='left')
merged_df = merged_df.merge(service_df, on='CustomerID', how='left')
merged_df = merged_df.merge(transactions_df, on='CustomerID', how='left')

# Clean and preprocess
merged_df['TransactionDate'] = pd.to_datetime(merged_df['TransactionDate'])
merged_df['InteractionDate'] = pd.to_datetime(merged_df['InteractionDate'])
merged_df['LastLoginDate'] = pd.to_datetime(merged_df['LastLoginDate'])

merged_df['InteractionType'] = merged_df['InteractionType'].fillna('No Interaction')
merged_df['ResolutionStatus'] = merged_df['ResolutionStatus'].fillna('None')
merged_df['InteractionID'] = merged_df['InteractionID'].fillna(0)
merged_df['InteractionDate'] = merged_df['InteractionDate'].fillna(pd.Timestamp('1900-01-01'))

# Encode categorical variables
encoded_df = pd.get_dummies(merged_df, columns=[
    'Gender', 'MaritalStatus', 'IncomeLevel', 'ServiceUsage',
    'InteractionType', 'ResolutionStatus', 'ProductCategory'
], drop_first=True)

# Normalize numeric features
scaler = StandardScaler()
encoded_df[['Age', 'LoginFrequency', 'AmountSpent']] = scaler.fit_transform(
    encoded_df[['Age', 'LoginFrequency', 'AmountSpent']]
)

# Final dataset
cleaned_df = encoded_df.drop(columns=['CustomerID', 'TransactionID', 'TransactionDate', 'InteractionID', 'InteractionDate', 'LastLoginDate'])

X = cleaned_df.drop('ChurnStatus', axis=1)
y = cleaned_df['ChurnStatus']


Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-win_amd64.whl (150.0 MB)
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/150.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/150.0 MB 544.7 kB/s eta 0:04:36
   ---------------------------------------- 0.3/150.0 MB 2.1 MB/s eta 0:01:12
   ---------------------------------------- 0.7/150.0 MB 3.3 MB/s eta 0:00:45
   ---------------------------------------- 1.2/150.0 MB 4.7 MB/s eta 0:00:32
   ---------------------------------------- 1.8/150.0 MB 6.0 MB/s eta 0:00:25
    --------------------------------------- 2.3/150.0 MB 6.5 MB/s eta 0:00:23
    --------------------------------------- 2.7/150.0 MB 7.2 MB/s eta 0:00:21
    --------------------------------------- 3.2/150.0 MB 7.4 MB/s eta 0:00:20
    ---------

## 2. Train and Evaluate Models

In [4]:

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model_results = {}

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probs = model.predict_proba(X_test)[:, 1]

    f1 = f1_score(y_test, preds)
    roc_auc = roc_auc_score(y_test, probs)
    cm = confusion_matrix(y_test, preds)
    report = classification_report(y_test, preds)

    model_results[name] = {
        "F1 Score": f1,
        "ROC-AUC": roc_auc,
        "Confusion Matrix": cm,
        "Report": report
    }

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
evaluate_model("Logistic Regression", log_reg, X_train, y_train, X_test, y_test)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
evaluate_model("Random Forest", rf, X_train, y_train, X_test, y_test)

# XGBoost
xgb = XGBClassifier(n_estimators=100, eval_metric='logloss', scale_pos_weight=(y == 0).sum() / (y == 1).sum(), random_state=42)
evaluate_model("XGBoost", xgb, X_train, y_train, X_test, y_test)

model_results


{'Logistic Regression': {'F1 Score': 0.32213209733487835,
  'ROC-AUC': 0.5873612857007691,
  'Confusion Matrix': array([[639, 453],
         [132, 139]], dtype=int64),
  'Report': '              precision    recall  f1-score   support\n\n           0       0.83      0.59      0.69      1092\n           1       0.23      0.51      0.32       271\n\n    accuracy                           0.57      1363\n   macro avg       0.53      0.55      0.50      1363\nweighted avg       0.71      0.57      0.61      1363\n'},
 'Random Forest': {'F1 Score': 0.8929292929292929,
  'ROC-AUC': 0.9921857048240813,
  'Confusion Matrix': array([[1089,    3],
         [  50,  221]], dtype=int64),
  'Report': '              precision    recall  f1-score   support\n\n           0       0.96      1.00      0.98      1092\n           1       0.99      0.82      0.89       271\n\n    accuracy                           0.96      1363\n   macro avg       0.97      0.91      0.93      1363\nweighted avg       0.96 

## 3. Business Impact & Recommendations

In [5]:

print("""
The churn prediction model enables SmartBank to proactively identify customers at high risk of leaving.

Business teams can:
- Segment customers based on churn probability and prioritize retention strategies.
- Design targeted campaigns (e.g., offers or loyalty programs) for high-risk customers.
- Monitor changes in churn drivers over time by tracking feature importances.

Suggested improvements:
- Implement SHAP to interpret complex models like XGBoost.
- Continuously retrain the model with updated data.
- Integrate the model with CRM systems for real-time customer scoring.
""")



The churn prediction model enables SmartBank to proactively identify customers at high risk of leaving.

Business teams can:
- Segment customers based on churn probability and prioritize retention strategies.
- Design targeted campaigns (e.g., offers or loyalty programs) for high-risk customers.
- Monitor changes in churn drivers over time by tracking feature importances.

Suggested improvements:
- Implement SHAP to interpret complex models like XGBoost.
- Continuously retrain the model with updated data.
- Integrate the model with CRM systems for real-time customer scoring.

