# Churn Prediction Model - IMPROVED VERSION
**Goal**: Achieve ≥80% recall on the churn class

## Improvements:
- SMOTE oversampling for class imbalance
- Class weights in XGBoost
- Feature engineering (interactions, binning)
- Extended hyperparameter tuning
- Threshold optimization


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, 
    recall_score, precision_score, roc_curve, auc, accuracy_score, f1_score, roc_auc_score
)
from imblearn.over_sampling import SMOTE
import joblib
import shap

import sys
sys.path.append('..')


## 1. Load and Clean Data


In [None]:
# Load the Telco customer-churn CSV from the sibling data directory.
df = pd.read_csv('../data/WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Coerce TotalCharges to numeric (unparseable entries become NaN), then
# discard the rows left without a usable value.
df = df.assign(
    TotalCharges=pd.to_numeric(df['TotalCharges'], errors='coerce')
).dropna(subset=['TotalCharges'])

# Sanity check: remaining size and the share of rows whose Churn is 'Yes'.
print(f"Dataset shape: {df.shape}")
print(f"Churn rate: {(df['Churn'] == 'Yes').mean():.2%}")


## 2. Feature Engineering (NEW!)


In [None]:
# Build the engineered features on a copy so the cleaned frame `df` is left
# untouched.
df_fe = df.copy()

# 1. Tenure bins
# pd.cut intervals are right-closed, so without include_lowest a tenure of
# exactly 0 would fall outside the first bin and become NaN. The top edge is
# open-ended to match the '48+' label instead of silently capping at 100.
df_fe['tenure_group'] = pd.cut(
    df_fe['tenure'],
    bins=[0, 12, 24, 48, float('inf')],
    labels=['0-12', '12-24', '24-48', '48+'],
    include_lowest=True,
)

# 2. Monthly charge bins
# Same edge handling as above: a charge of 0 lands in 'Low' and anything
# above 70 lands in 'High' rather than turning into NaN past 150.
df_fe['charge_group'] = pd.cut(
    df_fe['MonthlyCharges'],
    bins=[0, 35, 70, float('inf')],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

# 3. Total services count
# Only an exact 'Yes' counts as subscribed; any other value in these columns
# contributes 0. Vectorized comparison replaces the row-wise apply.
service_cols = ['OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                'TechSupport', 'StreamingTV', 'StreamingMovies']
df_fe['num_services'] = df_fe[service_cols].eq('Yes').sum(axis=1)

# 4. Has phone + internet
# 1 when PhoneService is 'Yes' and InternetService is anything but 'No'.
df_fe['phone_and_internet'] = ((df_fe['PhoneService'] == 'Yes') &
                               (df_fe['InternetService'] != 'No')).astype(int)

# 5. Charge per service
# The +1 keeps the denominator non-zero for rows with no add-on services.
df_fe['charge_per_service'] = df_fe['MonthlyCharges'] / (df_fe['num_services'] + 1)

# 6. Senior with dependents
df_fe['senior_with_dependents'] = ((df_fe['SeniorCitizen'] == 1) &
                                   (df_fe['Dependents'] == 'Yes')).astype(int)

print("New features created!")
# A real newline escape: the doubled backslash printed a literal "\n".
print("\nNum services distribution:")
print(df_fe['num_services'].value_counts().sort_index())
