# Customer Churn Prediction Model
This notebook demonstrates a complete churn modeling workflow, including:
- Data loading and preprocessing
- Addressing class imbalance
- Feature importance analysis
- Training and comparing multiple models
- Evaluation and interpretation

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.ensemble import BalancedRandomForestClassifier

# Single seed reused by the train/test split, SMOTE and every estimator below.
RANDOM_STATE = 42

# Render all figures at 110 DPI.
plt.rcParams.update({'figure.dpi': 110})

## Load Data

In [None]:
# Location of the raw dataset; read it into a DataFrame and preview the first rows.
csv_path = '/mnt/data/client_price_data_100.csv'
df = pd.read_csv(csv_path)
df.head()

## Data Cleaning

In [None]:
# Clean-up in one pass:
#   1. drop the 'Unnamed: 0' column if it is present,
#   2. discard rows whose churn label is missing,
#   3. cast the churn label to int.
df = (
    df.drop(columns=['Unnamed: 0'], errors='ignore')
      .dropna(subset=['churn'])
      .astype({'churn': int})
)

print("Churn Distribution:")
print(df['churn'].value_counts())

# Every column except the identifier and the target is used as a model input.
non_feature_cols = ('id', 'churn')
feature_cols = [name for name in df.columns if name not in non_feature_cols]

X = df[feature_cols]
y = df['churn']

## Train-Test Split

In [None]:
# Hold out 20% of the rows for final evaluation; stratify on the label so the
# churn rate is preserved in both partitions.
split_options = dict(test_size=0.2, stratify=y, random_state=RANDOM_STATE)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

In [None]:
def evaluate_model(pipe, X_train, y_train, cv=5):
    """Cross-validate ``pipe`` and return the mean of each test metric.

    Args:
        pipe: Estimator or pipeline passed to ``cross_validate``.
        X_train: Training features.
        y_train: Training labels.
        cv: Cross-validation setting forwarded to ``cross_validate``
            (defaults to 5).

    Returns:
        A dict with the keys ``'roc_auc'``, ``'f1'``, ``'precision'`` and
        ``'recall'``, each mapped to the mean of that metric's test scores
        across the folds.
    """
    metric_names = ('roc_auc', 'f1', 'precision', 'recall')

    cv_output = cross_validate(
        pipe,
        X_train,
        y_train,
        scoring={name: name for name in metric_names},
        cv=cv,
        return_train_score=False,
    )

    # cross_validate reports each scorer's results under a 'test_<name>' key.
    averaged = {}
    for name in metric_names:
        averaged[name] = np.mean(cv_output[f'test_{name}'])
    return averaged

## Train Models and Compare

In [None]:
# Every feature column is routed through the same standard-scaling step.
numeric_features = feature_cols
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Candidate classifiers, keyed by the label that appears in the results table.
models = {
    'LogReg_balanced': LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        random_state=RANDOM_STATE,
    ),
    'RandomForest_balanced': RandomForestClassifier(
        n_estimators=200,
        class_weight='balanced',
        random_state=RANDOM_STATE,
    ),
    'GradientBoosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'BalancedRandomForest': BalancedRandomForestClassifier(
        n_estimators=200,
        random_state=RANDOM_STATE,
    ),
}

# These models get a SMOTE oversampling step between preprocessing and the
# classifier, which requires the imblearn pipeline; the rest use a plain
# scikit-learn pipeline.
smote_models = ('LogReg_balanced', 'GradientBoosting')

results = {}
for name, model in models.items():
    steps = [('pre', preprocessor), ('model', model)]
    pipeline_cls = Pipeline
    if name in smote_models:
        steps.insert(1, ('smote', SMOTE(random_state=RANDOM_STATE)))
        pipeline_cls = ImbPipeline
    pipe = pipeline_cls(steps)
    results[name] = evaluate_model(pipe, X_train, y_train)

# One row per model, rounded to three decimals, best cross-validated F1 first.
results_df = (
    pd.DataFrame(results)
    .T
    .round(3)
    .sort_values(by='f1', ascending=False)
)
results_df

## Fit Best Model on Train Set and Evaluate on Test Set

In [None]:
# results_df is sorted by F1 in descending order, so its first row is the winner.
best_model_name = results_df.index[0]
print(f"Best model: {best_model_name}")

model = models[best_model_name]

# Rebuild the pipeline the same way it was built for cross-validation:
# SMOTE (and the imblearn pipeline) only for the two models that used it there.
uses_smote = best_model_name in ['LogReg_balanced', 'GradientBoosting']
final_steps = [('pre', preprocessor)]
if uses_smote:
    final_steps.append(('smote', SMOTE(random_state=RANDOM_STATE)))
final_steps.append(('model', model))
pipe = ImbPipeline(final_steps) if uses_smote else Pipeline(final_steps)

# Fit on the full training split, then score the held-out test split.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
y_proba = pipe.predict_proba(X_test)[:, 1]  # probability of class 1

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

RocCurveDisplay.from_estimator(pipe, X_test, y_test)

## Feature Importance

In [None]:
# Plot the 15 most influential features of the fitted best model.
#
# The branch is chosen from the attributes the fitted estimator exposes rather
# than from a hard-coded list of model names, so every tree ensemble in
# `models` (including GradientBoosting) gets an importance plot.
fitted_model = pipe.named_steps['model']

if hasattr(fitted_model, 'feature_importances_'):
    # Tree ensembles: impurity-based importances, one value per input column.
    importance_scores = pd.Series(fitted_model.feature_importances_, index=feature_cols)
    plot_title = 'Top Feature Importances'
elif hasattr(fitted_model, 'coef_'):
    # Linear model: rank by coefficient magnitude. The inputs are standardized
    # by the 'pre' step, so magnitudes are on a comparable scale.
    importance_scores = pd.Series(np.abs(fitted_model.coef_[0]), index=feature_cols)
    plot_title = 'Top Logistic Regression Coefficients'
else:
    importance_scores = None

if importance_scores is None:
    print("Feature importance not supported for this model.")
else:
    # sort_values() is ascending, so tail(15) keeps the 15 largest scores and
    # barh draws the largest at the top.
    importance_scores.sort_values().tail(15).plot.barh()
    plt.title(plot_title)
    plt.show()

## Next Steps
- Try additional models like XGBoost or LightGBM
- Tune hyperparameters using GridSearchCV or Optuna
- Add interaction features or domain-specific transformations
- Monitor fairness and check for overfitting on larger samples