In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve
)

import matplotlib.pyplot as plt


In [2]:
df = pd.read_csv('../data/processed/customer_features.csv')

print("Shape:", df.shape)
print("Churn rate:", df['Churn'].mean())
df.head()


Shape: (3230, 29)
Churn rate: 0.42538699690402476


Unnamed: 0,CustomerID,Churn,Recency,Frequency,TotalSpent,AvgOrderValue,UniqueProducts,TotalItems,AvgDaysBetweenPurchases,AvgBasketSize,...,Purchases_Last90Days,ProductDiversityScore,AvgPricePreference,StdPricePreference,MinPrice,MaxPrice,RecencyScore,FrequencyScore,MonetaryScore,RFM_Score
0,16384,0,135,1,302.75,15.934211,19,189,0.0,189.0,...,0.0,1.0,2.2,1.629161,0.65,6.75,2,1,2,5
1,16385,0,176,1,100.31,4.559545,22,63,0.0,63.0,...,0.0,1.0,1.608636,1.224151,0.29,5.95,1,1,1,3
2,16387,0,183,1,130.24,21.706667,6,64,0.0,64.0,...,0.0,1.0,3.081667,1.910816,1.25,5.95,1,1,1,3
3,16388,1,198,1,186.05,14.311538,13,83,0.0,83.0,...,0.0,1.0,2.869231,1.676745,0.85,5.95,1,1,1,3
4,16390,1,81,3,328.9,19.347059,13,110,8.8125,36.666667,...,2.0,0.764706,4.314706,2.085348,1.25,6.75,2,3,2,7


In [3]:
X = df.drop(columns=['CustomerID', 'Churn'])
y = df['Churn']

print(X.shape, y.shape)


(3230, 27) (3230,)


In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (2584, 27)
Test size: (646, 27)


In [6]:
X.dtypes


Recency                      int64
Frequency                    int64
TotalSpent                 float64
AvgOrderValue              float64
UniqueProducts               int64
TotalItems                   int64
AvgDaysBetweenPurchases    float64
AvgBasketSize              float64
StdBasketSize              float64
MaxBasketSize                int64
PreferredDay                 int64
PreferredHour                int64
CountryDiversity             int64
CustomerLifetimeDays         int64
PurchaseVelocity           float64
Purchases_Last30Days       float64
Purchases_Last60Days       float64
Purchases_Last90Days       float64
ProductDiversityScore      float64
AvgPricePreference         float64
StdPricePreference         float64
MinPrice                   float64
MaxPrice                   float64
RecencyScore                 int64
FrequencyScore               int64
MonetaryScore                int64
RFM_Score                    int64
dtype: object

In [7]:
# Keep only numeric columns
X_numeric = X.select_dtypes(include=['int64', 'float64'])

print("Numeric features:", X_numeric.shape)


Numeric features: (3230, 27)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)


In [11]:
print("NaNs in X_train:", X_train.isnull().sum().sum())
print("NaNs in X_test:", X_test.isnull().sum().sum())

print("NaNs in y_train:", y_train.isnull().sum())
print("NaNs in y_test:", y_test.isnull().sum())


NaNs in X_train: 53
NaNs in X_test: 13
NaNs in y_train: 0
NaNs in y_test: 0


In [12]:
# Fill missing values with median
X_train = X_train.fillna(X_train.median())
X_test = X_test.fillna(X_train.median())


In [13]:
y_train = y_train.astype(int)
y_test = y_test.astype(int)


In [14]:
pipe_lr = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(max_iter=1000))
])

pipe_lr.fit(X_train, y_train)

y_pred = pipe_lr.predict(X_test)
y_prob = pipe_lr.predict_proba(X_test)[:, 1]

print("Logistic Regression")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))


Logistic Regression
Accuracy: 0.6547987616099071
Precision: 0.5921985815602837
Recall: 0.6072727272727273
F1: 0.599640933572711
ROC-AUC: 0.7181867189414359


In [15]:
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier(
    max_depth=6,
    min_samples_leaf=50,
    random_state=42
)

dt.fit(X_train, y_train)

y_pred_dt = dt.predict(X_test)
y_prob_dt = dt.predict_proba(X_test)[:, 1]

print("Decision Tree")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1:", f1_score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_dt))


Decision Tree
Accuracy: 0.6191950464396285
Precision: 0.5543071161048689
Recall: 0.5381818181818182
F1: 0.5461254612546126
ROC-AUC: 0.6897868169566284


In [16]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=30,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]

print("Random Forest")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_rf))


Random Forest
Accuracy: 0.6501547987616099
Precision: 0.5953307392996109
Recall: 0.5563636363636364
F1: 0.575187969924812
ROC-AUC: 0.713364371477579


In [17]:
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

gb.fit(X_train, y_train)

y_pred_gb = gb.predict(X_test)
y_prob_gb = gb.predict_proba(X_test)[:, 1]

print("Gradient Boosting")
print("Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Precision:", precision_score(y_test, y_pred_gb))
print("Recall:", recall_score(y_test, y_pred_gb))
print("F1:", f1_score(y_test, y_pred_gb))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_gb))


Gradient Boosting
Accuracy: 0.6253869969040248
Precision: 0.5642023346303502
Recall: 0.5272727272727272
F1: 0.5451127819548872
ROC-AUC: 0.7086302376868415


In [18]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=300,
    random_state=42
)

mlp.fit(X_train, y_train)

y_pred_mlp = mlp.predict(X_test)
y_prob_mlp = mlp.predict_proba(X_test)[:, 1]

print("Neural Network (MLP)")
print("Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("Precision:", precision_score(y_test, y_pred_mlp))
print("Recall:", recall_score(y_test, y_pred_mlp))
print("F1:", f1_score(y_test, y_pred_mlp))
print("ROC-AUC:", roc_auc_score(y_test, y_prob_mlp))


Neural Network (MLP)
Accuracy: 0.6501547987616099
Precision: 0.5890909090909091
Recall: 0.5890909090909091
F1: 0.5890909090909091
ROC-AUC: 0.7034942416074492


In [19]:
model_comparison = pd.DataFrame({
    'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'Gradient Boosting', 'Neural Network'],
    'ROC_AUC': [
        roc_auc_score(y_test, y_prob),
        roc_auc_score(y_test, y_prob_dt),
        roc_auc_score(y_test, y_prob_rf),
        roc_auc_score(y_test, y_prob_gb),
        roc_auc_score(y_test, y_prob_mlp)
    ]
})

model_comparison.sort_values(by='ROC_AUC', ascending=False)


Unnamed: 0,Model,ROC_AUC
0,Logistic Regression,0.718187
2,Random Forest,0.713364
3,Gradient Boosting,0.70863
4,Neural Network,0.703494
1,Decision Tree,0.689787
