In [83]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, RandomizedSearchCV

from collections import Counter

import warnings
warnings.filterwarnings('ignore')

In [55]:
# Read the churn dataset from the working directory and show (rows, columns).
df = pd.read_csv(filepath_or_buffer="churn_prediction.csv")
df.shape

(28382, 21)

In [56]:
# Number of rows per value of the target column.
df.loc[:, 'churn'].value_counts()

churn
0    23122
1     5260
Name: count, dtype: int64

## Data Preprocessing

In [57]:
# Count of missing entries in each column.
df.isna().sum()

customer_id                          0
vintage                              0
age                                  0
gender                             525
dependents                        2463
occupation                          80
city                               803
customer_nw_category                 0
branch_code                          0
current_balance                      0
previous_month_end_balance           0
average_monthly_balance_prevQ        0
average_monthly_balance_prevQ2       0
current_month_credit                 0
previous_month_credit                0
current_month_debit                  0
previous_month_debit                 0
current_month_balance                0
previous_month_balance               0
churn                                0
last_transaction                     0
dtype: int64

### `a) Missing Value Treatment`

In [58]:
# Encode gender numerically: Male -> 1, Female -> 0.
dict_gender = {'Male': 1, 'Female': 0}
df['gender'] = df['gender'].replace(dict_gender)

# Per-column fill values for missing entries. Gender gets its own sentinel
# (-1); the remaining values are hard-coded.
# NOTE(review): the original comment says these are the most frequent values
# of each column - confirm against the data.
missing_fill = {
    'gender': -1,
    'dependents': 0,
    'occupation': 'self_employed',
    'city': 1020,
}
for column, fill_value in missing_fill.items():
    df[column] = df[column].fillna(fill_value)

### `b) Dummy variables`

In [59]:
# One-hot encode occupation and append the indicator columns to the frame;
# the original text column is kept here and excluded later when features are built.
occupation_dummies = pd.get_dummies(df['occupation'], prefix='occupation', prefix_sep='_')
df = pd.concat([df, occupation_dummies], axis=1)

In [60]:
# Preview the first five rows after imputation and encoding.
df.head(n=5)

Unnamed: 0,customer_id,vintage,age,gender,dependents,occupation,city,customer_nw_category,branch_code,current_balance,...,previous_month_debit,current_month_balance,previous_month_balance,churn,last_transaction,occupation_company,occupation_retired,occupation_salaried,occupation_self_employed,occupation_student
0,1,2101,66,1.0,0.0,self_employed,187.0,2,755,1458.71,...,0.2,1458.71,1458.71,0,2019-05-21,False,False,False,True,False
1,2,2348,35,1.0,0.0,self_employed,1020.0,2,3214,5390.37,...,100.56,6496.78,8787.61,0,2019-11-01,False,False,False,True,False
2,4,2194,31,1.0,0.0,salaried,146.0,2,41,3913.16,...,259.23,5006.28,5070.14,0,NaT,False,False,True,False,False
3,5,2329,90,-1.0,0.0,self_employed,1020.0,2,582,2291.91,...,2143.33,2291.91,1669.79,1,2019-08-06,False,False,False,True,False
4,6,1579,42,1.0,2.0,self_employed,1494.0,3,388,927.72,...,1538.06,1157.15,1677.16,1,2019-11-03,False,False,False,True,False


# Train and Test Split

In [61]:
# Features: every column except the target, customer_id, the text occupation
# column (its dummy columns were added above) and last_transaction.
x = df.drop(columns=['churn', 'customer_id', 'occupation', 'last_transaction'])
y = df['churn']

# 80/20 split, stratified on the target so both parts keep the same churn ratio.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    stratify=y,
    random_state=100,
)

In [62]:
# Label-vector shapes of the train and test splits, in that order.
tuple(labels.shape for labels in (y_train, y_test))

((22705,), (5677,))

In [63]:
# Share of each class among the training labels.
y_train.value_counts().div(len(y_train))

churn
0    0.814666
1    0.185334
Name: count, dtype: float64

In [64]:
# Share of each class among the test labels.
y_test.value_counts().div(len(y_test))

churn
0    0.814691
1    0.185309
Name: count, dtype: float64

In [65]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
Scaler_X = StandardScaler().fit(X_train)
X_train = Scaler_X.transform(X_train)
X_test = Scaler_X.transform(X_test)

# Handling class imbalance:

## A) Undersampling:

### 1. Random Undersampling (RUS):

In [79]:
from imblearn.under_sampling import RandomUnderSampler

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Random under-sampling with a fixed seed.
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_rus)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 4208, 1: 4208})


### 2. Tomek Links:

In [67]:
from imblearn.under_sampling import TomekLinks

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Tomek-link cleaning, restricted to the majority class.
tl = TomekLinks(sampling_strategy='majority')
X_train_tl, y_train_tl = tl.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_tl)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 17142, 1: 4208})


### 3. Edited Nearest Neighbours (ENN):

In [99]:
from imblearn.under_sampling import EditedNearestNeighbours

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Edited Nearest Neighbours with 3 neighbours, restricted to the majority class.
enn = EditedNearestNeighbours(n_neighbors=3, sampling_strategy='majority')
X_train_enn, y_train_enn = enn.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_enn)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 11271, 1: 4208})


### 4. Condensed Nearest Neighbour (CNN):

In [78]:
from imblearn.under_sampling import CondensedNearestNeighbour

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print('Before',counter)

# Condensed Nearest Neighbour under-sampling. The sampler is stochastic, so a
# fixed seed is passed (matching the other seeded samplers in this notebook)
# to make the resampled set reproducible across kernel restarts.
cnn = CondensedNearestNeighbour(n_neighbors=1, random_state=42)
X_train_cnn, y_train_cnn = cnn.fit_resample(X_train, y_train)

# Class distribution after resampling.
print("After", Counter(y_train_cnn))

Before Counter({0: 18497, 1: 4208})
After Counter({0: 6186, 1: 4208})


### 5. One-Sided Selection (OSS):

In [70]:
from imblearn.under_sampling import OneSidedSelection

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# One-Sided Selection with a fixed seed.
oss = OneSidedSelection(random_state=42)
X_train_oss, y_train_oss = oss.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_oss)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 17142, 1: 4208})


### 6. Neighbourhood Cleaning Rule (NCL):

In [71]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Neighbourhood Cleaning Rule using 3 neighbours.
ncl = NeighbourhoodCleaningRule(n_neighbors=3)
X_train_ncl, y_train_ncl = ncl.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_ncl)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 11407, 1: 4208})


## B) Oversampling:

### 1. SMOTE:

In [72]:
from imblearn.over_sampling import SMOTE

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# SMOTE over-sampling with a fixed seed.
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_sm)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 18497, 1: 18497})


### 2. SMOTE-NC (Nominal Continuous):

In [None]:
from imblearn.over_sampling import SMOTENC
from sklearn.preprocessing import LabelEncoder

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Positional indices of the feature columns SMOTE-NC should treat as categorical.
# NOTE(review): these are example indices only - confirm which columns they
# point at before relying on this cell.
categorical_features = [2, 4]

# SMOTE-NC over-sampling with a fixed seed.
smnc = SMOTENC(random_state=42, categorical_features=categorical_features)
X_train_smnc, y_train_smnc = smnc.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_smnc)
print(f'After: {counter}')

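* `The SMOTE-NC (Nominal Continuous) technique is not applicable in this case, as the categorical variables have already been transformed using one-hot encoding. The cell above is therefore left unexecuted and its result is not included in the model comparison.`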

### 3. Borderline-SMOTE:

In [73]:
from imblearn.over_sampling import BorderlineSMOTE

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Borderline-SMOTE (variant 'borderline-1') with a fixed seed.
bsm = BorderlineSMOTE(random_state=42, kind='borderline-1')
X_train_bsm, y_train_bsm = bsm.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_bsm)
print(f'After: {counter}')

Before Counter({0: 18497, 1: 4208})
After: Counter({0: 18497, 1: 18497})


### 4. ADASYN Technique

In [74]:
from imblearn.over_sampling import ADASYN

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# ADASYN over-sampling of the training split with a fixed seed.
ada = ADASYN(random_state=42)
X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_ada)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 18497, 1: 17388})


## C) Hybrid Techniques:

### 1. SMOTE + Tomek Links:

In [36]:
from imblearn.combine import SMOTETomek

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print(f'Before {counter}')

# Combined SMOTE + Tomek-link resampling of the training split, fixed seed.
smtom = SMOTETomek(random_state=139)
X_train_smtom, y_train_smtom = smtom.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_smtom)
print(f'After {counter}')

Before Counter({0: 18497, 1: 4208})
After Counter({0: 18090, 1: 18090})


### 2. SMOTE + ENN:

In [37]:
from imblearn.combine import SMOTEENN

# Class distribution of the training labels before resampling.
counter = Counter(y_train)
print('Before',counter)

# Combined SMOTE + ENN resampling of the training split. A fixed seed is
# passed (as for the other seeded samplers in this notebook) so the synthetic
# samples, and therefore the downstream metrics, are reproducible.
smenn = SMOTEENN(random_state=42)
X_train_smenn, y_train_smenn = smenn.fit_resample(X_train, y_train)

# Class distribution after resampling.
counter = Counter(y_train_smenn)
print('After',counter)

Before Counter({0: 18497, 1: 4208})
After Counter({1: 14689, 0: 8976})


# Model Building - Imbalanced data

In [100]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd
import numpy as np
from collections import Counter

def run_models_on_resampled_sets(dataset_dict, X_test, y_test, models=None):
    """Fit each model on each training-set variant and score it on one test set.

    Parameters
    ----------
    dataset_dict : dict
        Maps a resampling-method label to an ``(X_train_res, y_train_res)`` pair.
    X_test, y_test
        Features and labels every fitted model is scored against.
    models : dict, optional
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        When omitted, Logistic Regression, Random Forest and XGBoost are used
        with ``random_state=42``. Each estimator instance is re-fitted in place
        for every training-set variant.

    Returns
    -------
    pandas.DataFrame
        One row per (model, resampling method) with the columns ``Model``,
        ``Resampling``, ``Accuracy``, ``Precision``, ``Recall``, ``F1-score``,
        ``ROC-AUC`` and ``Train Size`` (a ``Counter`` of the training labels).
        ``ROC-AUC`` is NaN when ``y_test`` holds a single class or when the
        estimator offers neither ``predict_proba`` nor ``decision_function``.
    """
    if models is None:
        models = {
            "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
            "Random Forest": RandomForestClassifier(random_state=42),
            "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
        }

    # ROC-AUC is undefined for a single-class test set; this does not depend
    # on the model or training set, so it is checked once.
    test_has_both_classes = len(np.unique(y_test)) > 1

    results = []

    for model_name, model in models.items():
        for method, (X_train_res, y_train_res) in dataset_dict.items():
            # Train model
            model.fit(X_train_res, y_train_res)
            y_pred = model.predict(X_test)

            # Continuous score for the positive class. Estimators without
            # predict_proba fall back to decision_function; if neither exists
            # no ROC-AUC is reported, rather than scoring a constant vector
            # (which would yield a meaningless 0.5).
            if hasattr(model, "predict_proba"):
                y_score = model.predict_proba(X_test)[:, 1]
            elif hasattr(model, "decision_function"):
                y_score = model.decision_function(X_test)
            else:
                y_score = None

            # Compute metrics
            acc = accuracy_score(y_test, y_pred)
            prec = precision_score(y_test, y_pred, zero_division=0)
            rec = recall_score(y_test, y_pred, zero_division=0)
            f1 = f1_score(y_test, y_pred, zero_division=0)
            if test_has_both_classes and y_score is not None:
                roc = roc_auc_score(y_test, y_score)
            else:
                roc = np.nan

            results.append({
                "Model": model_name,
                "Resampling": method,
                "Accuracy": acc,
                "Precision": prec,
                "Recall": rec,
                "F1-score": f1,
                "ROC-AUC": roc,
                "Train Size": Counter(y_train_res)
            })

    return pd.DataFrame(results)

In [101]:
# Label -> (features, labels) for every training-set variant to compare,
# starting with the untouched training split.
dataset_dict = dict([
    ("Original", (X_train, y_train)),
    ("RUS", (X_train_rus, y_train_rus)),
    ("Tomek", (X_train_tl, y_train_tl)),
    ("ENN", (X_train_enn, y_train_enn)),
    ("CNN", (X_train_cnn, y_train_cnn)),
    ("OSS", (X_train_oss, y_train_oss)),
    ("NCL", (X_train_ncl, y_train_ncl)),
    ("SMOTE", (X_train_sm, y_train_sm)),
    ("Borderline-SMOTE", (X_train_bsm, y_train_bsm)),
    ("ADASYN", (X_train_ada, y_train_ada)),
    ("SMOTE + Tomek Links", (X_train_smtom, y_train_smtom)),
    ("SMOTE + ENN", (X_train_smenn, y_train_smenn)),
])

In [102]:
# Train and score every model on every training-set variant.
results_df = run_models_on_resampled_sets(
    dataset_dict=dataset_dict,
    X_test=X_test,
    y_test=y_test,
)

## Model Comparison

In [103]:
# Show the metrics table: one row per (model, resampling method) pair.
results_df

Unnamed: 0,Model,Resampling,Accuracy,Precision,Recall,F1-score,ROC-AUC,Train Size
0,Logistic Regression,Original,0.823146,0.75,0.068441,0.125436,0.771703,"{0: 18497, 1: 4208}"
1,Logistic Regression,RUS,0.77858,0.434755,0.64924,0.520778,0.77002,"{0: 4208, 1: 4208}"
2,Logistic Regression,Tomek,0.824731,0.761468,0.078897,0.14298,0.770861,"{0: 17142, 1: 4208}"
3,Logistic Regression,ENN,0.824379,0.578797,0.192015,0.288365,0.774958,"{0: 11271, 1: 4208}"
4,Logistic Regression,CNN,0.835476,0.684375,0.208175,0.319242,0.772916,"{0: 6186, 1: 4208}"
5,Logistic Regression,OSS,0.824731,0.761468,0.078897,0.14298,0.770861,"{0: 17142, 1: 4208}"
6,Logistic Regression,NCL,0.830016,0.680498,0.155894,0.253674,0.767734,"{0: 11407, 1: 4208}"
7,Logistic Regression,SMOTE,0.772943,0.429506,0.686312,0.528357,0.777765,"{0: 18497, 1: 18497}"
8,Logistic Regression,Borderline-SMOTE,0.772063,0.427458,0.677757,0.524265,0.776354,"{0: 18497, 1: 18497}"
9,Logistic Regression,ADASYN,0.812929,0.496372,0.65019,0.562963,0.779727,"{0: 18497, 1: 17388}"
