In [None]:
import numpy as np
import pandas as pd
import sys
import os

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', '')))
from src.customer_churn_prediction.utils import DropColumnsTransformer
from src.customer_churn_prediction.utils import remove_numerical_outliers

from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('Churn_Modelling.csv')

In [5]:
df['Exited'] = df['Exited'].replace({1: 'Yes', 0: 'No'})

In [6]:
df = remove_numerical_outliers(df)

In [7]:
df['Exited'] = df['Exited'].replace({'Yes': 1, 'No': 0})

In [8]:
X, y = df.drop(['Exited'], axis=1), df['Exited']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
categorical_cols = ['Geography', 'Gender']
numerical_cols = ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']
dropped_cols = ['RowNumber', 'CustomerId', 'Surname']

In [10]:
columns_to_drop = DropColumnsTransformer(dropped_cols)

In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('drop', columns_to_drop, dropped_cols),
        ('num', StandardScaler(), numerical_cols),  # Apply StandardScaler to numerical columns
        ('cat', OneHotEncoder(), categorical_cols)      # Apply OneHotEncoder to nominal columns
    ])

In [12]:
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [66]:

models = {
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier()
}

param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l2'],
        'class_weight':['balanced'],
        'max_iter':[500]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'class_weight':['balanced']
    },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'class_weight':['balanced']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'criterion': ['gini', 'entropy'],
        'class_weight':['balanced']
    },
    'NaiveBayes': {
        # Gaussian Naive Bayes doesn't require much tuning, but we can tweak priors if needed.
        'priors': [None, [0.3, 0.7]],
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Running GridSearchCV on each model
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_preprocessed, y_train)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate the best models
for model_name, model in best_models.items():
    print(f"\nEvaluating best model: {model_name} with hyperparameters: {grid_search.best_params_}")
    y_pred = model.predict(preprocessor.transform(X_test))
    print(classification_report(y_test, y_pred))
    print('-'*100)


Running GridSearchCV for LogisticRegression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits


Running GridSearchCV for KNN...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Running GridSearchCV for SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running GridSearchCV for DecisionTree...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Running GridSearchCV for RandomForest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Running GridSearchCV for NaiveBayes...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running GridSearchCV for GradientBoosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Evaluating best model: LogisticRegression with hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.92      0.73      0.81      1524
           1       0.41      0.74      0.53       390

    accuracy                           0.73      1914
   macro avg       0.66      0.73      0.67      1914
weight

## Since the target set is imbalanced, we will try to test this models with oversampled minority class.

In [13]:
# Oversampling the minority class to balance the target
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_preprocessed, y_train)

## Running models after oversampling

In [20]:

models = {
    'LogisticRegression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'SVM': SVC(),
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'NaiveBayes': GaussianNB(),
    'GradientBoosting': GradientBoostingClassifier()
}

param_grids = {
    'LogisticRegression': {
        'C': [0.01, 0.1, 1, 10],
        'solver': ['liblinear', 'saga'],
        'penalty': ['l2'],
        'class_weight':['balanced'],
        'max_iter':[500]
    },
    'KNN': {
        'n_neighbors': [3, 5, 7, 10],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan'],
    },
    'SVM': {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': ['scale', 'auto'],
        'class_weight':['balanced']
    },
    'DecisionTree': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'class_weight':['balanced']
    },
    'RandomForest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'criterion': ['gini', 'entropy'],
        'class_weight':['balanced']
    },
    'NaiveBayes': {
        # Gaussian Naive Bayes doesn't require much tuning, but we can tweak priors if needed.
        'priors': [None, [0.3, 0.7]],
    },
    'GradientBoosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7]
    }
}

# Running GridSearchCV on each model
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate the best models
for model_name, model in best_models.items():
    print(f"\nEvaluating best model: {model_name} with hyperparameters: {grid_search.best_params_}")
    y_pred = model.predict(preprocessor.transform(X_test))
    print(classification_report(y_test, y_pred))
    print('-'*100)


Running GridSearchCV for LogisticRegression...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Running GridSearchCV for KNN...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Running GridSearchCV for SVM...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Running GridSearchCV for DecisionTree...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Running GridSearchCV for RandomForest...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Running GridSearchCV for NaiveBayes...
Fitting 5 folds for each of 2 candidates, totalling 10 fits
Running GridSearchCV for GradientBoosting...
Fitting 5 folds for each of 27 candidates, totalling 135 fits

Evaluating best model: LogisticRegression with hyperparameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.91      0.73      0.81      1524
           1       0.41      0.73      0.53       390

    ac

## 

In [19]:

models = {
    'XGBClassifier': XGBClassifier()
}

param_grids = {
    'XGBClassifier':{
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'gamma': [0, 0.1, 0.2]
    }
}

# Running GridSearchCV on each model
best_models = {}
for model_name, model in models.items():
    print(f"Running GridSearchCV for {model_name}...")
    grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_resampled, y_train_resampled)
    best_models[model_name] = grid_search.best_estimator_

# Evaluate the best models
for model_name, model in best_models.items():
    print(f"\nEvaluating best model: {model_name} with hyperparameters: {grid_search.best_params_}")
    y_pred = model.predict(preprocessor.transform(X_test))
    print(classification_report(y_test, y_pred))
    print('-'*100)


Running GridSearchCV for XGBClassifier...
Fitting 5 folds for each of 324 candidates, totalling 1620 fits

Evaluating best model: XGBClassifier with hyperparameters: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}
              precision    recall  f1-score   support

           0       0.89      0.93      0.91      1524
           1       0.66      0.56      0.60       390

    accuracy                           0.85      1914
   macro avg       0.78      0.74      0.76      1914
weighted avg       0.84      0.85      0.85      1914

----------------------------------------------------------------------------------------------------


## xgboost performs well with hyperparameters {colsample_bytree = 0.8, gamma = 0, learning_rate = 0.1, max_depth = 7, n_estimators = 200, subsample = 0.8}