In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc, confusion_matrix, classification_report
from sklearn.metrics import confusion_matrix
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
import optuna
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
# Read in the data set
df = pd.read_csv('Customer-Churn-Records.csv')

df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0,0,5,GOLD,425


In [45]:
# drop row # and customer id and surname as they are not needed
df.drop(['RowNumber', 'CustomerId','Surname'], axis=1, inplace=True)

In [46]:
# Look at the distribution of the target variable
df['Exited'].value_counts()

0    7962
1    2038
Name: Exited, dtype: int64

In [47]:
# Get skewness of each column
skewness = df[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary',
               'Satisfaction Score','Point Earned']].skew()

skewness

CreditScore          -0.071607
Age                   1.011320
Tenure                0.010991
Balance              -0.141109
NumOfProducts         0.745568
EstimatedSalary       0.002085
Satisfaction Score   -0.008936
Point Earned          0.008344
dtype: float64

In [48]:
# Get kurtosis of each column
kurtosis = df[['CreditScore','Age','Tenure','Balance','NumOfProducts','EstimatedSalary',
                'Satisfaction Score','Point Earned']].kurtosis()

kurtosis

CreditScore          -0.425726
Age                   1.395347
Tenure               -1.165225
Balance              -1.489412
NumOfProducts         0.582981
EstimatedSalary      -1.181518
Satisfaction Score   -1.285097
Point Earned         -1.193781
dtype: float64

In [49]:
df.columns

Index(['CreditScore', 'Geography', 'Gender', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
       'Exited', 'Complain', 'Satisfaction Score', 'Card Type',
       'Point Earned'],
      dtype='object')

In [50]:
# define categorical and numerical columns
cat_cols = ['Geography','Gender','Card Type','HasCrCard','IsActiveMember', 'Complain','NumOfProducts']
num_cols = ['CreditScore','Age','Tenure','Balance','EstimatedSalary',
            'Satisfaction Score','Point Earned']

In [51]:
# one hot encode categorical variables
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# standard scale numerical variables
# scaler = StandardScaler()

# df[num_cols] = scaler.fit_transform(df[num_cols])


### Train/test split

In [52]:
# Create features and target
X = df.drop('Exited', axis=1)
y = df['Exited']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)


In [53]:
# Function to try different models
def try_models(X_train, y_train, X_test, y_test, model_name, model_class, params):
    # Initialize the model with the parameters passed or default parameters if none are provided
    model = model_class(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # If no parameters are provided, initialize an empty dictionary
    if params is None:
        params = {}

    # Train the model and make predictions
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Print the accuracy
    print(f"{model_name} accuracy: {accuracy_score(y_test, y_pred)}")
    return model

In [54]:
# Run a bunch of models and print their accuracy for a baseline comparison
try_models(X_train, y_train, X_test, y_test, 'XGBClassifier', XGBClassifier, {})
try_models(X_train, y_train, X_test, y_test, 'LGBMClassifier', LGBMClassifier, {})    
try_models(X_train, y_train, X_test, y_test, 'RandomForestClassifier', RandomForestClassifier, {}) 
try_models(X_train, y_train, X_test, y_test, 'LogisticRegression', LogisticRegression, {})
try_models(X_train, y_train, X_test, y_test, 'KNeighborsClassifier', KNeighborsClassifier, {})

XGBClassifier accuracy: 0.999
LGBMClassifier accuracy: 0.999
RandomForestClassifier accuracy: 0.999
LogisticRegression accuracy: 0.799
KNeighborsClassifier accuracy: 0.765


In [55]:
# Fit logistic regression model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression accuracy: {accuracy}")

# print the balanced accuracy score
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy Score: {balanced_accuracy}")

# Print the confusion matrix
confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(confusion)

Logistic Regression accuracy: 0.799
Balanced Accuracy Score: 0.526033526983569
Confusion Matrix:
[[1568   39]
 [ 363   30]]


In [56]:
# Look at what a 15 fold cross validation would look like to get a better idea of the accuracy
kf = KFold(n_splits=15, shuffle=True, random_state=42)

# Create a list to store the accuracy scores
accuracy_scores = []
balanced_accuracy_scores = []

# For each train-test split
for i, (train_index, test_index) in enumerate(kf.split(X)):
    # Split train-test
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model
    logreg.fit(X_train, y_train)

    # Calculate and print the accuracy and balanced accuracy for the current fold
    fold_accuracy = accuracy_score(y_test, logreg.predict(X_test))
    fold_balanced_accuracy = balanced_accuracy_score(y_test, logreg.predict(X_test))
    print(f"Fold {i+1} Accuracy: {fold_accuracy}")
    print(f"Fold {i+1} Balanced Accuracy: {fold_balanced_accuracy}")

    # Append the accuracy scores to the respective lists
    accuracy_scores.append(fold_accuracy)
    balanced_accuracy_scores.append(fold_balanced_accuracy)

# Print the mean accuracy scores
print('The mean accuracy score is:', np.mean(accuracy_scores))
print('The mean balanced accuracy score is:', np.mean(balanced_accuracy_scores))

Fold 1 Accuracy: 0.7916041979010495
Fold 1 Balanced Accuracy: 0.5105777400169924
Fold 2 Accuracy: 0.8110944527736131
Fold 2 Balanced Accuracy: 0.5400699912510936
Fold 3 Accuracy: 0.7961019490254873
Fold 3 Balanced Accuracy: 0.5288496541681834
Fold 4 Accuracy: 0.8065967016491754
Fold 4 Balanced Accuracy: 0.537794237012987
Fold 5 Accuracy: 0.800599700149925
Fold 5 Balanced Accuracy: 0.5263572554075348
Fold 6 Accuracy: 0.7856071964017991
Fold 6 Balanced Accuracy: 0.5719184866042688


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Fold 7 Accuracy: 0.7976011994002998
Fold 7 Balanced Accuracy: 0.519340321294292
Fold 8 Accuracy: 0.7616191904047976
Fold 8 Balanced Accuracy: 0.5022997590111684
Fold 9 Accuracy: 0.782608695652174
Fold 9 Balanced Accuracy: 0.5162374627270263
Fold 10 Accuracy: 0.823088455772114
Fold 10 Balanced Accuracy: 0.5213768115942029
Fold 11 Accuracy: 0.7927927927927928
Fold 11 Balanced Accuracy: 0.5199778024417314
Fold 12 Accuracy: 0.7597597597597597
Fold 12 Balanced Accuracy: 0.5028957528957528
Fold 13 Accuracy: 0.8393393393393394
Fold 13 Balanced Accuracy: 0.6979578392621871
Fold 14 Accuracy: 0.7987987987987988
Fold 14 Balanced Accuracy: 0.5244537234267658
Fold 15 Accuracy: 0.7882882882882883
Fold 15 Balanced Accuracy: 0.5212450592885376
The mean accuracy score is: 0.795700047873961
The mean balanced accuracy score is: 0.5360901264268484


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [57]:
# Predict probabilities for the test data, what does this look like and is it useful?
y_probs = logreg.predict_proba(X_test)

# Create a df with the predicted probabilities
df_probs = pd.DataFrame(y_probs, columns=["class_0", "class_1"])

df_probs.head()


Unnamed: 0,class_0,class_1
0,0.863012,0.136988
1,0.848446,0.151554
2,0.872278,0.127722
3,0.929115,0.070885
4,0.920339,0.079661
