In [19]:
# importing the necessary packages
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import accuracy_score

from category_encoders import TargetEncoder
from category_encoders import OneHotEncoder
# import xgboost as xgb

import pandas as pd
import seaborn as sns
import numpy as np
import itertools
import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv('Churn_Modelling.csv')

In [20]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [21]:
df.isna().sum() 
# there are no missing values! This is extremely lucky and rare!

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [22]:
df['Gender'].value_counts()

Male      5457
Female    4543
Name: Gender, dtype: int64

In [23]:
#making 'Gender' the target variable
df['Target'] = df['Gender'].map(lambda x: 1 if x == 'male' else 0)

In [24]:
#dropping the old column because we have the new 'Target' column
df.drop('Gender', axis=1, inplace=True)

In [25]:
# taking a random sample of the dataframe
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Target
5202,5203,15713354,Morrice,597,Germany,22,6,101528.61,1,1,0,70529.0,1,0
8259,8260,15703199,Golibe,619,Spain,38,3,96143.47,1,0,0,98994.92,0,0
515,516,15746902,Belstead,793,Spain,38,9,0.0,2,1,0,88225.02,0,0
943,944,15781465,Schofield,675,Germany,29,8,121326.42,1,1,0,133457.52,0,0
107,108,15812878,Parsons,785,Germany,36,2,99806.85,1,0,1,36976.52,0,0


In [26]:
# creating my train test split for the models
X = df.drop(['Target'], axis=1)
y = df['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=25)

In [27]:
# creating the baseline null score. the models should be more accurate than the null score.
ns = 1 - y_test.mean()
print(f'Null score: {ns}')

Null score: 1.0


In [28]:
# target encoding the df so that the computer can read everything
enc = TargetEncoder()
data_train = enc.fit_transform(X_train, y_train)
data_test = enc.transform(X_test)

In [30]:
data_train.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2569,2570,15700657,0.0,641,0.0,40,2,110086.69,1,1,0,159773.14,0
4807,4808,15753248,0.0,611,0.0,28,2,0.0,2,0,0,25395.83,0
6576,6577,15808905,0.0,823,0.0,37,5,164858.18,1,1,1,173516.71,0
3318,3319,15625126,0.0,629,0.0,40,6,0.0,2,1,1,139356.3,0
9900,9901,15645896,0.0,646,0.0,39,6,121681.91,2,0,1,61793.47,0
