In [800]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [801]:
df = pd.read_csv("../data/bank.csv")
print(df.shape)
print(f'Null values: {df.isna().sum().sum()}')
print(df.Exited.value_counts())
df.sample(1)

(10000, 14)
Null values: 0
Exited
0    7963
1    2037
Name: count, dtype: int64


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
7105,7106,15597536,Nkemjika,576,Spain,Male,45,5,133618.01,1,0,0,135244.87,0


In [802]:
df['Gender'] = df['Gender'].map({'Male' : 0, 'Female' : 1})
df.sample(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
1355,1356,15615029,Munro,734,Spain,0,39,6,0.0,1,1,1,95135.27,0
3429,3430,15674678,Bradley,731,Germany,1,43,9,79120.27,1,0,0,548.52,1
7401,7402,15677395,Nwabugwu,633,France,1,39,9,129189.15,2,0,0,170998.83,0
9176,9177,15610433,Kwemto,573,France,0,35,9,0.0,2,1,0,11743.89,0
876,877,15581229,Gregory,502,Germany,1,32,1,173340.83,1,0,1,122763.95,0


In [803]:
df = pd.get_dummies(df, columns=['Geography'], dtype=int, prefix='')
df 

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,_Germany,_Spain
0,1,15634602,Hargrave,619,1,42,2,0.00,1,1,1,101348.88,1,1,0,0
1,2,15647311,Hill,608,1,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,3,15619304,Onio,502,1,42,8,159660.80,3,1,0,113931.57,1,1,0,0
3,4,15701354,Boni,699,1,39,1,0.00,2,0,0,93826.63,0,1,0,0
4,5,15737888,Mitchell,850,1,43,2,125510.82,1,1,1,79084.10,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,0,39,5,0.00,2,1,0,96270.64,0,1,0,0
9996,9997,15569892,Johnstone,516,0,35,10,57369.61,1,1,1,101699.77,0,1,0,0
9997,9998,15584532,Liu,709,1,36,7,0.00,1,0,1,42085.58,1,1,0,0
9998,9999,15682355,Sabbatini,772,0,42,3,75075.31,2,1,0,92888.52,1,0,1,0


In [804]:
df = df.drop(columns=['RowNumber', 'CustomerId', 'Surname'])
df

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,_Germany,_Spain
0,619,1,42,2,0.00,1,1,1,101348.88,1,1,0,0
1,608,1,41,1,83807.86,1,0,1,112542.58,0,0,0,1
2,502,1,42,8,159660.80,3,1,0,113931.57,1,1,0,0
3,699,1,39,1,0.00,2,0,0,93826.63,0,1,0,0
4,850,1,43,2,125510.82,1,1,1,79084.10,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,39,5,0.00,2,1,0,96270.64,0,1,0,0
9996,516,0,35,10,57369.61,1,1,1,101699.77,0,1,0,0
9997,709,1,36,7,0.00,1,0,1,42085.58,1,1,0,0
9998,772,0,42,3,75075.31,2,1,0,92888.52,1,0,1,0


In [805]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['Exited']), df['Exited'], test_size=0.25, random_state=0) 

In [806]:
dt = DecisionTreeClassifier(criterion='gini')
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
accuracy_score(y_test, y_pred)

0.7932

In [807]:
new_col = pd.DataFrame((df['EstimatedSalary'] * df['Tenure'] + df['Balance']) / 2, columns=['Parameter'])
df.join(new_col)

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,_Germany,_Spain,Parameter
0,619,1,42,2,0.00,1,1,1,101348.88,1,1,0,0,101348.880
1,608,1,41,1,83807.86,1,0,1,112542.58,0,0,0,1,98175.220
2,502,1,42,8,159660.80,3,1,0,113931.57,1,1,0,0,535556.680
3,699,1,39,1,0.00,2,0,0,93826.63,0,1,0,0,46913.315
4,850,1,43,2,125510.82,1,1,1,79084.10,0,0,0,1,141839.510
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,39,5,0.00,2,1,0,96270.64,0,1,0,0,240676.600
9996,516,0,35,10,57369.61,1,1,1,101699.77,0,1,0,0,537183.655
9997,709,1,36,7,0.00,1,0,1,42085.58,1,1,0,0,147299.530
9998,772,0,42,3,75075.31,2,1,0,92888.52,1,0,1,0,176870.435


In [808]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['Exited']), df['Exited'], test_size=0.25, random_state=0)

dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
accuracy_score(y_test, y_pred)

0.796

In [809]:
df = pd.get_dummies(df, columns=['Tenure'], dtype=int)
df

Unnamed: 0,CreditScore,Gender,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,...,Tenure_1,Tenure_2,Tenure_3,Tenure_4,Tenure_5,Tenure_6,Tenure_7,Tenure_8,Tenure_9,Tenure_10
0,619,1,42,0.00,1,1,1,101348.88,1,1,...,0,1,0,0,0,0,0,0,0,0
1,608,1,41,83807.86,1,0,1,112542.58,0,0,...,1,0,0,0,0,0,0,0,0,0
2,502,1,42,159660.80,3,1,0,113931.57,1,1,...,0,0,0,0,0,0,0,1,0,0
3,699,1,39,0.00,2,0,0,93826.63,0,1,...,1,0,0,0,0,0,0,0,0,0
4,850,1,43,125510.82,1,1,1,79084.10,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,39,0.00,2,1,0,96270.64,0,1,...,0,0,0,0,1,0,0,0,0,0
9996,516,0,35,57369.61,1,1,1,101699.77,0,1,...,0,0,0,0,0,0,0,0,0,1
9997,709,1,36,0.00,1,0,1,42085.58,1,1,...,0,0,0,0,0,0,1,0,0,0
9998,772,0,42,75075.31,2,1,0,92888.52,1,0,...,0,0,1,0,0,0,0,0,0,0


In [810]:
df['EstimatedSalary'] = MinMaxScaler().fit_transform(df['EstimatedSalary'].to_frame())
df

Unnamed: 0,CreditScore,Gender,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,...,Tenure_1,Tenure_2,Tenure_3,Tenure_4,Tenure_5,Tenure_6,Tenure_7,Tenure_8,Tenure_9,Tenure_10
0,619,1,42,0.00,1,1,1,0.506735,1,1,...,0,1,0,0,0,0,0,0,0,0
1,608,1,41,83807.86,1,0,1,0.562709,0,0,...,1,0,0,0,0,0,0,0,0,0
2,502,1,42,159660.80,3,1,0,0.569654,1,1,...,0,0,0,0,0,0,0,1,0,0
3,699,1,39,0.00,2,0,0,0.469120,0,1,...,1,0,0,0,0,0,0,0,0,0
4,850,1,43,125510.82,1,1,1,0.395400,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,39,0.00,2,1,0,0.481341,0,1,...,0,0,0,0,1,0,0,0,0,0
9996,516,0,35,57369.61,1,1,1,0.508490,0,1,...,0,0,0,0,0,0,0,0,0,1
9997,709,1,36,0.00,1,0,1,0.210390,1,1,...,0,0,0,0,0,0,1,0,0,0
9998,772,0,42,75075.31,2,1,0,0.464429,1,0,...,0,0,1,0,0,0,0,0,0,0


In [811]:
df['Balance'] = MinMaxScaler().fit_transform(df['Balance'].to_frame())
df

Unnamed: 0,CreditScore,Gender,Age,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,_France,...,Tenure_1,Tenure_2,Tenure_3,Tenure_4,Tenure_5,Tenure_6,Tenure_7,Tenure_8,Tenure_9,Tenure_10
0,619,1,42,0.000000,1,1,1,0.506735,1,1,...,0,1,0,0,0,0,0,0,0,0
1,608,1,41,0.334031,1,0,1,0.562709,0,0,...,1,0,0,0,0,0,0,0,0,0
2,502,1,42,0.636357,3,1,0,0.569654,1,1,...,0,0,0,0,0,0,0,1,0,0
3,699,1,39,0.000000,2,0,0,0.469120,0,1,...,1,0,0,0,0,0,0,0,0,0
4,850,1,43,0.500246,1,1,1,0.395400,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,0,39,0.000000,2,1,0,0.481341,0,1,...,0,0,0,0,1,0,0,0,0,0
9996,516,0,35,0.228657,1,1,1,0.508490,0,1,...,0,0,0,0,0,0,0,0,0,1
9997,709,1,36,0.000000,1,0,1,0.210390,1,1,...,0,0,0,0,0,0,1,0,0,0
9998,772,0,42,0.299226,2,1,0,0.464429,1,0,...,0,0,1,0,0,0,0,0,0,0


In [812]:
x_train, x_test, y_train, y_test = train_test_split(df.drop(columns=['Exited']), df['Exited'], test_size=0.25, random_state=0)

dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)
accuracy_score(y_test, y_pred)

0.81