In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [2]:
df = pd.read_csv('/content/drive/MyDrive/12th Grade/Machine Learning/Data/Bank Churn Dataset/train.csv/train.csv')

In [3]:
df.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,0,15674932,Okwudilichukwu,668,France,Male,33.0,3,0.0,2,1.0,0.0,181449.97,0
1,1,15749177,Okwudiliolisa,627,France,Male,33.0,1,0.0,2,1.0,1.0,49503.5,0
2,2,15694510,Hsueh,678,France,Male,40.0,10,0.0,2,1.0,0.0,184866.69,0
3,3,15741417,Kao,581,France,Male,34.0,2,148882.54,1,1.0,1.0,84560.88,0
4,4,15766172,Chiemenam,716,Spain,Male,33.0,5,0.0,2,1.0,1.0,15068.83,0


In [4]:
df.shape

(165034, 14)

In [5]:
df['Exited'].sum()

34921

In [6]:
df.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [7]:
len(df.loc[(df['Gender']=='Male')&(df['Exited']==1)].index)/len(df.loc[df['Exited']==1].index)

0.42427192806620656

In [8]:
df.groupby(['Gender','Exited']).mean()

  df.groupby(['Gender','Exited']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,id,CustomerId,CreditScore,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
Gender,Exited,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Female,0,82305.528245,15692650.0,657.624481,36.674778,5.026459,51245.019804,1.62058,0.754456,0.550957,112126.11825
Female,1,82685.868142,15690910.0,652.68769,44.141806,4.904501,70138.122401,1.326038,0.741408,0.292166,115322.571535
Male,0,82552.963145,15692190.0,657.568387,36.484929,5.063561,51262.938616,1.611765,0.761828,0.553068,112056.647593
Male,1,82831.190605,15690250.0,651.586461,43.709706,4.927781,72664.463676,1.330319,0.727592,0.298529,113153.974419


In [9]:
df['Credit Range'] = 0
df.loc[df['CreditScore'] >= 800, 'Credit Range'] = 4
df.loc[(df['CreditScore'] >= 740) & (df['CreditScore'] < 800), 'Credit Range'] = 3
df.loc[(df['CreditScore'] >= 670) & (df['CreditScore'] < 739), 'Credit Range'] = 2
df.loc[(df['CreditScore'] >= 580) & (df['CreditScore'] < 669), 'Credit Range'] = 1

In [10]:
chars = []
for index,row in df.iterrows():
  chars.append(' '.join(row['Surname'].lower()))

In [11]:
df['SurChars'] = chars

In [12]:
vectorizer = CountVectorizer(analyzer='char')
vectorizer.fit_transform(df['SurChars'])

<165034x30 sparse matrix of type '<class 'numpy.int64'>'
	with 1093844 stored elements in Compressed Sparse Row format>

In [13]:
df.columns

Index(['id', 'CustomerId', 'Surname', 'CreditScore', 'Geography', 'Gender',
       'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited', 'Credit Range',
       'SurChars'],
      dtype='object')

In [14]:
features = ['Geography', 'Gender',
       'Age', 'Balance', 'NumOfProducts',
       'IsActiveMember', 'Credit Range', 'SurChars']

In [15]:
X = df[features]
y = df['Exited']

In [16]:
ohe = OneHotEncoder()

In [17]:
ct = make_column_transformer((ohe, ['Geography','Gender']), (vectorizer, 'SurChars'), remainder='passthrough')

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb

In [19]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
pipe = make_pipeline_imb(ct,SMOTE(),xg)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8666962897133971


In [20]:
print(confusion_matrix(y_test,y_pred))
print(y_test.value_counts())

[[36881  2252]
 [ 4348  6030]]
0    39133
1    10378
Name: Exited, dtype: int64


In [21]:
models = {
    'LogisticRegression': LogisticRegression(max_iter=200),
    'RandomForest': RandomForestClassifier(n_estimators=100),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [22]:
cv_scores = {}
for model_name, model in models.items():
  pipe = make_pipeline_imb(ct,SMOTE(),model)
  scores = cross_val_score(pipe, X, y, cv=5)
  cv_scores[model_name] = scores
  print(f"{model_name}: Accuracy per fold: {scores}")
  print(f"{model_name}: Average accuracy: {np.mean(scores)}\n")

best_model = max(cv_scores, key=lambda k: np.mean(cv_scores[k]))
print(f"Best model based on average accuracy: {best_model}")

LogisticRegression: Accuracy per fold: [0.21159148 0.21159148 0.21159148 0.21162178 0.21159789]
LogisticRegression: Average accuracy: 0.21159882205526884

RandomForest: Accuracy per fold: [0.85481867 0.85072863 0.85303118 0.85033478 0.8496637 ]
RandomForest: Average accuracy: 0.8517153916666012

XGBoost: Accuracy per fold: [0.86939134 0.86469537 0.86845215 0.86427122 0.86487305]
XGBoost: Average accuracy: 0.8663366247950639

Best model based on average accuracy: XGBoost


In [23]:
df_new = pd.read_csv('/content/drive/MyDrive/12th Grade/Machine Learning/Data/Bank Churn Dataset/test.csv/test.csv')

In [24]:
df_new.head()

Unnamed: 0,id,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,165034,15773898,Lucchese,586,France,Female,23.0,2,0.0,2,0.0,1.0,160976.75
1,165035,15782418,Nott,683,France,Female,46.0,2,0.0,1,1.0,0.0,72549.27
2,165036,15807120,K?,656,France,Female,34.0,7,0.0,2,1.0,0.0,138882.09
3,165037,15808905,O'Donnell,681,France,Male,36.0,8,0.0,1,1.0,0.0,113931.57
4,165038,15607314,Higgins,752,Germany,Male,38.0,10,121263.62,1,1.0,0.0,139431.0


In [25]:
df_new.isna().sum()

id                 0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
dtype: int64

In [26]:
df_new['Credit Range'] = 0
df_new.loc[df_new['CreditScore'] >= 800, 'Credit Range'] = 4
df_new.loc[(df_new['CreditScore'] >= 740) & (df_new['CreditScore'] < 800), 'Credit Range'] = 3
df_new.loc[(df_new['CreditScore'] >= 670) & (df_new['CreditScore'] < 739), 'Credit Range'] = 2
df_new.loc[(df_new['CreditScore'] >= 580) & (df_new['CreditScore'] < 669), 'Credit Range'] = 1

In [27]:
chars2 = []
for index,row in df_new.iterrows():
  chars2.append(' '.join(row['Surname'].lower()))

In [28]:
df_new['SurChars'] = chars2

In [29]:
vect2 = CountVectorizer(analyzer='char')
vect2.fit_transform(df_new['SurChars'])

<110023x30 sparse matrix of type '<class 'numpy.int64'>'
	with 729372 stored elements in Compressed Sparse Row format>

In [30]:
X_new = df_new[features]

In [31]:
xg = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [32]:
pipe_new = make_pipeline_imb(ct,SMOTE(),xg)
pipe_new.fit(X,y)

y_new = pipe_new.predict(X_new)

In [33]:
res = pd.DataFrame({'id':df_new['id'], 'Exited':y_new})

In [34]:
res['Exited'].unique()

array([0, 1])

In [35]:
res['Exited'].sum()/len(res.index)

0.16513819837670304

In [36]:
#res.to_csv('/content/drive/MyDrive/12th Grade/Machine Learning/Data/Bank Churn Dataset/result3.csv', index=False)