<a href="https://colab.research.google.com/github/databyhuseyn/MachineLearning/blob/main/Churn_Modelling_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the churn CSV from the Colab working directory and display the frame.
data = pd.read_csv(filepath_or_buffer='/content/Churn_Modelling.csv')
data

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.00,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.00,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1


In [None]:
# Keep only the first 2000 rows whose Exited flag is 0.
# NOTE(review): head() takes rows in file order, not a random sample — confirm this is intended.
data_neg = data.loc[data['Exited'].eq(0)].head(2000)

In [None]:
# Rebuild `data` as the retained Exited == 0 rows followed by every
# Exited == 1 row, with a fresh 0..n-1 index.
data = pd.concat(
    [data_neg, data.loc[data['Exited'].eq(1)]],
    axis=0,
    ignore_index=True,
)

In [None]:
# Remove the identifier columns before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError once the columns
# are already gone.
data = data.drop(columns=['RowNumber', 'CustomerId', 'Surname'], errors='ignore')

In [None]:
# Print column names, non-null counts and dtypes of the current frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4037 entries, 0 to 4036
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      4037 non-null   int64  
 1   Geography        4037 non-null   object 
 2   Gender           4037 non-null   object 
 3   Age              4037 non-null   int64  
 4   Tenure           4037 non-null   int64  
 5   Balance          4037 non-null   float64
 6   NumOfProducts    4037 non-null   int64  
 7   HasCrCard        4037 non-null   int64  
 8   IsActiveMember   4037 non-null   int64  
 9   EstimatedSalary  4037 non-null   float64
 10  Exited           4037 non-null   int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 347.1+ KB


In [None]:
# Count missing values per column.
data.isna().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [None]:
# Separate the target column from the feature columns.
y = data['Exited'].copy()
X = data.drop(columns=['Exited'])

In [None]:
# Show how many rows fall in each target class.
y.value_counts()

1    2037
0    2000
Name: Exited, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target.
# A fixed random_state makes the split (and every score computed from it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Column names routed to each preprocessing branch, selected by dtype.
num_features = X.select_dtypes(include=[np.number]).columns
cat_features = X.select_dtypes(exclude=[np.number]).columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [None]:
# Categorical branch: fill gaps with the literal 'Missing', then one-hot encode.
# handle_unknown='ignore' is required for the imputer to be useful: a category
# that first appears at predict time (including the 'Missing' fill value when
# the training split has no gaps) would otherwise make the encoder raise.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('one_hot', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric branch: median-impute, then standardise.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Route each column group through its branch; any column in neither group is
# passed through untouched.
transformer = ColumnTransformer([
    ('cat', cat_pipeline, cat_features),
    ('num', num_pipeline, num_features)
], remainder='passthrough')

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Preprocessing followed by a random forest.
# random_state pins the forest's bootstrap/feature sampling so the scores
# below are reproducible from run to run.
full_pipeline_rfc = Pipeline([
    ('preprocessing', transformer),
    ('rfc', RandomForestClassifier(random_state=42))       # class_weight={0:1, 1:3}
])

full_pipeline_rfc.fit(X_train, y_train)

In [None]:
# Pipeline score on the training split, then on the held-out split.
tuple(
    full_pipeline_rfc.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(1.0, 0.7660891089108911)

(1.0, 0.7648514851485149)


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

In [None]:
# Random-forest pipeline predictions on the held-out split.
predictions_rfc = full_pipeline_rfc.predict(X_test)

In [None]:
# F1, recall and precision of the random-forest predictions.
# Uses `predictions_rfc` (the name actually defined in this notebook) and the
# scikit-learn argument order (y_true, y_pred); with the arguments reversed,
# the recall and precision values trade places.
(
    f1_score(y_test, predictions_rfc),
    recall_score(y_test, predictions_rfc),
    precision_score(y_test, predictions_rfc),
)

(0.759235668789809, 0.7904509283819628, 0.7303921568627451)

(0.7570332480818415, 0.7914438502673797, 0.7254901960784313)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Preprocessing followed by a logistic regression with default settings.
full_pipeline_logr = Pipeline(
    steps=[
        ('preprocessing', transformer),
        ('logr', LogisticRegression()),  # class_weight={0:1, 1:3}
    ]
)

full_pipeline_logr.fit(X_train, y_train)

In [None]:
# Pipeline score on the training split, then on the held-out split.
tuple(
    full_pipeline_logr.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.7042427996283679, 0.6992574257425742)

In [None]:
# Logistic-regression pipeline predictions on the held-out split.
predictions_logr = full_pipeline_logr.predict(X_test)

In [None]:
# F1, recall and precision of the logistic-regression predictions.
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed,
# the recall and precision values trade places.
(
    f1_score(y_test, predictions_logr),
    recall_score(y_test, predictions_logr),
    precision_score(y_test, predictions_logr),
)

(0.6966292134831461, 0.7099236641221374, 0.6838235294117647)

In [None]:
from sklearn.svm import SVC

In [None]:
# Preprocessing followed by a support-vector classifier with default settings.
full_pipeline_svc = Pipeline(
    steps=[
        ('preprocessing', transformer),
        ('svc', SVC()),  # class_weight={0:1, 1:3}
    ]
)

full_pipeline_svc.fit(X_train, y_train)

In [None]:
# Pipeline score on the training split, then on the held-out split.
tuple(
    full_pipeline_svc.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.806751316196965, 0.7747524752475248)

In [None]:
# SVC pipeline predictions on the held-out split.
predictions_svc = full_pipeline_svc.predict(X_test)

In [None]:
# F1, recall and precision of the SVC predictions.
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed,
# the recall and precision values trade places.
(
    f1_score(y_test, predictions_svc),
    recall_score(y_test, predictions_svc),
    precision_score(y_test, predictions_svc),
)

(0.7672634271099743, 0.8021390374331551, 0.7352941176470589)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Kernels to compare for the SVC step of the pipeline.
# 'precomputed' is left out: it expects a square kernel matrix as input, so
# every fit on the preprocessed feature matrix fails and scores NaN.
# 'rbf' (the SVC default) is included so the search also covers the
# untuned baseline.
param_grid_svc = {
    'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}

# 3-fold cross-validated search over the grid, fitted on the training split.
grid_search_svc = GridSearchCV(full_pipeline_svc, param_grid_svc, cv=3)
grid_search_svc.fit(X_train, y_train)

3 fits failed out of a total of 12.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/pipeline.py", line 405, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/svm/_base.py", line 217, in fit
    raise ValueError(
ValueError: Precomputed matrix must be a square matrix. Input is a 2152x13 matrix.

-------------------------------------------------------------------------------

In [None]:
# Parameter combination with the best cross-validated score.
grid_search_svc.best_params_

{'svc__kernel': 'poly'}

In [None]:
# GridSearchCV was built with the default refit=True, so best_estimator_ has
# already been refit on the full (X_train, y_train) passed to the search.
# Fitting it again only repeats that work; just display the fitted pipeline.
grid_search_svc.best_estimator_

In [None]:
# Score of the tuned pipeline on the training split, then on the held-out split.
tuple(
    grid_search_svc.best_estimator_.score(features, target)
    for features, target in ((X_train, y_train), (X_test, y_test))
)

(0.8024156085475379, 0.7784653465346535)

In [None]:
# Tuned SVC pipeline predictions on the held-out split.
predictions_svc2 = grid_search_svc.best_estimator_.predict(X_test)

In [None]:
# F1, recall and precision of the tuned SVC predictions.
# scikit-learn metrics take (y_true, y_pred); with the arguments reversed,
# the recall and precision values trade places.
(
    f1_score(y_test, predictions_svc2),
    recall_score(y_test, predictions_svc2),
    precision_score(y_test, predictions_svc2),
)

(0.7708066581306019, 0.806970509383378, 0.7377450980392157)