In [1]:
#importing the libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
#loading the churn dataset from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="Churn_Modelling.csv")


In [3]:
#counting the missing values in each column
data.isna().sum()

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [4]:
#removing columns which are not relevant for prediction
#drop() returns a new frame and errors="ignore" makes this cell safe to re-run
#(the previous in-place drop raised KeyError on a second run because "Surname" was already gone)
#NOTE(review): RowNumber and CustomerId look like row identifiers and are still kept as features;
#dropping them as well would shift the column positions ([3] and [6]) used by the encoders below
data = data.drop(columns=["Surname"], errors="ignore")

In [5]:
#separating the independent variables (every column except the last)
#from the dependent variable (the last column)
x = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy()

In [6]:
#quick look at the feature matrix before any encoding
print(x)

[[1 15634602 619 ... 1 1 101348.88]
 [2 15647311 608 ... 0 1 112542.58]
 [3 15619304 502 ... 1 0 113931.57]
 ...
 [9998 15584532 709 ... 0 1 42085.58]
 [9999 15682355 772 ... 1 0 92888.52]
 [10000 15628319 792 ... 1 0 38190.78]]


In [7]:
#one-hot encoding the categorical variable (Geography) found at column index 3 of x;
#every other column is passed through unchanged
#NOTE(review): x is overwritten with the encoded matrix, so running this cell a second time
#would encode whatever column sits at index 3 of the already-encoded x
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [3])],
    remainder="passthrough",
)
x = ct.fit_transform(x)

In [8]:
#inspecting the feature matrix after the Geography one-hot encoding
print(x)

[[1.0 0.0 0.0 ... 1 1 101348.88]
 [0.0 0.0 1.0 ... 0 1 112542.58]
 [1.0 0.0 0.0 ... 1 0 113931.57]
 ...
 [1.0 0.0 0.0 ... 0 1 42085.58]
 [0.0 1.0 0.0 ... 1 0 92888.52]
 [1.0 0.0 0.0 ... 1 0 38190.78]]


In [9]:
#one-hot encoding the categorical variable (Gender)
#index 6 refers to the already-encoded x: the Geography dummy columns come first
#(presumably three of them), which moves Gender from its earlier position to index 6
#NOTE(review): x is overwritten again, so this cell must run exactly once after the Geography encoding
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [6])],
    remainder="passthrough",
)
x = ct.fit_transform(x)

In [10]:
#separating the dataset into training data (80%) and testing data (20%)
#random_state is fixed so the same split is produced on every run
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)


In [11]:
#building the model using Random Forest Classifier
#hyperparameter tuning of the Random Forest classifier with an exhaustive grid search
#(the names "regression"/"CV_regression" are kept because later cells use CV_regression,
#even though the estimator is a classifier)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
regression = RandomForestClassifier(random_state=0)
#2 x 1 x 5 x 2 = 20 parameter combinations, each evaluated with 5-fold cross validation
param_grid = {
    'n_estimators': [200, 500],
    'max_features':  ['sqrt'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
#n_jobs=-1 runs the candidate fits in parallel on all available CPU cores;
#the fixed random_state keeps the selected model the same as in a sequential run
CV_regression = GridSearchCV(estimator=regression, param_grid=param_grid, cv=5, n_jobs=-1)
CV_regression.fit(x_train, y_train)


In [12]:
#printing the predictions next to the true test labels
#(left column: predicted value, right column: y_test)
from sklearn.metrics import accuracy_score
y_pred = CV_regression.predict(x_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))


[[0 0]
 [0 1]
 [0 0]
 ...
 [0 0]
 [0 0]
 [0 0]]


In [13]:
#printing the confusion matrix and showing the accuracy score on the test data
#(the accuracy is the last expression of the cell, so the notebook displays it)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[1560   35]
 [ 232  173]]


0.8665

In [14]:
#printing the cross validation score on the training data
#the estimator passed in is the grid search object itself, so the hyperparameter
#search is repeated inside each of the 2 folds (nested cross validation)
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(CV_regression, x_train, y_train, cv=2)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 85.36 %
Standard Deviation: 0.54 %
