# Importing useful Libraries 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Importing the datasets

In [None]:
# Read the churn CSV into a DataFrame.
# NOTE(review): the path looks like a Google Drive mount inside Colab — confirm
# the drive is mounted (or adjust the path) before running this cell.
csv_path = "/content/gdrive/My Drive/customer_churn_prediction/Churn_Modelling.csv"
dataset = pd.read_csv(csv_path)

In [6]:
# Per-column summary: names, non-null counts and dtypes (printed by info()).
dataset.info()
# (rows, columns) of the raw table, shown as the cell's output value.
dataset.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


(10000, 14)

In [7]:
# Preview the first rows of the raw data to see what each column holds.
dataset.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Since RowNumber, CustomerId and Surname don't provide much information for predicting customer churn in a bank,
# we remove those columns from our dataset.


In [None]:
# Feature frame: columns 3..12 (CreditScore through EstimatedSalary), which
# skips the identifier columns RowNumber, CustomerId and Surname.
# The explicit .copy() makes X independent of `dataset`, so the column
# assignments made to X later in the notebook do not raise
# SettingWithCopyWarning or risk writing through to the raw data.
X = dataset.iloc[:, 3:13].copy()
# Target: column 13 (Exited).
y = dataset.iloc[:, 13]

In [9]:
# Shape check for the feature frame: (rows, feature columns).
X.shape

(10000, 10)

# Printing the  feature values of our dataset

In [11]:
# Display the feature frame before any encoding is applied.
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,France,Female,42,2,0.00,1,1,1,101348.88
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58
2,502,France,Female,42,8,159660.80,3,1,0,113931.57
3,699,France,Female,39,1,0.00,2,0,0,93826.63
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,France,Male,39,5,0.00,2,1,0,96270.64
9996,516,France,Male,35,10,57369.61,1,1,1,101699.77
9997,709,France,Female,36,7,0.00,1,0,1,42085.58
9998,772,Germany,Male,42,3,75075.31,2,1,0,92888.52


# Printing the labels of our datasets

In [13]:
# Display the target Series (the Exited column).
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

# By observing the feature values, we see that Geography (country) and Gender are categorical columns in the dataset.
# Machine learning models require numeric inputs, so categorical variables and their values are not allowed as-is. We therefore need to encode this categorical data.

# Encoding Categorical Data

In [None]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [None]:
# Replace the two text columns with integer codes, using one encoder per
# column so each fitted mapping stays available afterwards:
#   Geography: France -> 0, Germany -> 1, Spain -> 2
#   Gender:    Female -> 0, Male -> 1
label_X_1, label_X_2 = LabelEncoder(), LabelEncoder()
X['Geography'] = label_X_1.fit_transform(X['Geography'])
X['Gender'] = label_X_2.fit_transform(X['Gender'])


# 0 stands for France, 1 stands for Germany and 2 stands for Spain
# 0 stands for Female and 1 stands for Male

In [19]:
# Display the feature frame after label encoding Geography and Gender.
X

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.00,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.80,3,1,0,113931.57
3,699,0,0,39,1,0.00,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...
9995,771,0,1,39,5,0.00,2,1,0,96270.64
9996,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,709,0,0,36,7,0.00,1,0,1,42085.58
9998,772,1,1,42,3,75075.31,2,1,0,92888.52


In [20]:
# France, Germany and Spain are now encoded as 0, 1 and 2. There is no ordering
# relationship between these countries, but integer codes imply one
# (mathematically Spain > Germany > France), so we need one-hot encoding instead.
# The Geography values form a 1-D array; check its shape first.
X.Geography.values.shape

(10000,)

In [None]:
# One-hot encode the integer country codes. The 1-D Geography values are
# reshaped into a single column for the encoder, and .toarray() converts the
# encoder's output into a dense NumPy array.
onehotencoder = OneHotEncoder()
ohe = onehotencoder.fit_transform(X['Geography'].to_numpy().reshape(-1, 1)).toarray()


In [None]:
# 0 stands for France, 1 stands for Germany and 2 stands for Spain
# 0 stands for Female and 1 stands for Male

In [23]:
# First five label-encoded rows, for comparison with the one-hot rows printed next.
X.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,619,0,0,42,2,0.0,1,1,1,101348.88
1,608,2,0,41,1,83807.86,1,0,1,112542.58
2,502,0,0,42,8,159660.8,3,1,0,113931.57
3,699,0,0,39,1,0.0,2,0,0,93826.63
4,850,2,0,43,2,125510.82,1,1,1,79084.1


In [24]:
# Print the first five one-hot rows as a quick visual check against the
# Geography codes. Slicing replaces the manual counter and break.
for row in ohe[:5]:
  print(row)

[1. 0. 0.]
[0. 0. 1.]
[1. 0. 0.]
[1. 0. 0.]
[0. 0. 1.]


In [None]:
# Wrap the one-hot matrix in a DataFrame.
# Column names are derived from the fitted encoders instead of being typed by
# hand: onehotencoder.categories_[0] holds the integer codes in column order,
# and label_X_1.inverse_transform maps them back to the country names
# (France, Germany, Spain for this data). This keeps labels and columns in
# sync if the categories ever change.
# The index is taken from X so the later column-wise concat aligns row for row.
encoded_df = pd.DataFrame(
    ohe,
    columns=label_X_1.inverse_transform(onehotencoder.categories_[0]),
    index=X.index,
)

In [26]:
# Display the one-hot encoded country columns.
encoded_df

Unnamed: 0,France,Germany,Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0
...,...,...,...
9995,1.0,0.0,0.0
9996,1.0,0.0,0.0
9997,1.0,0.0,0.0
9998,0.0,1.0,0.0


In [None]:
import pandas as pd


In [None]:
# Prepend the one-hot country columns to the feature frame.
# Copies of those columns already present in X (e.g. after re-running this
# cell) are dropped first, so the cell is idempotent instead of accumulating
# duplicate France/Germany/Spain columns on every run.
X = pd.concat(
    [encoded_df, X.drop(columns=encoded_df.columns, errors='ignore')],
    axis=1,
)

In [29]:
# Display the feature frame with the one-hot country columns in front.
X

Unnamed: 0,France,Germany,Spain,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,1.0,0.0,0.0,619,0,0,42,2,0.00,1,1,1,101348.88
1,0.0,0.0,1.0,608,2,0,41,1,83807.86,1,0,1,112542.58
2,1.0,0.0,0.0,502,0,0,42,8,159660.80,3,1,0,113931.57
3,1.0,0.0,0.0,699,0,0,39,1,0.00,2,0,0,93826.63
4,0.0,0.0,1.0,850,2,0,43,2,125510.82,1,1,1,79084.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,771,0,1,39,5,0.00,2,1,0,96270.64
9996,1.0,0.0,0.0,516,0,1,35,10,57369.61,1,1,1,101699.77
9997,1.0,0.0,0.0,709,0,0,36,7,0.00,1,0,1,42085.58
9998,0.0,1.0,0.0,772,1,1,42,3,75075.31,2,1,0,92888.52


In [None]:
# Removing one dummy feature (variable / column): with three one-hot columns, any one of them is implied by the other two.
# Dropping the label-encoded Geography column and one dummy-variable column, i.e. France.
# 

In [None]:
# Final feature frame: drop the France dummy column and the label-encoded
# Geography column, leaving Germany and Spain to represent the country.
preprocessed_dataframe = X.drop(columns=['France', 'Geography'])

In [32]:
# Preview the preprocessed features to confirm France and Geography are gone.
preprocessed_dataframe.head()

Unnamed: 0,Germany,Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88
1,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58
2,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57
3,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63
4,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1


In [None]:
# Convert the whole preprocessed frame (all rows, all columns) to a NumPy array.
trainable_data = preprocessed_dataframe.to_numpy()


In [34]:
# Inspect the first sample of the feature array.
trainable_data[0]

array([0.0000000e+00, 0.0000000e+00, 6.1900000e+02, 0.0000000e+00,
       4.2000000e+01, 2.0000000e+00, 0.0000000e+00, 1.0000000e+00,
       1.0000000e+00, 1.0000000e+00, 1.0134888e+05])

In [35]:
# Target labels as a NumPy array, taken from column 13 (Exited) of the raw data.
exited_column = dataset.iloc[:, 13]
trainable_labels = exited_column.to_numpy()
trainable_labels

array([1, 0, 1, ..., 1, 1, 0])

In [36]:
# Shape check for the feature array: (samples, features).
trainable_data.shape

(10000, 11)

In [None]:
# Splitting the dataset into Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples as a test set. random_state is fixed so the
# split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    trainable_data,
    trainable_labels,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Feature Scaling


In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Standardise the features.
sc = StandardScaler()
# Learn the scaling statistics from the training split only, then apply them.
X_train = sc.fit_transform(X_train)
# Reuse the training statistics on the test split. Refitting the scaler on
# X_test would leak test-set information and scale the two splits
# inconsistently, so only transform() is called here.
X_test = sc.transform(X_test)

In [None]:
# Now that the data preprocessing step is finished, we focus on building the architecture of the ANN

In [None]:
#Importing the Keras Libraries and Packages