In [117]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [118]:
# Load the churn dataset from a CSV file addressed relative to the working directory.
data=pd.read_csv("Churn_Modelling.csv")

In [119]:
# Preview the first five rows of the loaded frame.
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [120]:
# Preview the last five rows of the loaded frame.
data.tail(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9995,9996,15606229,Obijiaku,771,France,Male,39,5,0.0,2,1,0,96270.64,0
9996,9997,15569892,Johnstone,516,France,Male,35,10,57369.61,1,1,1,101699.77,0
9997,9998,15584532,Liu,709,France,Female,36,7,0.0,1,0,1,42085.58,1
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


In [121]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(10000, 14)

In [122]:
# List the column labels of the loaded frame.
data.columns

Index(['RowNumber', 'CustomerId', 'Surname', 'CreditScore', 'Geography',
       'Gender', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard',
       'IsActiveMember', 'EstimatedSalary', 'Exited'],
      dtype='object')

In [123]:
# Preprocessing
# Drop the irrelevant columns (RowNumber, CustomerId, Surname) before any
# further processing of the frame.

columns_to_drop = ['RowNumber', 'CustomerId', 'Surname']
data = data.drop(columns=columns_to_drop)

In [124]:
# Preview the frame after the column drop.
data.head(5)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [125]:
# Categorical and numerical features
# Walk the columns once: a column whose dtype compares equal to 'object' is
# recorded as categorical, every other column as numerical.
cat_features, num_features = [], []
for column_name, column_dtype in data.dtypes.items():
    if column_dtype == 'object':
        cat_features.append(column_name)
    else:
        num_features.append(column_name)

In [126]:
# Show both feature lists, one per line.
print(cat_features, num_features, sep='\n')

['Geography', 'Gender']
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary', 'Exited']


In [127]:
# Encoding of the categorical features

# Gender: replace the original labels with integer codes using a LabelEncoder.
# The fitted encoder stays available as `label_encoder_gender` (it is pickled
# later in the notebook).
# NOTE(review): this cell builds a fresh encoder and overwrites data['Gender']
# with the encoded values, so running it a second time would fit the encoder
# on the already-encoded integers instead of the original labels.

label_encoder_gender=LabelEncoder()
data['Gender']=label_encoder_gender.fit_transform(data['Gender'])

In [128]:
# Inspect the encoded column (in the recorded output the 'Female' rows show 0 and the 'Male' rows show 1).
data['Gender']

0       0
1       0
2       0
3       0
4       0
       ..
9995    1
9996    1
9997    0
9998    1
9999    0
Name: Gender, Length: 10000, dtype: int64

In [129]:
from sklearn.preprocessing import OneHotEncoder

In [130]:
# One-hot encoder for the Geography column, created with default settings.
# NOTE(review): with scikit-learn's defaults, transform() is expected to raise on a
# category that was not present during fit - confirm this is the wanted behaviour
# wherever the pickled encoder is reused.
onehotencoder_geo=OneHotEncoder()

In [131]:
# Fit the encoder on Geography and encode it in one step. The double brackets
# select a one-column DataFrame (2-D input) rather than a Series.
geo_encoder=onehotencoder_geo.fit_transform(data[['Geography']])

In [132]:
# Show the repr of the encoded result (a sparse matrix, per the recorded output).
geo_encoder

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [133]:
# Convert the sparse result to a dense array to look at the one-hot rows.
geo_encoder.toarray()

array([[1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.]])

In [134]:
# Names of the generated one-hot columns, built from the 'Geography' input name.
onehotencoder_geo.get_feature_names_out(['Geography'])

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [135]:
# Wrap the dense one-hot matrix in a DataFrame with readable column names.
# The frame reuses data's index: the later column-wise pd.concat aligns rows by
# index label, so sharing the index keeps every one-hot row attached to its
# source row even if data does not carry a default 0..n-1 index.
geo_encoder_df = pd.DataFrame(
    geo_encoder.toarray(),
    columns=onehotencoder_geo.get_feature_names_out(['Geography']),
    index=data.index,
)

In [136]:
# Check the dtype of the original Geography column, which is still present in data here.
data['Geography'].dtypes

dtype('O')

In [138]:
# Swap the raw Geography column for its one-hot columns: drop the original
# column, then append the encoded frame side by side.

data = pd.concat(
    (data.drop(columns=['Geography']), geo_encoder_df),
    axis='columns',
)

In [139]:
# Preview the frame after the one-hot columns were appended.
data.head()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.0,1,1,1,101348.88,1,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0,0.0,0.0,1.0
2,502,0,42,8,159660.8,3,1,0,113931.57,1,1.0,0.0,0.0
3,699,0,39,1,0.0,2,0,0,93826.63,0,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.1,0,0.0,0.0,1.0


In [140]:
# Per-column dtypes after encoding.
data.dtypes

CreditScore            int64
Gender                 int64
Age                    int64
Tenure                 int64
Balance              float64
NumOfProducts          int64
HasCrCard              int64
IsActiveMember         int64
EstimatedSalary      float64
Exited                 int64
Geography_France     float64
Geography_Germany    float64
Geography_Spain      float64
dtype: object

In [141]:
# Summary of the frame: index range, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CreditScore        10000 non-null  int64  
 1   Gender             10000 non-null  int64  
 2   Age                10000 non-null  int64  
 3   Tenure             10000 non-null  int64  
 4   Balance            10000 non-null  float64
 5   NumOfProducts      10000 non-null  int64  
 6   HasCrCard          10000 non-null  int64  
 7   IsActiveMember     10000 non-null  int64  
 8   EstimatedSalary    10000 non-null  float64
 9   Exited             10000 non-null  int64  
 10  Geography_France   10000 non-null  float64
 11  Geography_Germany  10000 non-null  float64
 12  Geography_Spain    10000 non-null  float64
dtypes: float64(5), int64(8)
memory usage: 1015.8 KB


In [142]:
# Count missing values per column.
data.isnull().sum()

CreditScore          0
Gender               0
Age                  0
Tenure               0
Balance              0
NumOfProducts        0
HasCrCard            0
IsActiveMember       0
EstimatedSalary      0
Exited               0
Geography_France     0
Geography_Germany    0
Geography_Spain      0
dtype: int64

In [145]:
# Count rows that are exact duplicates of an earlier row.
data.duplicated().sum()

np.int64(0)

In [149]:
# Descriptive statistics for every column of the encoded frame.
data.describe()

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Geography_France,Geography_Germany,Geography_Spain
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,650.5288,0.5457,38.9218,5.0128,76485.889288,1.5302,0.7055,0.5151,100090.239881,0.2037,0.5014,0.2509,0.2477
std,96.653299,0.497932,10.487806,2.892174,62397.405202,0.581654,0.45584,0.499797,57510.492818,0.402769,0.500023,0.433553,0.431698
min,350.0,0.0,18.0,0.0,0.0,1.0,0.0,0.0,11.58,0.0,0.0,0.0,0.0
25%,584.0,0.0,32.0,3.0,0.0,1.0,0.0,0.0,51002.11,0.0,0.0,0.0,0.0
50%,652.0,1.0,37.0,5.0,97198.54,1.0,1.0,1.0,100193.915,0.0,1.0,0.0,0.0
75%,718.0,1.0,44.0,7.0,127644.24,2.0,1.0,1.0,149388.2475,0.0,1.0,1.0,0.0
max,850.0,1.0,92.0,10.0,250898.09,4.0,1.0,1.0,199992.48,1.0,1.0,1.0,1.0


In [150]:
import pickle

In [151]:
# Serialize both fitted encoders with pickle, one file per encoder.
# Note: 'label_encoder_geo.pkl' stores the OneHotEncoder used for Geography,
# not a LabelEncoder; the file name is kept unchanged.
for pkl_name, fitted_encoder in (
    ('label_encoder_gender.pkl', label_encoder_gender),
    ('label_encoder_geo.pkl', onehotencoder_geo),
):
    with open(pkl_name, 'wb') as file:
        pickle.dump(fitted_encoder, file)

In [153]:
# Dividing the dataset into independent and dependent features:
# X keeps every column except the target, y is the 'Exited' target column.

X = data.drop(columns='Exited')
y = data['Exited']

In [154]:
# Display the feature frame.
X

Unnamed: 0,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Geography_France,Geography_Germany,Geography_Spain
0,619,0,42,2,0.00,1,1,1,101348.88,1.0,0.0,0.0
1,608,0,41,1,83807.86,1,0,1,112542.58,0.0,0.0,1.0
2,502,0,42,8,159660.80,3,1,0,113931.57,1.0,0.0,0.0
3,699,0,39,1,0.00,2,0,0,93826.63,1.0,0.0,0.0
4,850,0,43,2,125510.82,1,1,1,79084.10,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,771,1,39,5,0.00,2,1,0,96270.64,1.0,0.0,0.0
9996,516,1,35,10,57369.61,1,1,1,101699.77,1.0,0.0,0.0
9997,709,0,36,7,0.00,1,0,1,42085.58,1.0,0.0,0.0
9998,772,1,42,3,75075.31,2,1,0,92888.52,0.0,1.0,0.0


In [155]:
# Display the target column.
y

0       1
1       0
2       1
3       0
4       0
       ..
9995    0
9996    0
9997    1
9998    1
9999    0
Name: Exited, Length: 10000, dtype: int64

In [156]:
# Splitting the data into training and testing sets: 30% of the rows are held
# out for testing, and random_state is fixed so the split is reproducible.
# NOTE(review): the split is not stratified on y. The describe() output recorded
# earlier shows a mean of about 0.20 for Exited, so stratify=y may be worth
# considering - confirm against the modelling goals.

X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.3, random_state=23)

In [157]:
# Shapes of the training and testing feature sets.
X_train.shape, X_test.shape

((7000, 12), (3000, 12))

In [158]:
# Scaling the features
#
# The scaler is fitted on the training rows only and then applied to the test
# rows, so the test set does not influence the scaling parameters.
#
# The raw rows are re-selected from X through the index labels of y_train /
# y_test instead of being read from X_train / X_test, because those two names
# are overwritten below with the scaled arrays. On a first run this selects
# exactly the rows produced by the split; on a re-run it still fits on the raw
# features rather than on already-scaled values, which would otherwise
# silently change the scaler that is pickled afterwards.
# This relies on X having unique index labels.
scaler = StandardScaler()

X_train = scaler.fit_transform(X.loc[y_train.index])
X_test = scaler.transform(X.loc[y_test.index])

In [159]:
# Display the scaled training features (now an array rather than a DataFrame).
X_train

array([[ 2.02129476,  0.91660692,  2.09753592, ..., -0.99771689,
        -0.57559072,  1.72155741],
       [ 0.20507906,  0.91660692, -0.85430682, ..., -0.99771689,
        -0.57559072,  1.72155741],
       [ 0.52498069,  0.91660692,  0.09790052, ...,  1.00228833,
        -0.57559072, -0.58086939],
       ...,
       [-0.25929428,  0.91660692, -1.61607268, ...,  1.00228833,
        -0.57559072, -0.58086939],
       [ 1.18542276,  0.91660692, -0.18776168, ..., -0.99771689,
         1.73734559, -0.58086939],
       [ 0.4011478 ,  0.91660692,  3.24018472, ..., -0.99771689,
        -0.57559072,  1.72155741]])

In [160]:
# Display the scaled test features (now an array rather than a DataFrame).
X_test

array([[-0.98165279,  0.91660692, -0.75908608, ...,  1.00228833,
        -0.57559072, -0.58086939],
       [ 1.17510335, -1.0909802 ,  0.00267978, ..., -0.99771689,
         1.73734559, -0.58086939],
       [-1.90008005, -1.0909802 ,  1.62143225, ..., -0.99771689,
         1.73734559, -0.58086939],
       ...,
       [-1.42538731, -1.0909802 ,  0.09790052, ..., -0.99771689,
         1.73734559, -0.58086939],
       [ 0.76232706,  0.91660692, -0.56864462, ..., -0.99771689,
         1.73734559, -0.58086939],
       [-0.20769724,  0.91660692, -1.23518975, ...,  1.00228833,
        -0.57559072, -0.58086939]])

In [161]:
# Serialize the fitted scaler to 'scaling.pkl' with pickle; the with-block
# closes the file once the dump has finished.
with open('scaling.pkl', 'wb') as file:
    pickle.dump(scaler, file)