In [182]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import pickle

In [183]:
data = pd.read_csv('Churn_Modelling.csv')
data.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


### Output feature will be salary

In [184]:
## Drop the columns that are not required
data = data.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)
data.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


In [185]:
## Checking the dataset data
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      10000 non-null  int64  
 1   Geography        10000 non-null  object 
 2   Gender           10000 non-null  object 
 3   Age              10000 non-null  int64  
 4   Tenure           10000 non-null  int64  
 5   Balance          10000 non-null  float64
 6   NumOfProducts    10000 non-null  int64  
 7   HasCrCard        10000 non-null  int64  
 8   IsActiveMember   10000 non-null  int64  
 9   EstimatedSalary  10000 non-null  float64
 10  Exited           10000 non-null  int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 859.5+ KB


In [186]:
## Checking for null values
data.isnull().sum()

CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [187]:
## Checkcing values for Gender
data['Gender'].value_counts()

Gender
Male      5457
Female    4543
Name: count, dtype: int64

In [188]:
## Label Encoding for Gender
gender_label_encoder = LabelEncoder()
data['Gender'] = gender_label_encoder.fit_transform(data['Gender'])

In [189]:
data['Gender'].value_counts()

Gender
1    5457
0    4543
Name: count, dtype: int64

In [190]:
## Checking values for Geography
data['Geography'].value_counts()

Geography
France     5014
Germany    2509
Spain      2477
Name: count, dtype: int64

In [191]:
## One Hot Encoding for Geography
geography_onehot_encoder = OneHotEncoder()
geo_encoded = geography_onehot_encoder.fit_transform(data[['Geography']])
geo_encoded

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10000 stored elements and shape (10000, 3)>

In [192]:
geography_onehot_encoder.get_feature_names_out()

array(['Geography_France', 'Geography_Germany', 'Geography_Spain'],
      dtype=object)

In [193]:
geo_encoded_df = pd.DataFrame(geo_encoded.toarray(), columns=geography_onehot_encoder.get_feature_names_out())
geo_encoded_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain
0,1.0,0.0,0.0
1,0.0,0.0,1.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,1.0


In [194]:
## Merging geography encoded data with original data and droping the original Geography column

data_df = pd.concat([geo_encoded_df, data], axis=1)

data_df = data_df.drop(['Geography'], axis=1)

data_df.head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1.0,0.0,0.0,619,0,42,2,0.0,1,1,1,101348.88,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,112542.58,0
2,1.0,0.0,0.0,502,0,42,8,159660.8,3,1,0,113931.57,1
3,1.0,0.0,0.0,699,0,39,1,0.0,2,0,0,93826.63,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,79084.1,0


In [195]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Geography_France   10000 non-null  float64
 1   Geography_Germany  10000 non-null  float64
 2   Geography_Spain    10000 non-null  float64
 3   CreditScore        10000 non-null  int64  
 4   Gender             10000 non-null  int64  
 5   Age                10000 non-null  int64  
 6   Tenure             10000 non-null  int64  
 7   Balance            10000 non-null  float64
 8   NumOfProducts      10000 non-null  int64  
 9   HasCrCard          10000 non-null  int64  
 10  IsActiveMember     10000 non-null  int64  
 11  EstimatedSalary    10000 non-null  float64
 12  Exited             10000 non-null  int64  
dtypes: float64(5), int64(8)
memory usage: 1015.8 KB


In [196]:
## Exporting the label and the one hot encoder
with open('label_encoder_gender_reg.pkl', 'wb') as file:
    pickle.dump(gender_label_encoder, file)


with open('onehot_encoder_geo_reg.pkl', 'wb') as file:
    pickle.dump(geography_onehot_encoder, file)

In [197]:
## Splitting the data into dependent and independent features
X = data_df.drop(['EstimatedSalary'], axis=1)
y = data_df['EstimatedSalary']

In [198]:
X

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,1.0,0.0,0.0,619,0,42,2,0.00,1,1,1,1
1,0.0,0.0,1.0,608,0,41,1,83807.86,1,0,1,0
2,1.0,0.0,0.0,502,0,42,8,159660.80,3,1,0,1
3,1.0,0.0,0.0,699,0,39,1,0.00,2,0,0,0
4,0.0,0.0,1.0,850,0,43,2,125510.82,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1.0,0.0,0.0,771,1,39,5,0.00,2,1,0,0
9996,1.0,0.0,0.0,516,1,35,10,57369.61,1,1,1,0
9997,1.0,0.0,0.0,709,0,36,7,0.00,1,0,1,1
9998,0.0,1.0,0.0,772,1,42,3,75075.31,2,1,0,1


In [199]:
y

0       101348.88
1       112542.58
2       113931.57
3        93826.63
4        79084.10
          ...    
9995     96270.64
9996    101699.77
9997     42085.58
9998     92888.52
9999     38190.78
Name: EstimatedSalary, Length: 10000, dtype: float64

In [200]:
## Splitting training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [201]:
## Scaling the data
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
pd.DataFrame(X_train, columns=X.columns).head()

Unnamed: 0,Geography_France,Geography_Germany,Geography_Spain,CreditScore,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,Exited
0,1.001501,-0.579467,-0.576388,0.3565,0.913248,-0.655786,0.34568,-1.218471,0.808436,0.649203,0.974817,-0.50858
1,-0.998501,1.725723,-0.576388,-0.203898,0.913248,0.294938,-0.348369,0.696838,0.808436,0.649203,0.974817,-0.50858
2,-0.998501,-0.579467,1.734942,-0.961472,0.913248,-1.416365,-0.695393,0.618629,-0.916688,0.649203,-1.025834,1.96626
3,1.001501,-0.579467,-0.576388,-0.940717,-1.094993,-1.131148,1.386753,0.953212,-0.916688,0.649203,-1.025834,1.96626
4,1.001501,-0.579467,-0.576388,-1.397337,0.913248,1.625953,1.386753,1.057449,-0.916688,-1.540351,-1.025834,1.96626


In [202]:
## Exporting the scaler
with open('scaler_reg.pkl', 'wb') as file:
    pickle.dump(scaler, file)

## ANN Regresion Implementation

In [203]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

In [204]:
## Build or ANN Model
model = Sequential(
    [
        # First hidden layer, input shape is the number of features
        Input(shape=(X_train.shape[1],)),
        Dense(units=64, activation='relu'), ## Hidden layer 1 connected to input layer
        Dense(units=32, activation='relu'), # Hidden layer 2
        # No activation function for output layer as it is a regression problem
        Dense(1), # Output layer
    ]
)

In [205]:
## Compile the model
model.compile(optimizer='adam', loss='mean_absolute_error', metrics=['mae'])

In [206]:
## Summary of the model
model.summary()

In [207]:
## Set up the Tensorboard
import datetime
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

log_dir = 'regresion-logs/fit/' + datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [208]:
## Set up Early Stopping
## Stops the training when a monitored metric has stopped improving
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

In [209]:
### Training the model
history = model.fit(
    X_train, y_train, validation_data=(X_test, y_test), epochs=100,
    callbacks=[tensorflow_callback, early_stopping]
)

Epoch 1/100


[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 100691.0859 - mae: 100691.0859 - val_loss: 98567.4219 - val_mae: 98567.4219
Epoch 2/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 101356.7969 - mae: 101356.7969 - val_loss: 97300.6953 - val_mae: 97300.6953
Epoch 3/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 98871.8750 - mae: 98871.8750 - val_loss: 93994.7031 - val_mae: 93994.7031
Epoch 4/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 95275.0625 - mae: 95275.0625 - val_loss: 88351.9141 - val_mae: 88351.9141
Epoch 5/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 88466.3203 - mae: 88466.3203 - val_loss: 80839.5781 - val_mae: 80839.5781
Epoch 6/100
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 80452.6953 - mae: 80452.6953 - val_loss: 72587.3594 - val_mae: 72587

In [210]:
## Load Tensorboard Extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [211]:
%tensorboard --logdir regresion-logs/fit/

Reusing TensorBoard on port 6006 (pid 10861), started 0:01:37 ago. (Use '!kill 10861' to kill it.)

In [212]:
## Evalaute model on the test data
test_loss, test_mae = model.evaluate(X_test, y_test)
print(f'Test Loss: {test_loss}')
print(f'Test MAE: {test_mae}')

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 51155.0430 - mae: 51155.0430
Test Loss: 50212.328125
Test MAE: 50212.328125


In [213]:
model.save('regresssion.h5')

