In [192]:
# The data is stored in the file UniversalBank.csv
# The customer features (or predictors) we will use as our X are:
# Age = Customer's age
# Experience = Number of years of professional experience
# Income = Annual income in units of 1000 euros
# Family size = Size of the customer's family
# CCAvg = Average spending on credit cards per month (in units of 1000 euros)
# Education = Undergraduate (1), Graduate (2), Advanced/professional (3)
# Mortgage = Value of house mortgage in units of 1000 euros
# Securities Account = 1 if customer has this type of account with the bank (else 0)
# CD Account = 1 if customer has this type of account with the bank (else 0)
# Online Banking = 1 if customer uses Internet banking facilities (else 0)
# Credit Card = 1 if customer has credit card issued by the bank (else 0)
#
# The target we want to predict (our y) is:
# Personal Loan = 1 if the customer has responded positively to the bank's e-mail drive (else 0)

In [193]:
# Let's import the data (using the pandas library) and have a look

In [194]:
import pandas as pd

In [195]:
# Read the raw customer records from the CSV file into a DataFrame.
bank_df = pd.read_csv(filepath_or_buffer='UniversalBank.csv')

In [196]:
# Preview the first five rows to sanity-check the load.
bank_df.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


In [197]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  5000 non-null   int64  
 1   Age                 5000 non-null   int64  
 2   Experience          5000 non-null   int64  
 3   Income              5000 non-null   int64  
 4   ZIP Code            5000 non-null   int64  
 5   Family              5000 non-null   int64  
 6   CCAvg               5000 non-null   float64
 7   Education           5000 non-null   int64  
 8   Mortgage            5000 non-null   int64  
 9   Personal Loan       5000 non-null   int64  
 10  Securities Account  5000 non-null   int64  
 11  CD Account          5000 non-null   int64  
 12  Online              5000 non-null   int64  
 13  CreditCard          5000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 547.0 KB


In [198]:
# The columns containing the ID and the postal ZIP code will not be used, so we drop them. Also, some
# column names (such as CD Account) contain a blank space, which will be replaced by '_'

In [199]:
# Discard the identifier and postal-code columns, then swap every blank in
# the remaining column names for an underscore.
bank_df = (
    bank_df
    .drop(columns=['ID', 'ZIP Code'])
    .rename(columns=lambda name: name.replace(' ', '_'))
)

In [200]:
# The Education variable is coded as an integer (1,2,3), but it has to be turned into a categorical variable
# This is done in two steps. First changing the data-type from integer to categorical, then creating new 
# dummy variables

In [201]:
# Treat Education as a labelled categorical instead of an ordinal integer.
new_categories = {1: 'Undergrad', 2: 'Graduate', 3: 'Advanced/Professional'}

# rename_categories returns a new Series, so assign the result back. Its
# `inplace` flag was deprecated in pandas 1.3 and removed in pandas 2.0.
bank_df['Education'] = (
    bank_df['Education']
    .astype('category')
    .cat.rename_categories(new_categories)
)

# One-hot encode the categorical column. drop_first=True omits the first level
# ('Undergrad'), leaving Education_Graduate and Education_Advanced/Professional.
# dtype='uint8' keeps the dummies as 0/1 integers; pandas >= 2.0 would otherwise
# emit boolean columns, while older versions already defaulted to uint8.
bank_df = pd.get_dummies(bank_df, prefix_sep='_', drop_first=True, dtype='uint8')

  res = method(*args, **kwargs)


In [202]:
# Preview the first five rows after dropping columns and one-hot encoding.
bank_df.head(n=5)

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Mortgage,Personal_Loan,Securities_Account,CD_Account,Online,CreditCard,Education_Graduate,Education_Advanced/Professional
0,25,1,49,4,1.6,0,0,1,0,0,0,0,0
1,45,19,34,3,1.5,0,0,1,0,0,0,0,0
2,39,15,11,1,1.0,0,0,0,0,0,0,0,0
3,35,9,100,1,2.7,0,0,0,0,0,0,1,0
4,35,8,45,4,1.0,0,0,0,0,0,1,1,0


In [203]:
# Separate the target vector from the predictor matrix.
target_column = 'Personal_Loan'
y = bank_df[target_column]
X = bank_df.loc[:, bank_df.columns != target_column]   # every column except the target

print('X shape: {}'.format(X.shape))
print('Y shape: {}'.format(y.shape))

X shape: (5000, 12)
Y shape: (5000,)


In [204]:
# Scale values

from sklearn import preprocessing

# Learn each feature's min/max from every row of X, then rescale X with them.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X)
scaled_X = scaler.transform(X)

In [205]:
# The data pre-processing steps are now ready. Next we must choose a ML model.

In [206]:
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split # Import train_test_split function

n_cols = X.shape[1]
print('Inputs: {}'.format(n_cols))

# Hold out 20% of the rows for testing (80% training / 20% test).
# The unscaled features are split first so that the scaler below only ever
# sees training rows. The same random_state selects the same rows as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1)
print(y_train.shape, y_test.shape)

# Fit the min-max scaling on the training rows only and reuse those statistics
# for the test rows. Fitting on the full dataset would leak the test set's
# minima/maxima into the training features.
split_scaler = preprocessing.MinMaxScaler()
X_train = split_scaler.fit_transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)

# Create network with 3 hidden layers, each with 16 nodes
model = Sequential()
model.add(Dense(16, activation='relu', input_shape=(n_cols,)))
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))  # One node in output layer with sigmoid activation function for binary classification

# Compile the model with the binary cross-entropy loss function
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model with training data. The held-out test rows are passed as
# validation data only to report per-epoch metrics.
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))


Inputs: 12
(4000,) (1000,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6e09c03c50>

In [207]:
# Score the trained network on the held-out test rows; return_dict=True
# yields the metrics keyed by name.
score = model.evaluate(x=X_test, y=y_test, return_dict=True)
test_accuracy = score['accuracy']
print('Accuracy: {}'.format(test_accuracy))

Accuracy: 0.9779999852180481


In [208]:
# Calculate and display confusion matrix

from sklearn.metrics import confusion_matrix

predictions = model.predict(X_test)

# Predictions are probabilities. Threshold them at 0.5 in one vectorised step
# (>= 0.5 -> 1, otherwise 0) and flatten to a 1-D label vector, rather than
# looping in Python and truth-testing one-element arrays.
binary_predictions = (predictions >= 0.5).astype(int).ravel()

# Compare binary predictions to actual results
cm = confusion_matrix(y_test, binary_predictions)
# Correct predictions lie on the diagonal; trace() sums them and, unlike
# indexing cm[1][1], still works if only one class is present.
accuracy = cm.trace() / len(y_test)
print(cm)
print(accuracy)


[[898   2]
 [ 20  80]]
0.978
