In [None]:
# Importing necessary packages
import pandas as pd
import numpy as np

In [None]:
# Remote location of bank-full.csv (fields are separated by semicolons)
file_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter03/bank-full.csv'
# Read the remote file into a DataFrame
bankData = pd.read_csv(file_url, delimiter=";")

In [None]:
# Min-max normalisation of the 'balance' column
from sklearn import preprocessing
# Scaler object used for the normalisation
minmaxScaler = preprocessing.MinMaxScaler()
# Single-column float array holding the balance values
x = bankData[['balance']].values.astype(float)
# Fit minmaxScaler on the balance values and store the rescaled result
bankData['balanceTran'] = minmaxScaler.fit(x).transform(x)
# Show the first rows, including the new column
bankData.head()

In [None]:
# Shift every scaled balance up by a tiny constant to eliminate 0 values.
# NOTE: re-running this cell adds the constant again.
bankData['balanceTran'] = bankData['balanceTran'].add(0.00001)

In [None]:
# Weight derived from the 'loan' column:
# 5 where 'loan' is 'no' (no loan), 1 for every other value
bankData['loanTran'] = bankData['loan'].eq('no').map({True: 5, False: 1})
bankData.head()

In [None]:
# Weight derived from the 'housing' column:
# 1 where 'housing' is 'no', 5 for every other value
bankData['houseTran'] = bankData['housing'].eq('no').map({True: 1, False: 5})

bankData.head()

In [None]:
# Combine the three transformed variables into a single index by
# multiplying them row by row
bankData['assetIndex'] = (
    bankData['balanceTran']
    .mul(bankData['loanTran'])
    .mul(bankData['houseTran'])
)
bankData.head()

In [None]:
# 25th, 50th and 75th percentiles of assetIndex
np.quantile(
    bankData['assetIndex'],
    q=[0.25, 0.5, 0.75],
)

In [None]:
# Bucket assetIndex into four classes using hard-coded cut-offs
# (presumably the rounded output of the quantile cell above — confirm and
# update them if the data changes).
#
# Every row starts in 'Quant1'; the later assignments move rows upwards.
# Upper bounds are inclusive so that a value sitting exactly on a cut-off
# (0.57 or 1.9) is placed in the bucket below it instead of silently keeping
# the 'Quant1' default, which is what strict '<' / '>' on both sides caused.
#   Quant1: assetIndex <= 0.38
#   Quant2: 0.38 < assetIndex <= 0.57
#   Quant3: 0.57 < assetIndex <= 1.9
#   Quant4: assetIndex > 1.9
bankData['assetClass'] = 'Quant1'

bankData.loc[(bankData['assetIndex'] > 0.38) & (bankData['assetIndex'] <= 0.57), 'assetClass'] = 'Quant2'

bankData.loc[(bankData['assetIndex'] > 0.57) & (bankData['assetIndex'] <= 1.9), 'assetClass'] = 'Quant3'

bankData.loc[bankData['assetIndex'] > 1.9, 'assetClass'] = 'Quant4'

bankData.head()

In [None]:
# Number of rows in each asset class (count of 'y' per assetClass)
assetTot = (
    bankData.groupby('assetClass')['y']
    .count()
    .reset_index(name='assetTot')
)
# Number of rows for each (assetClass, y) combination
assetProp = (
    bankData.groupby(['assetClass', 'y'])['y']
    .count()
    .reset_index(name='assetCat')
)

In [None]:
# Attach the per-class totals to the per-(class, y) counts
assetComb = assetProp.merge(assetTot, on='assetClass')
# Share of each y category within its asset class, as a percentage
assetComb['catProp'] = assetComb['assetCat'].div(assetComb['assetTot']).mul(100)
assetComb

In [None]:
# Dummy-encode the categorical columns; 'loan' and 'housing' are left out
bankCat1 = pd.get_dummies(
    data=bankData.loc[
        :,
        ['job', 'marital', 'education', 'default', 'contact', 'month', 'poutcome'],
    ]
)

In [None]:
# Numerical columns used as model inputs.
# An explicit copy is taken because new columns are added to bankNum1 in a
# later cell; without it bankNum1 is a slice of bankData and those
# assignments raise pandas' SettingWithCopyWarning.
bankNum1 = bankData[['age','day','duration','campaign','pdays','previous','assetIndex']].copy()
bankNum1.head()

In [None]:
# Normalise some of the numerical variables
from sklearn import preprocessing

In [None]:
# Creating a fresh scaler object; this rebinds the name minmaxScaler that was
# used earlier for the balance column
minmaxScaler = preprocessing.MinMaxScaler()

In [None]:
# Single-column float arrays for the three columns that will be rescaled
ageT1, dayT1, durT1 = (
    bankNum1[[column]].values.astype(float)
    for column in ('age', 'day', 'duration')
)

In [None]:
# Normalising age, day and duration with minmaxScaler.
# The scaler is refitted on each array in turn, so after this cell it holds
# the fit for the duration values.
bankNum1['ageTran'] = minmaxScaler.fit(ageT1).transform(ageT1)
bankNum1['dayTran'] = minmaxScaler.fit(dayT1).transform(dayT1)
bankNum1['durTran'] = minmaxScaler.fit(durT1).transform(durT1)

In [None]:
# Numerical feature frame: the three rescaled columns replace their raw
# counterparts, the remaining numerical columns are kept unchanged
bankNum2 = bankNum1.loc[
    :,
    ['ageTran', 'dayTran', 'durTran', 'campaign', 'pdays', 'previous', 'assetIndex'],
]

# Show the first rows of the selection
bankNum2.head()

In [None]:
# Feature matrix: dummy-encoded categorical columns next to the numerical ones
X = pd.concat([bankCat1, bankNum2], axis='columns')
# Target: the 'y' column of the original data
Y = bankData.loc[:, 'y']
# Report the dimensions of both before previewing the features
print(X.shape)
print(Y.shape)
X.head()

In [None]:

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test set; the fixed random_state makes the
# split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)


In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with scikit-learn's default settings
bankModel = LogisticRegression()
# Train it on the training split
bankModel.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out rows
pred = bankModel.predict(X_test)
# Report the score of the model on the test split, to two decimals
print(
    'Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(
        bankModel.score(X_test, y_test)
    )
)

In [None]:
# Confusion matrix comparing the true test labels with the predictions
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report
# Text report of the per-class metrics for the test predictions
print(classification_report(y_true=y_test, y_pred=pred))