In [None]:
# Loading the necessary library files
import pandas as pd

import warnings
# NOTE(review): no category or module is given, so this suppresses every
# warning raised after this point (until the filters are changed), including
# any emitted by pandas or scikit-learn in the cells below.
warnings.filterwarnings("ignore")

In [None]:
# Read the raw bank data from disk.

# Adjust this path to wherever the CSV file is stored on your machine.
filename = '../Dataset/bank-full.csv'

# The file is semicolon-delimited, so the separator is given explicitly.
bankData = pd.read_csv(filepath_or_buffer=filename, delimiter=";")

# Preview the first five rows.
bankData.head(n=5)

In [None]:
from sklearn.preprocessing import RobustScaler

# One RobustScaler created with default settings; the same instance is reused
# (and re-fitted) for each column that gets scaled later in the notebook.
rob_scaler = RobustScaler()

In [None]:
# Add a robust-scaled copy of age, balance and duration to the frame.
# The shared scaler is re-fitted for every column, so once this cell has run
# it only holds the statistics of the last column processed (duration).
# NOTE(review): the scaler is fitted on the complete data set, i.e. before the
# train/test split performed further down -- confirm this is intended.

for rawCol, scaledCol in (('age', 'ageScaled'),
                          ('balance', 'balScaled'),
                          ('duration', 'durScaled')):
    # The scaler expects a 2-D input, hence the reshape to a single column.
    column2d = bankData[rawCol].values.reshape(-1, 1)
    bankData[scaledCol] = rob_scaler.fit_transform(column2d)

In [None]:
# Remove the unscaled age, balance and duration columns from the frame in place.
# Because the frame is mutated, running this cell a second time raises a
# KeyError (the columns are already gone).

bankData.drop(columns=['age', 'balance', 'duration'], inplace=True)

In [None]:
# One-hot encode every categorical feature into dummy columns
categoricalCols = ['job', 'marital', 'education', 'default', 'housing',
                   'loan', 'contact', 'month', 'poutcome']
bankCat = pd.get_dummies(bankData[categoricalCols])

In [None]:
# Collect the numeric features (scaled and unscaled) in a frame of their own
numericCols = ['ageScaled', 'balScaled', 'day', 'durScaled',
               'campaign', 'pdays', 'previous']
bankNum = bankData[numericCols]
bankNum.shape

In [None]:
# Assemble the model inputs.
# Feature matrix: the dummy-encoded columns first, the numeric columns after them.
featureFrames = [bankCat, bankNum]
X = pd.concat(featureFrames, axis='columns')
print(X.shape)
# Target: the 'y' column of the bank data.
Y = bankData.loc[:, 'y']
print(Y.shape)
X.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split
# Splitting the data into train and test sets
# 30% of the rows are held out for testing; random_state pins the split so it
# is the same on every run.
# NOTE(review): no stratify argument is passed, so the yes/no ratio is not
# guaranteed to match between the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [None]:
# Put the training features and their labels side by side in one frame so the
# rows can be filtered by class in the cells that follow.

trainData = pd.concat(objs=[X_train, y_train], axis='columns')
trainData.head(n=5)

In [None]:
# Index labels of the training rows whose target is 'yes'
isYes = trainData['y'] == 'yes'
ind = trainData.loc[isYes].index
print(len(ind))

In [None]:
# Separate the minority class: every training row labelled 'yes'
minData = trainData.loc[ind, :]
print(minData.shape)

In [None]:
# Index labels of the training rows whose target is 'no' (the majority class)
isNo = trainData['y'] == 'no'
ind1 = trainData.loc[isNo].index
print(len(ind1))

In [None]:
# Separate the majority class: every training row labelled 'no'
majData = trainData.loc[ind1, :]
print(majData.shape)
majData.head(n=5)

In [None]:
# Down-sample the majority class to the number of minority rows so that both
# classes are equally represented; the fixed seed keeps the draw repeatable.

minorityCount = len(ind)
majSample = majData.sample(n=minorityCount, random_state=123)

In [None]:

# Sanity check: dimensions and first rows of the down-sampled majority class.
print(majSample.shape)
majSample.head()



In [None]:
# Stack the minority rows on top of the sampled majority rows. At this point
# the frame is still ordered by class; shuffling happens in a later cell.

balData = pd.concat(objs=[minData, majSample], axis='index')
print('balanced data set shape', balData.shape)

In [None]:
# Shuffling the data set so that the 'yes' and 'no' rows are interleaved
# instead of sitting in two contiguous halves.

from sklearn.utils import shuffle

# The seed is fixed, matching the train/test split and the majority-class
# sampling earlier in the notebook, so a fresh "Restart & Run All" reproduces
# the same row order (and therefore the same displayed output).
balData = shuffle(balData, random_state=123)
balData.head()

In [None]:
# Making the new X_train and y_train

# Select the features by dropping the target column by name rather than
# slicing the first 51 columns by position: a positional slice silently picks
# the wrong columns (or leaks the target) as soon as the number of dummy or
# numeric columns changes.
X_trainNew = balData.drop(columns=['y'])
print(X_trainNew.head())

# The target is the 'y' column of the same, already shuffled, frame, so the
# rows stay aligned with X_trainNew.
y_trainNew = balData['y']
print(y_trainNew.head())

In [None]:
from sklearn.linear_model import LogisticRegression
# Build a logistic regression with default settings and train it on the
# balanced training data; fit() returns the estimator itself.
bankModel1 = LogisticRegression().fit(X_trainNew, y_trainNew)
# Display the fitted estimator as the cell output.
bankModel1

In [None]:
# Class predictions for the held-out test rows
pred = bankModel1.predict(X_test)

# Score the model on the same test rows and report it with two decimals
testAccuracy = bankModel1.score(X_test, y_test)
accuracyMessage = 'Accuracy of Logistic regression model prediction on test set for balanced data set: {:.2f}'
print(accuracyMessage.format(testAccuracy))



In [None]:
# Evaluate the test-set predictions: confusion matrix first, then the
# classification report.
from sklearn.metrics import classification_report, confusion_matrix

confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

classReport = classification_report(y_true=y_test, y_pred=pred)
print(classReport)