In [None]:
!pip install smote-variants

In [None]:
# Loading the necessary library files
import pandas as pd

In [None]:
# Location of the bank dataset (bank-full.csv) in the GitHub repository
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/bank-full.csv'

# The file is read with pandas; fields are separated by semicolons
bankData = pd.read_csv(
    filename,
    sep=';',
)
# Preview of the first rows
bankData.head()

In [None]:
from sklearn.preprocessing import RobustScaler

# One shared scaler instance; it is re-fitted for each column that gets scaled
rob_scaler = RobustScaler()

In [None]:
# Add a scaled version of each listed numeric column under a new name.
# The shared scaler is re-fitted for every column, in the order given,
# so after this cell it holds the fit for 'duration'.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later in the notebook.
scaledColumnFor = {'age': 'ageScaled', 'balance': 'balScaled', 'duration': 'durScaled'}
for rawName, scaledName in scaledColumnFor.items():
    # reshape the column into a single-column 2-D array before scaling
    columnAsMatrix = bankData[rawName].to_numpy().reshape(-1, 1)
    bankData[scaledName] = rob_scaler.fit_transform(columnAsMatrix)

In [None]:
# Remove the original (unscaled) columns from the frame
bankData = bankData.drop(columns=['age', 'balance', 'duration'])

In [None]:
# Converting all the categorical variables to dummy (one-hot) variables.
# dtype=int forces 0/1 integer indicators: pandas >= 2.0 defaults to bool, and a
# frame that mixes bool with numeric columns becomes an object-dtype array when
# it is converted to NumPy for the oversampler.
categoricalCols = ['job', 'marital', 'education', 'default', 'housing',
                   'loan', 'contact', 'month', 'poutcome']
bankCat = pd.get_dummies(bankData[categoricalCols], dtype=int)

In [None]:
# Separating the numerical columns (the scaled ones plus the remaining numeric fields)
numericCols = ['ageScaled', 'balScaled', 'day', 'durScaled', 'campaign', 'pdays', 'previous']
bankNum = bankData.loc[:, numericCols]
bankNum.shape

In [None]:
# X: dummy-encoded categorical columns placed side by side with the numeric columns
X = pd.concat([bankCat, bankNum], axis='columns')
# Y: the 'y' column of the bank data
Y = bankData['y']

# Report both shapes, then preview the feature matrix
print(X.shape)
print(Y.shape)
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as the test set; the seed is fixed at 123
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

In [None]:
# Class counts in the training labels before any oversampling
yesBefore = (y_train == 'yes').sum()
noBefore = (y_train == 'no').sum()
print("Before OverSampling count of yes: {}".format(yesBefore))
print("Before OverSampling count of no: {} \n".format(noBefore))

In [None]:
import smote_variants as sv
import numpy as np

In [None]:
# Instantiating the MSMOTE class.
# A fixed random_state makes the generated synthetic samples, and therefore every
# metric computed from them, reproducible across kernel restarts.
oversampler = sv.MSMOTE(random_state=123)

In [None]:

# Creating new (oversampled) training sets.
# The features are passed as an explicit float array so the sampler never receives
# an object-dtype array (which is what a frame mixing bool dummy columns with
# numeric columns converts to).
X_train_os, y_train_os = oversampler.sample(
    X_train.to_numpy(dtype=float),
    y_train.to_numpy(),
)
# Restore the column names so the model is fitted with the same feature names
# that X_test carries at prediction time (avoids scikit-learn's feature-name
# mismatch warning).
X_train_os = pd.DataFrame(X_train_os, columns=X_train.columns)

In [None]:
# Shapes of the oversampled training features and labels
featuresShape = X_train_os.shape
labelsShape = y_train_os.shape
print('After OverSampling, the shape of train_X: {}'.format(featuresShape))
print('After OverSampling, the shape of train_y: {} \n'.format(labelsShape))

# Class counts in the oversampled labels
yesAfter = np.count_nonzero(y_train_os == 'yes')
noAfter = np.count_nonzero(y_train_os == 'no')
print("After OverSampling, counts of label 'Yes': {}".format(yesAfter))
print("After OverSampling, counts of label 'no': {}".format(noAfter))

In [None]:
from sklearn.linear_model import LogisticRegression

# Create a logistic regression model with the library defaults and fit it
# on the oversampled training set
bankModel3 = LogisticRegression().fit(X_train_os, y_train_os)
# Show the fitted estimator as the cell output
bankModel3

In [None]:
# Predicting class labels for the test features (X_test) with the fitted model
pred = bankModel3.predict(X_test)

In [None]:
# Score of the fitted model on the held-out test set, printed to two decimals
testAccuracy = bankModel3.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set for MSMOTE balanced data set: {:.2f}'.format(testAccuracy))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of the true test labels against the predicted labels
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

# Text report of the per-class classification metrics
classificationSummary = classification_report(y_test, pred)
print(classificationSummary)