In [None]:
# Mount Google Drive at /content/drive; the data file path used later in this
# notebook points inside that mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Loading the necessary library files
import pandas as pd

In [None]:
# Read the bank data set from Google Drive.

# Adjust this path to wherever the CSV file is stored.
filename = '/content/drive/My Drive/Packt_Colab/bank-full.csv'

# The file is semicolon-delimited, so the separator is passed explicitly.
bankData = pd.read_csv(filename, sep=';')

# Preview the first rows.
bankData.head()

**Feature engineering steps**

Let us now do some feature engineering on the data. First we will scale the numerical columns, and then convert the categorical columns to
dummy variables.

In [None]:
from sklearn.preprocessing import RobustScaler

rob_scaler = RobustScaler()

# Raw numeric column -> name of the scaled column that replaces it.
scaledNames = {'age': 'ageScaled', 'balance': 'balScaled', 'duration': 'durScaled'}

# NOTE(review): the scaler is fit on the full data set, before the train/test
# split performed later in the notebook, so test rows influence the scaling
# statistics. Confirm whether that is acceptable for this analysis.
for rawCol, scaledCol in scaledNames.items():
    if rawCol not in bankData.columns:
        # Already converted by an earlier run of this cell; skipping keeps the
        # cell re-runnable instead of raising KeyError.
        continue
    # The scaler is refit for every column; it expects 2-D input, and the
    # (n, 1) result is flattened back to one value per row.
    bankData[scaledCol] = rob_scaler.fit_transform(
        bankData[rawCol].values.reshape(-1, 1)
    ).ravel()

# Drop the original columns without mutating in place; errors='ignore' makes a
# second run of the cell a no-op.
bankData = bankData.drop(columns=list(scaledNames), errors='ignore')

# Preview the transformed data.
bankData.head()

In [None]:
# One-hot encode every categorical feature into dummy columns.
categoricalCols = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
bankCat = pd.get_dummies(bankData[categoricalCols])

In [None]:
# Pull out the numerical features (the scaled columns plus the remaining numeric ones).
numericCols = [
    'ageScaled', 'balScaled', 'day', 'durScaled',
    'campaign', 'pdays', 'previous',
]
bankNum = bankData[numericCols]
bankNum.shape

In [None]:
# Feature matrix: dummy-encoded columns first, numerical columns after them.
X = pd.concat([bankCat, bankNum], axis='columns')
print(X.shape)

# Target vector: the 'y' column of the data frame.
Y = bankData['y']
print(Y.shape)

X.head()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

# Fit a logistic regression with default settings on the training portion.
# fit() returns the estimator itself, so it is displayed as the cell output.
bankModel = LogisticRegression()
bankModel.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out rows and report the mean accuracy.
pred = bankModel.predict(X_test)
testAccuracy = bankModel.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(testAccuracy))

# Confusion matrix of true labels against predictions.
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

# Per-class precision, recall and F1.
print(classification_report(y_test, pred))

In [None]:
# Share of each class among the training labels, as plain numbers.
# The mean of a boolean mask is the fraction of True entries, so this prints a
# scalar percentage rather than a one-row Series repr (index, name, dtype).
positivePct = (y_train == 'yes').mean() * 100
negativePct = (y_train == 'no').mean() * 100
print('Percentage of positive class :', positivePct)
print('Percentage of negative class :', negativePct)

**Undersampling Method.**

In the random undersampling method, we downsample the majority class to the same size as the minority class to make the data set balanced. Let us see how we can achieve that.

In this method we first count the minority cases and then undersample the majority cases to match that count.




In [None]:
from sklearn.model_selection import train_test_split

# Rebuild the 70/30 train/test partition (seed 123) as the starting point
# for undersampling.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)


In [None]:
# Put the training features and labels side by side in a single frame so
# that rows can be filtered by class.
trainData = pd.concat(objs=[X_train, y_train], axis='columns')
trainData.head()

In [None]:
labels = trainData['y']

# Minority class: row labels where the propensity is 'yes'.
ind = labels[labels == 'yes'].index
print(len(ind))
minData = trainData.loc[ind]
print(minData.shape)

# Majority class: row labels where the propensity is 'no'.
ind1 = labels[labels == 'no'].index
print(len(ind1))
majData = trainData.loc[ind1]
print(majData.shape)

majData.head()

In [None]:
# Draw as many majority rows as there are minority rows, so that the two
# classes end up the same size. The seed keeps the draw repeatable.
nMinority = len(ind)
majSample = majData.sample(n=nMinority, random_state=123)
print(majSample.shape)
majSample.head()



In [None]:
from sklearn.utils import shuffle

# Stack the minority rows and the sampled majority rows into one balanced frame.
balData = pd.concat([minData, majSample], axis=0)
print('balanced data set shape', balData.shape)

# Shuffle so the two classes are interleaved. The seed matches the one used
# for the split and the sampling, so the row order (and the model fit on it)
# is reproducible across runs.
balData = shuffle(balData, random_state=123)
balData.head()

In [None]:
# Build the balanced training features and labels.

# Take every column except the target, rather than slicing a hard-coded number
# of columns: the feature count depends on how many dummy columns the
# categorical encoding produced.
X_trainNew = balData.drop(columns=['y'])

y_trainNew = balData['y']
y_trainNew.head()

In [None]:

# Fit logistic regression on the undersampled (balanced) training data.
bankModel1 = LogisticRegression()
bankModel1.fit(X_trainNew, y_trainNew)

# Evaluate on the original, still imbalanced, test set.
pred = bankModel1.predict(X_test)
balancedAccuracy = bankModel1.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set for balanced data set: {:.2f}'.format(balancedAccuracy))



In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true classes, columns the predicted classes.
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix of true labels against the model's predictions.
confusionMatrix = confusion_matrix(y_true=y_test, y_pred=pred)
print(confusionMatrix)

# Per-class precision, recall and F1.
balancedReport = classification_report(y_test, pred)
print(balancedReport)

**Random Over Sampling**

Let us now try the oversampling method, using SMOTE to generate additional minority-class samples, and see what effect it has on the results.

In [None]:
!pip install smote-variants

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Class counts in the training labels before any resampling.
yesCount = (y_train == 'yes').sum()
noCount = (y_train == 'no').sum()
print("Before OverSampling count of yes: {}".format(yesCount))
print("Before OverSampling count of no: {} \n".format(noCount))

In [None]:
import smote_variants as sv
import numpy as np

# Instantiate the SMOTE oversampler. The seed makes the synthetic samples
# reproducible and matches the seed used for the train/test split.
oversampler = sv.SMOTE(random_state=0)

# Create the oversampled training set. The features are converted to a float
# array explicitly: a frame that mixes boolean dummy columns with float columns
# would otherwise become an object-dtype array.
X_train_us, y_train_us = oversampler.sample(
    X_train.to_numpy(dtype=float),
    np.array(y_train),
)

# Shapes after oversampling.
print('After OverSampling, the shape of train_X: {}'.format(X_train_us.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_us.shape))

# Class counts after oversampling.
print("After OverSampling, counts of label 'Yes': {}".format(sum(y_train_us=='yes')))
print("After OverSampling, counts of label 'no': {}".format(sum(y_train_us=='no')))


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit logistic regression on the SMOTE-balanced training set.
bankModel2 = LogisticRegression()
bankModel2.fit(X_train_us, y_train_us)

# Score the model on the test set.
pred = bankModel2.predict(X_test)
smoteAccuracy = bankModel2.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set for Smote balanced data set: {:.2f}'.format(smoteAccuracy))

# Confusion matrix of true labels against predictions.
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

# Per-class precision, recall and F1.
print(classification_report(y_test, pred))



**Activity 1**

Implementing MSMOTE

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 train/test partition with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0
)

# Class counts in the training labels before any resampling.
yesCount = (y_train == 'yes').sum()
noCount = (y_train == 'no').sum()
print("Before OverSampling count of yes: {}".format(yesCount))
print("Before OverSampling count of no: {} \n".format(noCount))

In [None]:
import smote_variants as sv
import numpy as np

# Instantiate the MSMOTE oversampler. The seed makes the synthetic samples
# reproducible and matches the seed used for the train/test split.
oversampler = sv.MSMOTE(random_state=0)

# Create the oversampled training set. The features are converted to a float
# array explicitly: a frame that mixes boolean dummy columns with float columns
# would otherwise become an object-dtype array.
X_train_us, y_train_us = oversampler.sample(
    X_train.to_numpy(dtype=float),
    np.array(y_train),
)

# Shapes after oversampling.
print('After OverSampling, the shape of train_X: {}'.format(X_train_us.shape))
print('After OverSampling, the shape of train_y: {} \n'.format(y_train_us.shape))

# Class counts after oversampling.
print("After OverSampling, counts of label 'Yes': {}".format(sum(y_train_us=='yes')))
print("After OverSampling, counts of label 'no': {}".format(sum(y_train_us=='no')))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Fit logistic regression on the MSMOTE-balanced training set.
# NOTE: this rebinds bankModel2, replacing any model previously stored under
# that name.
bankModel2 = LogisticRegression()
bankModel2.fit(X_train_us, y_train_us)

# Score the model on the test set. The message names MSMOTE because that is
# the oversampler this activity uses.
pred = bankModel2.predict(X_test)
msmoteAccuracy = bankModel2.score(X_test, y_test)
print('Accuracy of Logistic regression model prediction on test set for MSMOTE balanced data set: {:.2f}'.format(msmoteAccuracy))

# Confusion matrix of true labels against predictions.
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

# Per-class precision, recall and F1.
print(classification_report(y_test, pred))