In [None]:
!pip install smote-variants

In [None]:
# Read the customer-churn dataset straight from the workshop's GitHub repository
import pandas as pd
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/churn.csv'

# A comma is read_csv's default separator, so it does not need to be spelled out
churnData = pd.read_csv(filename)
# Preview the first rows to confirm the file loaded
churnData.head()

**Feature engineering steps**

Let us now do some feature engineering on the data. First we will scale the numerical columns, and then we will convert the categorical columns to
dummy variables.

In [None]:
# Normalising data
from sklearn import preprocessing

minmaxScaler = preprocessing.MinMaxScaler()

# Source column -> name of the new column holding its min-max scaled values.
# The insertion order of this dict is the order in which the columns are added.
scaledColumnNames = {
    'accountlength': 'alScaled',
    'numbervmailmessages': 'nvmmScaled',
    'totaldayminutes': 'tdmScaled',
    'totaldaycalls': 'tdcScaled',
    'totaldaycharge': 'tdchScaled',
    'totaleveminutes': 'temScaled',
    'totalevecalls': 'tecScaled',
    'totalevecharge': 'techScaled',
    'totalnightminutes': 'tnmScaled',
    'totalnightcalls': 'tncScaled',
    'totalnightcharge': 'tnchScaled',
    'totalintlminutes': 'timScaled',
    'totalintlcalls': 'ticScaled',
    'totalintlcharge': 'tichScaled',
    'numbercustomerservicecalls': 'ncscScaled',
}

# Converting each of the columns to its scaled version.
# fit_transform re-fits the scaler on every call, so each column is scaled
# independently; the reshape turns the 1-D column into the 2-D (n, 1) input
# that the scaler expects.
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split further down - consider fitting on the training rows only.
for sourceCol, scaledCol in scaledColumnNames.items():
    churnData[scaledCol] = minmaxScaler.fit_transform(churnData[sourceCol].values.reshape(-1, 1))




In [None]:
# Remove the unscaled source columns; their scaled counterparts stay in the frame
unscaledColumns = [
    'accountlength',
    'numbervmailmessages',
    'totaldayminutes',
    'totaldaycalls',
    'totaldaycharge',
    'totaleveminutes',
    'totalevecalls',
    'totalevecharge',
    'totalnightminutes',
    'totalnightcalls',
    'totalnightcharge',
    'totalintlminutes',
    'totalintlcalls',
    'totalintlcharge',
    'numbercustomerservicecalls',
]
churnData = churnData.drop(columns=unscaledColumns)

# Show the first rows of the reduced frame
churnData.head()

In [None]:
# Converting all the categorical variables to dummy variables.
# dtype is pinned to an integer type because pandas >= 2.0 returns boolean
# dummies by default; mixed with the float feature columns, a later
# np.array(X_train) would then produce an object-dtype array instead of a
# numeric one. 'uint8' matches the default of older pandas releases.
churnCat = pd.get_dummies(churnData[['internationalplan','voicemailplan']], dtype='uint8')

In [None]:
# Separating the scaled numerical columns into their own frame
scaledColumns = [
    'alScaled', 'nvmmScaled', 'tdmScaled', 'tdcScaled', 'tdchScaled',
    'temScaled', 'tecScaled', 'techScaled', 'tnmScaled', 'tncScaled',
    'tnchScaled', 'timScaled', 'ticScaled', 'tichScaled', 'ncscScaled',
]
churnNum = churnData[scaledColumns]
churnNum.shape

In [None]:
# Feature matrix: dummy-encoded columns first, scaled numerical columns after them
featureParts = [churnCat, churnNum]
X = pd.concat(featureParts, axis='columns')
print(X.shape)
# Target vector: the churn label
Y = churnData.loc[:, 'churn']
print(Y.shape)
X.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on Y - consider stratify=Y if the
# class ratio should be preserved in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=123,
)

**Undersampling Method.**






In [None]:
# Put the training features and the training labels side by side in one frame
# so that rows can be filtered by class in the following cells

trainParts = [X_train, y_train]
trainData = pd.concat(trainParts, axis='columns')
trainData.head()

In [None]:
# Index labels of the training rows whose churn value is 'Yes'
churnMask = trainData['churn'] == 'Yes'
ind = trainData.loc[churnMask].index
print(len(ind))

In [None]:
# Separate the minority-class rows (churn == 'Yes') using the labels found above
minData = trainData.loc[ind, :]
print(minData.shape)

In [None]:
# Index labels of the training rows whose churn value is 'No' (the majority class)
stayMask = trainData['churn'] == 'No'
ind1 = trainData.loc[stayMask].index
print(len(ind1))

In [None]:
# Separating the majority-class rows (churn == 'No')
majData = trainData.loc[ind1, :]
print(majData.shape)
majData.head()

In [None]:
# Undersample: draw as many majority rows as there are minority rows,
# so that both classes end up the same size. The seed fixes which rows are drawn.

minorityCount = len(ind)
majSample = majData.sample(n=minorityCount, random_state=123)
print(majSample.shape)
majSample.head()



In [None]:
# Stack the minority rows on top of the sampled majority rows

balancedParts = [minData, majSample]
balData = pd.concat(balancedParts, axis='index')
print('balanced data set shape',balData.shape)

In [None]:
# Shuffling the data set so the two classes are interleaved rather than stacked

from sklearn.utils import shuffle

# A fixed seed (the same one used for the split and the sampling) keeps the
# row order - and therefore the fitted model - reproducible across runs.
balData = shuffle(balData, random_state=123)


In [None]:
# Making the new X_train: every column of the balanced frame except the target.
# Dropping 'churn' by name avoids hard-coding the number of feature columns,
# so the cell stays correct if features are added or removed upstream.

X_trainNew = balData.drop(columns=['churn'])
X_trainNew.shape

In [None]:
# The new y_train is the churn column of the balanced, shuffled frame
y_trainNew = balData.loc[:, 'churn']
y_trainNew.shape

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression (default settings) trained on the randomly undersampled data
churnModel1 = LogisticRegression()
churnModel1.fit(X=X_trainNew, y=y_trainNew)

**SMOTE**



In [None]:
import smote_variants as sv
import numpy as np

# Instantiating the SMOTE class.
# SMOTE generates synthetic samples at random; seeding it makes the
# oversampled training set (and the model fitted on it) reproducible.
oversampler = sv.SMOTE(random_state=123)

In [None]:
# Creating new training set.
# The features are converted to an explicit float array: a frame that mixes
# boolean dummy columns with float columns would otherwise convert to an
# object-dtype array.

X_train_smote, y_train_smote = oversampler.sample(X_train.to_numpy(dtype=float), y_train.to_numpy())


In [None]:
# Logistic regression (default settings) trained on the SMOTE-oversampled data
churnModel2 = LogisticRegression()
churnModel2.fit(X=X_train_smote, y=y_train_smote)

**MSMOTE**



In [None]:
import smote_variants as sv
import numpy as np
# Instantiating the MSMOTE class.
# Seeded so that the synthetic samples are the same on every run.
oversampler = sv.MSMOTE(random_state=123)

In [None]:
# Creating new training sets.
# The features are converted to an explicit float array: a frame that mixes
# boolean dummy columns with float columns would otherwise convert to an
# object-dtype array.
X_train_msmote, y_train_msmote = oversampler.sample(X_train.to_numpy(dtype=float), y_train.to_numpy())

In [None]:
# Logistic regression (default settings) trained on the MSMOTE-oversampled data
churnModel3 = LogisticRegression()
churnModel3.fit(X=X_train_msmote, y=y_train_msmote)

**Predictions with all three models**

Let us create separate predictions with each model

In [None]:
# The SMOTE and MSMOTE models were fitted on plain NumPy arrays, so they are
# given an array at prediction time as well; passing the DataFrame would make
# scikit-learn warn about feature names that were absent during fitting.
X_test_array = X_test.to_numpy(dtype=float)

# Predicting using Under sampler (this model was fitted on a DataFrame)
pred_us = churnModel1.predict(X_test)

# Predicting using SMOTE
pred_smote = churnModel2.predict(X_test_array)

# Predicting using MSMOTE
pred_msmote = churnModel3.predict(X_test_array)


In [None]:
# Printing accuracy of each model on the held-out test set.
# NOTE(review): accuracy can be misleading when the classes are imbalanced;
# read it together with the per-class metrics printed further down.

# The SMOTE and MSMOTE models were fitted on NumPy arrays, so they are scored
# on an array; the undersampling model was fitted on a DataFrame.
X_test_array = X_test.to_numpy(dtype=float)

accuracyRuns = [
    ('Random Undersampled', churnModel1, X_test),
    ('SMOTE', churnModel2, X_test_array),
    ('MSMOTE', churnModel3, X_test_array),
]

for runLabel, runModel, runFeatures in accuracyRuns:
    print('Accuracy of Logistic regression model prediction on test set for {} data set: {:.2f}'.format(runLabel, runModel.score(runFeatures, y_test)))

In [None]:
# Confusion Matrix & Classification reports for the model
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Random undersampling model: confusion matrix followed by the classification report
print(confusion_matrix(y_true=y_test, y_pred=pred_us))

print(classification_report(y_true=y_test, y_pred=pred_us))

In [None]:
# SMOTE model: confusion matrix followed by the classification report
print(confusion_matrix(y_true=y_test, y_pred=pred_smote))

print(classification_report(y_true=y_test, y_pred=pred_smote))

In [None]:
# MSMOTE model: confusion matrix followed by the classification report
print(confusion_matrix(y_true=y_test, y_pred=pred_msmote))

print(classification_report(y_true=y_test, y_pred=pred_msmote))