In [4]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix,classification_report, roc_curve, roc_auc_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

In [5]:
# eliminate the future warnings on the following cells
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.simplefilter(action='ignore', category=[FutureWarning, DataConversionWarning])

In [6]:
df = pd.read_csv('bank-full.csv', header = 0, sep = ';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


# preprocessing / cleaning

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 4.1+ MB


In [8]:
# checking the values in these columns to decide on class or dummy variable creation
df.marital.unique(), df.education.unique(), df.poutcome.unique()

(array(['married', 'single', 'divorced'], dtype=object),
 array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object),
 array(['unknown', 'failure', 'other', 'success'], dtype=object))

<div class = "alert alert-warning">
    Using all of the columns in the dataset to explore how to get_dummies, and reduce features that do not correlate
    </div>

In [9]:
df_copy = df.copy()
# change strings into binary
df_copy.y.replace(('yes', 'no'), (1, 0), inplace=True)
df_copy.default.replace(('yes', 'no'), (1, 0), inplace=True)
df_copy.loan.replace(('yes', 'no'), (1, 0), inplace=True)
df_copy.housing.replace(('yes', 'no'), (1, 0), inplace=True)


In [10]:
# df using all dummy variables
all_dummydf = pd.get_dummies(data = df_copy, 
                             columns = ['job',
                                        'marital',
                                        'education',
                                        'contact',
                                        'day',
                                        'month',
                                        'poutcome'], 
                             dtype = 'int64')
all_dummydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 79 columns):
age                    45211 non-null int64
default                45211 non-null int64
balance                45211 non-null int64
housing                45211 non-null int64
loan                   45211 non-null int64
duration               45211 non-null int64
campaign               45211 non-null int64
pdays                  45211 non-null int64
previous               45211 non-null int64
y                      45211 non-null int64
job_admin.             45211 non-null int64
job_blue-collar        45211 non-null int64
job_entrepreneur       45211 non-null int64
job_housemaid          45211 non-null int64
job_management         45211 non-null int64
job_retired            45211 non-null int64
job_self-employed      45211 non-null int64
job_services           45211 non-null int64
job_student            45211 non-null int64
job_technician         45211 non-null int64
job_unemplo

<div class = "alert alert-warning">
    Using some of the columns in the dataset to explore how to use one hot. The column of day, month and contact are dropped
    </div>

In [11]:
some_df = df.copy()
# change strings into binary
some_df.y.replace(('yes', 'no'), (1, 0), inplace=True)
some_df.default.replace(('yes', 'no'), (1, 0), inplace=True)
some_df.loan.replace(('yes', 'no'), (1, 0), inplace=True)
some_df.housing.replace(('yes', 'no'), (1, 0), inplace=True)
some_df.marital.replace(('married', 'divorced','single'), (1,0,0), inplace = True)
some_df.poutcome.replace(('success', 'other', 'unknown', 'failure'), (1,0,0,0), inplace = True)


In [12]:
# df using some dummy variables
dummydf = pd.get_dummies(data = some_df, 
                    prefix = 'dum', columns = ['job','education'], dtype = 'int64'
  )
# drop columns day, month and contact as not relevant/ correlated
dummydf.drop(['day', 'month', 'contact'], inplace = True, axis = 1)
dummydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 28 columns):
age                  45211 non-null int64
marital              45211 non-null int64
default              45211 non-null int64
balance              45211 non-null int64
housing              45211 non-null int64
loan                 45211 non-null int64
duration             45211 non-null int64
campaign             45211 non-null int64
pdays                45211 non-null int64
previous             45211 non-null int64
poutcome             45211 non-null int64
y                    45211 non-null int64
dum_admin.           45211 non-null int64
dum_blue-collar      45211 non-null int64
dum_entrepreneur     45211 non-null int64
dum_housemaid        45211 non-null int64
dum_management       45211 non-null int64
dum_retired          45211 non-null int64
dum_self-employed    45211 non-null int64
dum_services         45211 non-null int64
dum_student          45211 non-null int64
dum_techn

# ML all_dummydf/ SMOTE test #1

In [13]:
# obtain dependent and independent variables
Xall = all_dummydf.loc[:, all_dummydf.columns != 'y']
yall = all_dummydf.loc[:, all_dummydf.columns == 'y']

In [14]:
# checking the shape due to warnings 
Xall.shape, yall.shape

((45211, 78), (45211, 1))

In [15]:
smote = SMOTE(random_state = 45)
X_train1, X_test1, y_train1, y_test1 = train_test_split(Xall, yall, test_size = 0.3, random_state = 123)
# fit smote on training data
balanced_X1, balanced_y1 = smote.fit_sample(X_train1, y_train1)
# smote outputs numpy array therefore transformed to df
balanced_X1 = pd.DataFrame(data=balanced_X1, columns= X_train1.columns )
balanced_y1 = pd.DataFrame(data = balanced_y1,columns=['y'])


TypeError: issubclass() arg 2 must be a class or tuple of classes

In [16]:
# we can Check the numbers of our data
print("length of oversampled data is ",len(balanced_X1))
print("Number of no subscriptions in oversampled data",len(balanced_y1[balanced_y1['y']==0]))
print("Number of subscriptions",len(balanced_y1[balanced_y1['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(balanced_y1[balanced_y1['y']==0])/len(balanced_X1))
print("Proportion of subscription data in oversampled data is ",len(balanced_y1[balanced_y1['y']==1])/len(balanced_X1))

NameError: name 'balanced_X1' is not defined

# This is the cell with the error, the cell usually changes from the cell using SMOTE method and this one

In [17]:
# hypertuning parameters; Create hyperparameter grid and fit
param_grid = {'penalty' : ['l1', 'l2'], 'C' : [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(random_state = 123, 
                                    ), 
                   param_grid,
                   cv=5)
best = clf.fit(balanced_X1, balanced_y1.y.ravel())
print('Best Penalty:', best.best_estimator_.get_params()['penalty'])
print('Best C:', best.best_estimator_.get_params()['C'])

NameError: name 'balanced_X1' is not defined

In [18]:
clf1 = LogisticRegression(random_state = 123, 
                         class_weight = 'balanced', 
                         C = 2.783)
clf1.fit(balanced_X1, balanced_y1)
y_pred1 = clf2.predict(X_test1)

NameError: name 'balanced_X1' is not defined

# Evaluation test #1

<div class = "alert alert-warning">
    CONFUSION MATRIX
    </div>

<div class = "alert alert-warning">
  CLASSIFICATION REPORT
    </div>

<div class = "alert alert-warning">
    ROC CURVE
    </div>

 # ML dummydf/ SMOTE test #2


In [3]:
# used in the hope of creating a class and subclass during the model fit but was unsuccessful

class SubclassedSeries(pd.Series):

    @property
    def _constructor(self):
        return SubclassedSeries

    @property
    def _constructor_expanddim(self):
        return SubclassedDataFrame


class SubclassedDataFrame(pd.DataFrame):

    @property
    def _constructor(self):
        return SubclassedDataFrame

    @property
    def _constructor_sliced(self):
        return SubclassedSeries

NameError: name 'pd' is not defined

In [None]:
X = dummydf.loc[:, dummydf.columns != 'y']
y = dummydf.loc[:, dummydf.columns == 'y']

In [None]:
X.shape, y.shape

In [None]:
smote = SMOTE(random_state=45)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)
columns = X_train.columns
balanced_X,balanced_y=smote.fit_sample(X_train, y_train)
balanced_X = pd.DataFrame(data=balanced_X,columns=columns )
balanced_y= pd.DataFrame(data=balanced_y,columns=['y'])


In [None]:
# we can Check the numbers of our data
print("length of oversampled data is ",len(balanced_X))
print("Number of no subscription in oversampled data",len(balanced_y[balanced_y['y']==0]))
print("Number of subscription",len(balanced_y[balanced_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(balanced_y[balanced_y['y']==0])/len(balanced_X))
print("Proportion of subscription data in oversampled data is ",len(balanced_y[balanced_y['y']==1])/len(balanced_X))

In [None]:
# hypertuning parameters; Create hyperparameter grid and fit
param_grid = {'penalty' : ['l1', 'l2'], 'C' : [0.001, 0.01, 0.1, 1, 10, 100]}
clf = GridSearchCV(LogisticRegression(random_state = 123, 
                                      class_weight = 'balanced'), 
                   param_grid,
                   cv=5)
clf.fit(balanced_X, balanced_y)
print('Best Penalty:', clf.best_estimator_.get_params()['penalty'])
print('Best C:', clf.best_estimator_.get_params()['C'])

In [None]:
clf1 = LogisticRegression(random_state = 123, 
                         class_weight = 'balanced', 
                         C = 2.783)
clf1.fit(balanced_X, balanced_y)
y_pred1 = clf2.predict(X_test1)

# Evaluation test #2

In [None]:
<div class = "alert alert-warning">
    CONFUSION MATRIX
    </div>

In [None]:
<div class = "alert alert-warning">
    CLASSIFICATION REPORT
    </div>


In [None]:
<div class = "alert alert-warning">
    ROC CURVE
    </div>

# Error


In [20]:
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-15-4fd25fd735db> in <module>
      2 X_train1, X_test1, y_train1, y_test1 = train_test_split(Xall, yall, test_size = 0.3, random_state = 123)
      3 # fit smote on training data
----> 4 balanced_X1, balanced_y1 = smote.fit_sample(X_train1, y_train1)
      5 # smote outputs numpy array therefore transformed to df
      6 balanced_X1 = pd.DataFrame(data=balanced_X1, columns= X_train1.columns )

~\Anaconda3\lib\site-packages\imblearn\base.py in fit_resample(self, X, y)
     77 
     78         check_classification_targets(y)
---> 79         X, y, binarize_y = self._check_X_y(X, y)
     80 
     81         self.sampling_strategy_ = check_sampling_strategy(

~\Anaconda3\lib\site-packages\imblearn\base.py in _check_X_y(X, y)
    135     def _check_X_y(X, y):
    136         y, binarize_y = check_target_type(y, indicate_one_vs_all=True)
--> 137         X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'])
    138         return X, y, binarize_y
    139 

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_X_y(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, warn_on_dtype, estimator)
    722                         dtype=None)
    723     else:
--> 724         y = column_or_1d(y, warn=True)
    725         _assert_all_finite(y)
    726     if y_numeric and y.dtype.kind == 'O':

~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in column_or_1d(y, warn)
    755                           " expected. Please change the shape of y to "
    756                           "(n_samples, ), for example using ravel().",
--> 757                           DataConversionWarning, stacklevel=2)
    758         return np.ravel(y)
    759 

TypeError: issubclass() arg 2 must be a class or tuple of classes

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 26)