In [None]:
# Machine Learning - Logistic Regression & Naive Bayes Classification Models

In [1]:
# seed value for random number generators to obtain reproducible results
# (used further down to seed NumPy's global generator before the rows of the
# modeling array are shuffled)
RANDOM_SEED = 1

In [2]:
# import base packages into the namespace for this program
import numpy as np
import os
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import roc_auc_score   
from sklearn.model_selection import KFold

In [13]:
# load the semicolon-delimited bank marketing file into a pandas DataFrame
# NOTE(review): the location is an absolute path on the author's machine and
# has to be adjusted before the notebook can run anywhere else
bank_csv_path = '/Users/derekhigham/Documents/school/MSDS 422/Module 2/jump-start-bank-v003/bank.csv'
bank = pd.read_csv(bank_csv_path, sep=';')

# report (rows, columns) of the freshly loaded data
print(bank.shape)

(4521, 17)


In [14]:
# drop observations with missing data, if any
# DataFrame.dropna returns a new frame instead of modifying bank in place, so
# the result must be bound back to the name for the rows to actually be removed
bank = bank.dropna()

# examine the shape of the input data after dropping missing data
print(bank.shape)

(4521, 17)


In [15]:
# list the column names; the last one, 'response', is the outcome to predict
bank.columns.tolist()

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'response']

In [16]:
# preview the first five rows of the DataFrame
bank.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,response
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [33]:
# descriptive statistics (count, mean, std, quartiles, extremes) of the
# numeric columns, printed as plain text
bank_numeric_summary = bank.describe()
print(bank_numeric_summary)

               age       balance          day     duration     campaign  \
count  4521.000000   4521.000000  4521.000000  4521.000000  4521.000000   
mean     41.170095   1422.657819    15.915284   263.961292     2.793630   
std      10.576211   3009.638142     8.247667   259.856633     3.109807   
min      19.000000  -3313.000000     1.000000     4.000000     1.000000   
25%      33.000000     69.000000     9.000000   104.000000     1.000000   
50%      39.000000    444.000000    16.000000   185.000000     2.000000   
75%      49.000000   1480.000000    21.000000   329.000000     3.000000   
max      87.000000  71188.000000    31.000000  3025.000000    50.000000   

             pdays     previous  
count  4521.000000  4521.000000  
mean     39.766645     0.542579  
std     100.121124     1.693562  
min      -1.000000     0.000000  
25%      -1.000000     0.000000  
50%      -1.000000     0.000000  
75%      -1.000000     0.000000  
max     871.000000    25.000000  


In [17]:
# lookup table used with Series.map to turn the text values no/yes into 0/1
convert_to_binary = dict(no=0, yes=1)

In [18]:
# recode the four no/yes columns as 0/1 Series, one variable per column:
#   default  - has credit in default
#   housing  - has a mortgage or housing loan
#   loan     - has a personal loan
#   response - outcome variable used in the model
default, housing, loan, response = (
    bank[column_name].map(convert_to_binary)
    for column_name in ('default', 'housing', 'loan', 'response')
)

In [20]:
# stack the three explanatory variables and the response (in that order) as
# rows of a 2-D numpy array, then transpose so that each observation is a row
# and the response ends up in the last column
columns_for_model = [default, housing, loan, response]
model_data = np.array([np.array(column) for column in columns_for_model]).T

In [21]:
# confirm (observations, variables) of the array used in subsequent modeling
print(np.shape(model_data))

(4521, 4)


In [22]:
# prints statistics about data passed in and saves them to a txt file for review
def print_stats_save_to_file(data, dataname, filename="model_data_descriptive_stats.txt"):
    """Print summary statistics of ``data`` and write the same figures to a text file.

    The mean, standard deviation, median and variance are computed with the
    corresponding numpy functions called without an ``axis`` argument.

    Parameters
    ----------
    data : object exposing ``.shape`` and accepted by ``np.mean`` / ``np.std`` /
        ``np.median`` / ``np.var`` (the notebook passes a 2-D numpy array).
    dataname : str
        Label inserted into every printed and written line.
    filename : str, optional
        File that receives the report; it is opened in ``"w"`` mode, so an
        existing file of that name is overwritten. Defaults to the name the
        function has always used.

    Returns
    -------
    None
    """
    # compute every statistic once and reuse it for both the console and the file
    shape = data.shape
    mean = np.mean(data)
    std = np.std(data)
    median = np.median(data)
    variance = np.var(data)

    header = '\n---------{} data statistics----------\n'.format(dataname)

    # console report (the median line used to be mislabelled "standard median")
    print(header)
    console_rows = [
        ('shape', shape),
        ('mean', mean),
        ('standard deviation', std),
        ('median', median),
        ('variance', variance),
    ]
    for label, value in console_rows:
        print('\n{} data {}: {}\n'.format(dataname, label, value))

    # file report keeps its original layout: shape comes last, one line per figure
    file_rows = [
        ('mean', mean),
        ('standard deviation', std),
        ('median', median),
        ('variance', variance),
        ('shape', shape),
    ]
    report = header + ''.join(
        '\n{} data {}: {}'.format(dataname, label, value) for label, value in file_rows
    )
    with open(filename, "w") as text_file:
        text_file.write(report)

# report the statistics of model_data on screen and store them in a txt file
print_stats_save_to_file(data=model_data, dataname='Model')


---------Model data statistics----------


Model data shape: (4521, 4)


Model data mean: 0.21272948462729485


Model data standard deviation: 0.40923789047142295


Model data standard median: 0.0


Model data variance: 0.16747565099750036



In [23]:
# shuffle the rows 
# the global NumPy generator is seeded first so the permutation is the same on
# every fresh run; np.random.shuffle then reorders model_data in place
# NOTE(review): because the array is modified in place, running this cell a
# second time without rebuilding model_data shuffles the already-shuffled rows
np.random.seed(RANDOM_SEED)
np.random.shuffle(model_data)

In [24]:
# examine the shape of model_data, after shuffle, which we will use in subsequent modeling
# the shape is printed directly: no print_shape helper is defined in this
# notebook, so calling one fails with NameError on a fresh kernel
print('The shape of data is:', model_data.shape)

The shape of data is: (4521, 4)


In [25]:
# logistic regression with scikit-learn's default settings
logistic_regression = LogisticRegression()

# Bernoulli naive Bayes: smoothing alpha of 1.0, inputs thresholded at 0.5, and
# a fixed prior of 0.5 per class instead of a prior fitted from the data
naive_bayes = BernoulliNB(alpha=1.0, binarize=0.5,
                          class_prior=[0.5, 0.5], fit_prior=False)

# display names and estimator objects, paired by position
classifier_names = ["Logistic_Regression", "Naive_Bayes"]
classifiers = [logistic_regression, naive_bayes]

In [26]:
# ten-fold cross-validation employed here
N_FOLDS = 10

# set up numpy array for storing results:
# one row per fold, one column per classifier
crossvalidation_results = np.zeros((N_FOLDS, len(classifier_names)))

# kf, object,  model selection kfold split set up
# the rows of model_data were already shuffled with RANDOM_SEED, so the splitter
# itself does not shuffle; random_state is deliberately not passed because it
# only has an effect when shuffle=True, and current scikit-learn releases raise
# a ValueError when it is combined with shuffle=False
kf = KFold(n_splits=N_FOLDS, shuffle=False)

#--check the splitting process by looking at fold observation counts--
# fold count initialized to zero
index_for_fold = 0

In [27]:
# splits the data, fits the classifier models, and stores the area under the
# ROC curve of every classifier on every fold in crossvalidation_results
#
# both counters come from enumerate(), so the cell always starts at fold 0 and
# can be re-run on its own without indexing past the end of the results array

# the response sits in the last column; every column before it is explanatory
response_column = model_data.shape[1] - 1

for index_for_fold, (train_index, test_index) in enumerate(kf.split(model_data)):
    print('\nFold index:', index_for_fold,
          '------------------------------------------')
    # explanatory variables for this fold
    X_train = model_data[train_index, 0:response_column]
    X_test = model_data[test_index, 0:response_column]

    # response variable for this fold
    y_train = model_data[train_index, response_column]
    y_test = model_data[test_index, response_column]

    # prints structure of data after split for x, y
    print('\nShape of input data for this fold:',
          '\nData Set: (Observations, Variables)')
    print('X_train:', X_train.shape)
    print('X_test:', X_test.shape)
    print('y_train:', y_train.shape)
    print('y_test:', y_test.shape)

    # loops through classifiers: fit on the training rows, score the test rows
    for index_for_method, (name, clf) in enumerate(zip(classifier_names, classifiers)):
        print('\nClassifier evaluation for:', name)
        print('  Scikit Learn method:', clf)

        # fit current classifier model using train data set
        clf.fit(X_train, y_train)

        # predicted class probabilities for the test set of this fold
        y_test_predict = clf.predict_proba(X_test)

        # ROC AUC is computed from the second probability column, i.e. the
        # larger of the two class labels (1 for the 0/1 response)
        fold_method_result = roc_auc_score(y_test, y_test_predict[:, 1])
        print('Area under ROC curve:', fold_method_result)
        crossvalidation_results[index_for_fold, index_for_method] = fold_method_result


Fold index: 0 ------------------------------------------

Shape of input data for this fold: 
Data Set: (Observations, Variables)
X_train: (4068, 3)
X_test: (453, 3)
y_train: (4068,)
y_test: (453,)

Classifier evaluation for: Logistic_Regression
  Scikit Learn method: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
Area under ROC curve: 0.5634307903152124

Classifier evaluation for: Naive_Bayes
  Scikit Learn method: BernoulliNB(alpha=1.0, binarize=0.5, class_prior=[0.5, 0.5], fit_prior=False)
Area under ROC curve: 0.5634307903152124

Fold index: 1 ------------------------------------------

Shape of input data for this fold: 
Data Set: (Observations, Variables)
X_train: (4069, 3)
X_test: (452, 3)
y_train: (4069,)
y_test: (452,)

Classifier evaluation for: Logistic_Regres

In [30]:
# wrap the per-fold ROC AUC scores in a DataFrame with one named column per
# classifier, then save the full table as text
# NOTE(review): the output location is an absolute path on the author's machine
crossvalidation_results_df = pd.DataFrame(crossvalidation_results,
                                          columns=classifier_names)
fold_report = (
    '\nResults from {}-fold cross-validation\n'
    '\nMethod Area under ROC Curve:\n{}'
).format(N_FOLDS, crossvalidation_results_df)
with open("/Users/derekhigham/Documents/school/MSDS 422/Module 2/cv-results-df.txt", "w") as text_file:
    text_file.write(fold_report)

In [31]:
# average the ROC AUC over the folds for each classifier, print the result and
# save the same report to a text file
# NOTE(review): the output location is an absolute path on the author's machine
mean_auc_by_method = crossvalidation_results_df.mean()
average_report = (
    '\nAverage results from {}-fold cross-validation\n\nMethod Area under ROC Curve:\n{}'
    .format(N_FOLDS, mean_auc_by_method)
)

print('\n----------------------------------------------')
print(average_report)
print('\nMean of cross validation result:\n{}'.format(mean_auc_by_method))

with open("/Users/derekhigham/Documents/school/MSDS 422/Module 2/cv-results-df-mean.txt", "w") as text_file:
    text_file.write(average_report)


----------------------------------------------

Average results from 10-fold cross-validation

Method Area under ROC Curve:
Logistic_Regression    0.607916
Naive_Bayes            0.608131
dtype: float64

Mean of cross validation result:
Logistic_Regression    0.607916
Naive_Bayes            0.608131
dtype: float64
