### Setup

In [391]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None, 'display.max_rows', 100) 

from os import path
import pickle

import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, cross_val_predict, learning_curve
from sklearn.metrics.scorer import make_scorer

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, precision_recall_curve, fbeta_score

from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Import

In [40]:
# Load train set
f = 'loan_train.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    train = pickle.load(file)

In [41]:
# Load train target set
f = 'loan_train_target.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    train_target = pickle.load(file)

In [42]:
# Load test set
f = 'loan_test.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    test = pickle.load(file)

In [43]:
# Load test target set
f = 'loan_test_target.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    test_target = pickle.load(file)

In [44]:
train.shape, train_target.shape

((152218, 47), (152218, 2))

In [45]:
test.shape, test_target.shape

((38055, 47), (38055, 2))

### Data Preperation

In [26]:
# Class for selecting attributes from a dataframe, for use in pipelines

class DataFrame_Selector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    def get_feature_names(self):
        return self.attribute_names

In [51]:
# Converts categorical features into OHE features using pandas get_dummies.
# Can take a list of valid dummy column names to ensure consistency between data sets with different categorical values.
class DataFrame_DummyEncoder(BaseEstimator, TransformerMixin):
    
    def __init__(self, attribute_names, valid_dummy_cols=None):
        self.attribute_names = attribute_names
        # In case categorical values differ between datasets
        self.valid_dummy_cols = valid_dummy_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        if self.valid_dummy_cols is not None:
            zero_data = np.zeros(shape=(len(X),len(self.valid_dummy_cols)))
            self.dummies = pd.DataFrame(zero_data, columns=self.valid_dummy_cols)
            d = pd.get_dummies(X[self.attribute_names])

            for col in d.columns:
                if col in self.dummies.columns:
                    self.dummies[col] = d[col].values
        else:
            self.dummies = pd.get_dummies(X[self.attribute_names])
        return self.dummies.values
    
    def get_feature_names(self):
        return self.dummies.columns.tolist()

In [1]:
class CustomNumAttributes(BaseEstimator, TransformerMixin):
    def __init__(self, ref_dict=None):
        self.ref_dict = ref_dict
    def fit(self, data, y=None):
        return self
    def transform(self, data, y=None):
        X = data.copy()
        # Note: following assumes that X is still be a pandas DataFrame vs. numpy array. 
        self.custom_attr_names = []
        ###
        grade_map = {grade: i for i, grade in enumerate('ABCDEFG')}
        X['grade_value'] = X['grade'].map(grade_map)
        self.custom_attr_names.append('grade_value')
        ###
        subgrade_map = {sg: grade_map[sg[0]]*10 + int(sg[1]) for sg in [c + str(i) for c in 'ABCDEFG' for i in range(1,6)]}
        X['subgrade_value'] = X['sub_grade'].map(subgrade_map)
        self.custom_attr_names.append('subgrade_value')
        ###
        X['lti'] = X['funded_amnt'] / X['annual_inc']
        X['iti'] = X['installment'] / X['annual_inc']
        X['rbti'] = X['revol_bal'] / X['annual_inc']
        X['tbti'] = X['tot_cur_bal'] / X['annual_inc']
        self.custom_attr_names.append(['lti', 'iti', 'rbti', 'tbti'])
        ###
        X['revol_bal_log'] = X['revol_bal'].apply(lambda x: np.log10(x) if x >= 1 else 0)
        X['tot_coll_log'] = X['tot_coll_amt'].apply(lambda x: np.log10(x) if x >= 1 else 0)
        X['rev_lim_log'] = X['total_rev_hi_lim'].apply(lambda x: np.log10(x) if x >= 1 else 0)
        X['rev_lim_sqrt'] = np.sqrt(X['total_rev_hi_lim'])
        self.custom_attr_names.append(['revol_bal_log', 'tot_coll_log', 'rev_lim_log', 'rev_lim_sqrt'])
        ###
        X['earliest_cr_line_td'] = [(issue_d.date() - cr.date()).days for issue_d, cr in zip(X['issue_d'], X['earliest_cr_line'])]
        X['cr_line_td_log'] = X['earliest_cr_line_td'].apply(lambda x: np.log10(x) if x >= 1 else 0)
        self.custom_attr_names.append(['earliest_cr_line_td', 'cr_line_td_log'])
        ###
        ref_dict = self.ref_dict
        if ref_dict is not None:
            ###
            if 'grade_p_map' in ref_dict:
                X['grade_p_value'] = X['grade'].map(ref_dict['grade_p_map'])
                self.custom_attr_names.append('grade_p_value')
            ###
            if 'subgrade_p_map' in ref_dict:
                X['subgrade_p_value'] = X['sub_grade'].map(ref_dict['subgrade_p_map'])
                self.custom_attr_names.append('subgrade_p_value')
            ###
            if ('subgrade_int_rate_mean' in ref_dict) and ('subgrade_int_rate_std' in ref_dict):
                X['int_rate_delta'] = X[['int_rate','sub_grade']].apply(lambda x: (x['int_rate'] - ref_dict['subgrade_int_rate_mean'][x['sub_grade']]) / ref_dict['subgrade_int_rate_std'][x['sub_grade']], axis=1)
                self.custom_attr_names.append('int_rate_delta')
            ###
            if 'annual_inc_q10' in ref_dict:
                X['annual_inc_q10'] = X['annual_inc'].apply(lambda x: len(ref_dict['annual_inc_q10']) if x > max(ref_dict['annual_inc_q10']) else np.argmax(x <= ref_dict['annual_inc_q10'])+1)
                self.custom_attr_names.append('annual_inq_q10')
            ##
            if 'funded_amnt_q10' in ref_dict:
                X['funded_amnt_q10'] = X['funded_amnt'].apply(lambda x: len(ref_dict['funded_amnt_q10']) if x > max(ref_dict['funded_amnt_q10']) else np.argmax(x <= ref_dict['funded_amnt_q10'])+1)
                self.custom_attr_names.append('funded_amnt_q10')
            ###
        return X
    def get_feature_names(self):
        return self.custom_attr_names

NameError: name 'BaseEstimator' is not defined

In [53]:
class CustomBinAttributes(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass
    def fit(self, data, y=None):
        return self
    def transform(self, data, y=None):
        X = data.copy()        
        # Note: following assumes that X is still be a pandas DataFrame vs. numpy array. 
        self.custom_attr_names = []
        ###
        X['verified'] = (X['verification_status'] != 'Not Verified').astype(int)
        self.custom_attr_names.append('verified')
        ###
        return X
    def get_feature_names(self):
        return self.feature_names

In [54]:
def reference_stats(reference_data):
    d = {}
    data = reference_data.copy()
    default = data['loan_status'].str.contains('Charged Off|Default').astype(int)
    d['grade_p_map'] = default.groupby(data['grade']).mean()
    d['subgrade_p_map'] = default.groupby(data['sub_grade']).mean()
    
    d['subgrade_int_rate_mean'] = data.groupby('sub_grade')['int_rate'].mean()
    d['subgrade_int_rate_std'] = data.groupby('sub_grade')['int_rate'].std()
    
    d['annual_inc_q10'] = data['annual_inc'].quantile(np.arange(0.1, 1.1, 0.1))
    d['funded_amnt_q10'] = data['funded_amnt'].quantile(np.arange(0.1, 1.1, 0.1))
    return d

In [55]:
# Columns identified thus far as best for classification (during data prep, select K best)
num_attr = ['funded_amnt_q10', 'int_rate_delta', 'annual_inc_q10', 'dti', 'delinq_2yrs', 'inq_last_6mths',
            'open_acc', 'revol_bal_log', 'revol_util', 'total_acc', 'collection_recovery_fee', 'collections_12_mths_ex_med',
            'acc_now_delinq', 'rev_lim_sqrt', 'tot_cur_bal', 'tot_coll_amt', 'subgrade_p_value',
            'lti', 'rbti', 'tbti', 'cr_line_td_log']

bin_attr = ['had_delinq', 'had_major_derog', 'had_record', 'verified']

cat_attr = ['emp_length', 'purpose', 'home_ownership', 'term']

In [56]:
# Get list of valid dummy columns from full data set
train_ref_stats = reference_stats(train)
train_dummy_cols = pd.get_dummies(train[cat_attr]).columns

In [57]:
num_prep = Pipeline([('custom', CustomNumAttributes(ref_dict=train_ref_stats)),
                     ('select', DataFrame_Selector(num_attr)), # Select num columns
                     ('sc', StandardScaler())]) # Scale data

bin_prep = Pipeline([('custom', CustomBinAttributes()),
                     ('select', DataFrame_Selector(bin_attr))]) # Select binary columns

cat_prep = Pipeline([('encode', DataFrame_DummyEncoder(cat_attr, train_dummy_cols))]) # Select & encode categrocial columns

feature_prep = FeatureUnion([('num', num_prep),
                             ('bin', bin_prep),
                             ('cat', cat_prep)])

In [58]:
X = feature_prep.fit_transform(train)
y = train_target['default']

In [59]:
X_test = feature_prep.transform(test)
y_test = test_target['default']

In [125]:
X.shape, y_train.shape

((152218, 57), (152218,))

In [126]:
X_test.shape, y_test.shape

((38055, 57), (38055,))

We will split the training set further into a training and validation set:

In [293]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [294]:
X_train.shape, y_train.shape

((121774, 57), (121774,))

In [295]:
X_val.shape, y_val.shape

((30444, 57), (30444,))

### Modeling

##### Setup

In [57]:
# Initialize dict for models
models = {}

In [675]:
def run_time(reset=False, return_time=False):
    if reset:
        run_time.start_time = time.time()
    else:
        td = time.time()-run_time.start_time
        m = td//60
        s = td%60
        ms = 1000*(s%1)
        display = 'Time: '
        display += ('{:.0f}min '.format(m) if m > 0 else '')
        display += ('{:.0f}s '.format(s) if (s > 1 and m > 0) else ('{:.2f}s '.format(s) if (s > 1) else ''))
        display += ((str(round(ms))+'ms ') if (s < 1) else '')
        print(display)
        if return_time:
            return td

In [504]:
def classifier_summary(y_actual, y_pred, print_results=True): 
    
    #f1 = f1_score(y_actual, y_pred)
    f2 = fbeta_score(y_actual, y_pred, beta=2)
    recall = recall_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    
    conf_mat = confusion_matrix(y_actual, y_pred)
    
    specificity = conf_mat[0,0] / (conf_mat[0,:].sum())
    fallout = 1 - specificity
    precision_neg = conf_mat[0,0] / (conf_mat[:,0].sum())
    
    df_cmat = pd.DataFrame(conf_mat).rename(index={0:'Actual Negative', 1:'Actual Positive'},
                                  columns={0:'Predicted Negative', 1:'Predicted Positive'})
    
    df_scores = pd.DataFrame([{'Rate': 'F2', 'Score': f2},
                              {'Rate': 'Recall', 'Score': recall},
                              {'Rate': 'Precision (pos)', 'Score': precision},
                              {'Rate': 'Precision (neg)', 'Score': precision_neg},
                              {'Rate': 'Specificity', 'Score': specificity}]).set_index('Rate')
    
    if print_results:
        print('Confusion Matrix:')
        print(df_cmat)
        print(20*'-')
        print('Accuracy Scores:')
        print(df_scores)
   
    return df_cmat, df_scores

In [422]:
def gs_score_summary(gs):
    scores = gs.scoring
    print('-'*20)
    for score in scoring:
        i = np.argmin(gs.cv_results_['rank_test_' + str(score)])
        print('Best {}:'.format(score.title()))
        print('Params: {}'.format(gs.cv_results_['params'][i]))

        for s in scores:
            print('{} = {}'.format(s.title(), gs.cv_results_['mean_test_'+str(s)][i]))
        print('-'*20)

In [502]:
def print_cvs(cvs, scoring='CV'):
    print('Mean {} score = {:.3f} (+\- {:.3f})'.format(scoring, cvs.mean(), cvs.std()))

##### Measuring Accuracy

The business problem at hand for this classification problem is to identify loans that will default. In terms of measuring the performance of our models, the question arises as to which method of scoring / accuracy we value most. In the context of investing in loans, the risks and consequences of failing to identify a default loan greatly outweight those of accidentally discarding some quality loans as default. With this premise, recall (i.e. the proportion of actual default loans identified) should be score we seek to maximize.

However, attempts to boost recall will inevitabely reduce model precision. In an extreme yet possible example, a model that is optimized to identify 90% of default loans could come at the cost of discarding 90% of non-default loans (i.e. 50% precision). From a business standpoint this would leave an unreasonable number of viable loans in the pool of non-default predictions. Consequently, a better approach for optimizing our models is to use an F beta scoring in which recall is given more weight than precision (i.e. beta > 1).

We will proceed with using F2 scoring (beta = 2), in which recall is essentially valued twice as much as precision. This should hopefully allow to us to still maximize recall to a certain degree without decreasing precision to unnacceptable levels.

In [503]:
f2_score = make_scorer(fbeta_score, beta=1.5)

##### Baseline Models

Dummy classifier (stratified):

In [423]:
dummy_strat = DummyClassifier(strategy='stratified')
dummy_strat.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [424]:
dummy_strat_train_pred = dummy_strat.predict(X_train)
dummy_strat_val_pred = dummy_strat.predict(X_val)

In [505]:
classifier_summary(y_train, dummy_strat_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               79768               18546
Actual Positive               18958                4502
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.192578
Recall           0.191901
Precision (pos)  0.195331
Precision (neg)  0.807974
Specificity      0.811360


In [506]:
classifier_summary(y_val, dummy_strat_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               19794                4836
Actual Positive                4696                1118
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.191373
Recall           0.192294
Precision (pos)  0.187773
Precision (neg)  0.808248
Specificity      0.803654


Dummy classifier (unfirom):

In [507]:
dummy_uniform = DummyClassifier(strategy='uniform')
dummy_uniform.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='uniform')

In [508]:
dummy_uniform_train_pred = dummy_uniform.predict(X_train)
dummy_uniform_val_pred = dummy_uniform.predict(X_val)

In [509]:
classifier_summary(y_train, dummy_uniform_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               48942               49372
Actual Positive               11664               11796
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.380496
Recall           0.502813
Precision (pos)  0.192846
Precision (neg)  0.807544
Specificity      0.497813


In [510]:
classifier_summary(y_val, dummy_uniform_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               12352               12278
Actual Positive                2899                2915
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.379074
Recall           0.501376
Precision (pos)  0.191865
Precision (neg)  0.809914
Specificity      0.501502


These two dummy classifiers reperesent opposite extremes of our modelling potential. The stratified predictions predict defaults infrequently due to its small class proportion, resulting in high specificity but very low recall & precisons scores.

The uniform predictor, on the other hand, boosts the frequency of predicted default loans resulting in a much better recall but still with low precision.

Due to the higher recall scores with our uniform predictor we see a significant improvement in the F2 score, so this is probably 

Another baseline model which should provide a better compromise between these two extremes is a one rule classifier (single level decision tree):

In [511]:
one_r = DecisionTreeClassifier(max_depth=1)

In [512]:
one_r.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [771]:
oner_f2_cvs = cross_val_score(estimator=one_r, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [772]:
print_cvs(oner_f2_cvs, 'f2')

Mean f2 score = 0.534 (+\- 0.004)


In [513]:
one_r_train_pred = branch.predict(X_train)
one_r_val_pred = branch.predict(X_val)

In [514]:
classifier_summary(y_train, one_r_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               98314                   0
Actual Positive               13073               10387
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.498287
Recall           0.442754
Precision (pos)  1.000000
Precision (neg)  0.882634
Specificity      1.000000


In [515]:
classifier_summary(y_val, one_r_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               24630                   0
Actual Positive                3198                2616
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.505566
Recall           0.449948
Precision (pos)  1.000000
Precision (neg)  0.885080
Specificity      1.000000


##### Logistic Regression


Basic model, no tuning:

In [516]:
lr = LogisticRegression()

lr_timeit = %timeit -n1 -r1 -o \
lr.fit(X_train, y_train)

4.05 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Not a bad fit time with 4 seconds. Let's see how the base model performs with cross validation scores (f1 and recall):

In [517]:
lr_f2_cvs = cross_val_score(estimator=logreg,
                             X=X_train,
                             y=y_train,
                             cv=5,
                             scoring=f2_score,
                             n_jobs=4)

In [518]:
print_cvs(lr_f2_cvs, 'f2')

Mean f2 score = 0.505 (+\- 0.002)


In [519]:
lr_train_pred = lr.predict(X_train)
lr_val_pred = lr.predict(X_val)

In [520]:
lr_train_summary = classifier_summary(y_train, lr_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               98221                  93
Actual Positive               13647                9813
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.472934
Recall           0.418286
Precision (pos)  0.990612
Precision (neg)  0.878008
Specificity      0.999054


In [521]:
lr_val_summary = classifier_summary(y_val, lr_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               24600                  30
Actual Positive                3343                2471
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.479675
Recall           0.425009
Precision (pos)  0.988005
Precision (neg)  0.880364
Specificity      0.998782


The models are very precise as there are hardly any false positives. However, our recall score of interest is in dire need of improvement since we are only identifying 42% of the actual defaulted loans. Our CV, train, and validation sets all perform similarly, so at least there is little indication of overfitting.

Noting the small proportion of defaulted loans, using a balanced class weight may yield more favorable results:

In [522]:
lr_bal = LogisticRegression(class_weight='balanced')

lr_bal_timeit = %timeit -n1 -r1 -o \
lr_bal.fit(X_train, y_train)

4.53 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [523]:
lr_bal_f2_cvs = cross_val_score(estimator=lr_bal,
                             X=X_train,
                             y=y_train,
                             cv=5,
                             scoring=f2_score,
                             n_jobs=4)

In [524]:
print_cvs(lr_bal_f2_cvs, 'f2')

Mean f2 score = 0.599 (+\- 0.003)


In [525]:
lrb_train_pred = lr_bal.predict(X_train)
lrb_val_pred = lr_bal.predict(X_val)

In [526]:
lrb_train_summary = classifier_summary(y_train, lrb_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               84469               13845
Actual Positive                8359               15101
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.614932
Recall           0.643691
Precision (pos)  0.521696
Precision (neg)  0.909952
Specificity      0.859176


In [527]:
lrb_val_summary = classifier_summary(y_val, lrb_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               21211                3419
Actual Positive                2080                3734
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.613963
Recall           0.642243
Precision (pos)  0.522019
Precision (neg)  0.910695
Specificity      0.861186


Balancing the class weights yields significant improvements to the recall & F2 score, as the increased weight for our default class leads to more loans being flagged as such. However, the consequence is that precision is decreased considerably - from 0.99 to 0.52. This means that with this approach, approx. half the loans identified as default are in fact false positives. However, our false negative rate (denoted 'Precision (neg)') actually improves slightly, meaning the loans that remain as predicted negative have a higher guarantee of actually being non-default. Additionally, specificity only decreases slightly, so the fraction of non-default loans lost to false positive classification is proportionally small.

All in all given these results and our overall increase in F2 score, logistic regression shows promise as a model worth exploring and tuning further.

##### Naive Bayes Classifier

In [528]:
gnb = GaussianNB()

In [529]:
gnb_time = %timeit -n1 -r1 -o \
gnb.fit(X_train, y_train)

129 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [679]:
gnb_f2_cvs = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [680]:
print_cvs(gnb_f2_cvs, 'f2')

Mean f2 score = 0.533 (+\- 0.005)


In [532]:
gnb_train_pred = gnb.predict(X_train)
gnb_val_pred = gnb.predict(X_val)

In [533]:
gnb_train_summary = classifier_summary(y_train, gnb_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               96110                2204
Actual Positive               12663               10797
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.505284
Recall           0.460230
Precision (pos)  0.830475
Precision (neg)  0.883583
Specificity      0.977582


In [534]:
gnb_val_summary = classifier_summary(y_val, gnb_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               24091                 539
Actual Positive                3096                2718
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.512579
Recall           0.467492
Precision (pos)  0.834510
Precision (neg)  0.886122
Specificity      0.978116


The naive Bayes classifier seems to show similar performance in terms of recall to our non-balanced logistic regression model, with a slight improvement to recall and reduction in precision. The class probabilities should already be accounted for by default, so it is possible this model is suffering from high dimensionality given our 57 features. Additionally, with the large training set size of 120,000 records it is also possible the distinction between classes is muddled by the amount of overlapping data, however this seems less likely since both the k-fold cross validated and full training set scores are almost identical.

Regardless, we will see if training set size shows any improvements:

In [693]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=10000, replace=False)

In [694]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [695]:
gnb_s = GaussianNB()

In [696]:
gnb_s_time = %timeit -n1 -r1 -o \
gnb_s.fit(X_train_sample, y_train_sample)

12 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [697]:
gnb_s_f2_cvs = cross_val_score(estimator=gnb_s, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [698]:
print_cvs(gnb_s_f2_cvs, 'f2')

Mean f2 score = 0.557 (+\- 0.033)


In [702]:
gnb_s_train_pred = gnb_s.predict(X_train)
gnb_s_val_pred = gnb_s.predict(X_val)

In [703]:
gnb_train_summary = classifier_summary(y_train, gnb_s_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               95613                2701
Actual Positive               12337               11123
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.516561
Recall           0.474126
Precision (pos)  0.804615
Precision (neg)  0.885716
Specificity      0.972527


In [704]:
gnb_val_summary = classifier_summary(y_val, gnb_s_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               23981                 649
Actual Positive                3022                2792
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.522905
Recall           0.480220
Precision (pos)  0.811392
Precision (neg)  0.888087
Specificity      0.973650


Very slight improvements. Compared to our balanced class weight Logistic Regression model, these results are still signficantly worse, and only a slight improvement over the OneR decision tree baseline. However, it may be worth exploring the optimal training size further to maximize and implement GNB as via bagging.

##### KNN

In [547]:
knn = Pipeline([('norm', Normalizer()),
                ('estimator', KNeighborsClassifier(n_jobs=2))])

In [548]:
knn_time = %timeit -n1 -r1 -o \
knn.fit(X_train, y_train)

848 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [549]:
knn_f2_cvs = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=3, scoring=f2_score, n_jobs=2)

In [552]:
print_cvs(knn_f2_cvs, 'f2')

Mean f2 score = 0.304 (+\- 0.004)


Already looking to be significantly worse than the other models tested until now, but this could also be a result of the fold sample sizes not providing sufficient information. Predictions using KNN are also likely to take some time due to its nature of having to process the entire set each time:

In [553]:
%timeit -n1 -r1 \
knn_train_pred = knn.predict(X_train)

11min 23s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [564]:
knn_val_pred = knn.predict(X_val)

Due to the nature of KNN these predictions took signficantly longer for predictions than our other models thus far.

In [705]:
knn_train_summary = classifier_summary(y_train, knn_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               94452                3862
Actual Positive               12242               11218
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.514965
Recall           0.478176
Precision (pos)  0.743899
Precision (neg)  0.885261
Specificity      0.960718


In [706]:
knn_val_summary = classifier_summary(y_val, knn_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               23538                1092
Actual Positive                4324                1490
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.288335
Recall           0.256278
Precision (pos)  0.577072
Precision (neg)  0.844807
Specificity      0.955664


KNN clearly suffers from overfitting given the performance with our validation set (and the CV scores). Perhaps reducing the training size has an impact:

In [None]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=10000, replace=False)

In [None]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [708]:
knn_s = KNeighborsClassifier(n_jobs=2)
knn_s.fit(X_train_sample, y_train_sample)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=5, p=2,
           weights='uniform')

In [709]:
knn_s_f2_cvs = cross_val_score(estimator=knn_s, X=X_train_sample, y=y_train_sample, cv=3, scoring=f2_score, n_jobs=2)

In [710]:
print_cvs(knn_s_f2_cvs)

Mean CV score = 0.261 (+\- 0.020)


In [711]:
knn_s_val_pred = knn_s.predict(X_val)

In [713]:
classifier_summary(y_val, knn_s_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               23581                1049
Actual Positive                4546                1268
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.247918
Recall           0.218094
Precision (pos)  0.547259
Precision (neg)  0.838376
Specificity      0.957410


Reducing the sample size (although just one scenario) shows a decrease in performance. Perhaps some parameter tuning may show improvements but relative to other model performances seen thus far it may be difficult to reach similar levels of accuracy.

##### Support Vector Classifier

Support Vector Classifiers are likely to be computationally expensive due to the number of features, but we will still give it a try and see how it performs.

In [716]:
lsvc = LinearSVC()

In [717]:
lsvc_time = %timeit -n1 -r1 -o \
lsvc.fit(X_train, y_train)

24.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Somewhat slower due to large number of training instances n (160k) and features m (59). (complexity $O(m\times n)$)

In [718]:
lsvc_f2_cvs = cross_val_score(estimator=LinearSVC(), X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [719]:
print_cvs(lsvc_f2_cvs)

Mean CV score = 0.524 (+\- 0.003)


Without any tuning this already seems to be better than our base logisitic regression model, with slight improvements over the OneR decision tree model.

In [720]:
lsvc_train_pred = lin_svc.predict(X_train)
lsvc_val_pred = lin_svc.predict(X_val)

In [721]:
lsvc_train_summary = classifier_summary(y_train, lsvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               98314                   0
Actual Positive               13290               10170
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.488895
Recall           0.433504
Precision (pos)  1.000000
Precision (neg)  0.880918
Specificity      1.000000


In [723]:
lsvc_val_summary = classifier_summary(y_val, lsvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               24630                   0
Actual Positive                3253                2561
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.495991
Recall           0.440488
Precision (pos)  1.000000
Precision (neg)  0.883334
Specificity      1.000000


The results on the train and validation set are very much like our initial logistic regression model, which bring up the same issue of class weights. Trying the balanced class weight approach:

In [725]:
lsvc_b = LinearSVC(class_weight='balanced')

In [726]:
lsvc_b_time = %timeit -n1 -r1 -o \
lsvc_b.fit(X_train, y_train)

24.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [727]:
lsvc_b_f2_cvs = cross_val_score(estimator=lsvc_b, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [728]:
print_cvs(lsvc_b_f2_cvs)

Mean CV score = 0.600 (+\- 0.002)


In [729]:
lsvc_b_train_pred = lsvc_b.predict(X_train)
lsvc_b_val_pred = lsvc_b.predict(X_val)

In [730]:
lsvc_b_train_summary = classifier_summary(y_train, lsvc_b_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               85900               12414
Actual Positive                8648               14812
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.611732
Recall           0.631373
Precision (pos)  0.544039
Precision (neg)  0.908533
Specificity      0.873731


In [731]:
lsvc_b_val_summary = classifier_summary(y_val, lsvc_b_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               21569                3061
Actual Positive                2132                3682
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.613687
Recall           0.633299
Precision (pos)  0.546048
Precision (neg)  0.910046
Specificity      0.875721


As with our balanced class weight logistic regression model, we see a signficant boost to recall (and consequently F2). Compared to our baseline OneR model this is already a signficant improvement, and consequently should be explored more in depth with parameter and feature tuning. 

##### Support Vector Classifier - Polynomial

The polynomial SVC will take an exceptionally long time to fit given the size our training set. However, fitting the model on a sample of the training data may at least give some insight into how the model compares to others. Noting the improvements acheived via balanced class weights we will proceed with the same setting here:

In [736]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [737]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [742]:
psvc = SVC(kernel='poly', degree=2, class_weight='balanced')

In [743]:
psvc_time = %timeit -n1 -r1 -o \
psvc.fit(X_train_sample, y_train_sample)

26.5 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Even with 20,000 samples this is one of the slower fitting times yet (ignoring the prediction time for KNN).

In [744]:
psvc_f2_cvs = cross_val_score(estimator=psvc, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [745]:
print_cvs(psvc_f2_cvs, 'f2')

Mean f2 score = 0.503 (+\- 0.022)


In [746]:
psvc_train_pred = psvc.predict(X_train)
psvc_val_pred = psvc.predict(X_val)

Even with the reduced sample these predictions take a considerable amount of time to process.

In [747]:
psvc_train_summary = classifier_summary(y_train, psvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               78491               19823
Actual Positive                9544               13916
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.545388
Recall           0.593180
Precision (pos)  0.412460
Precision (neg)  0.891589
Specificity      0.798371


In [749]:
psvc_val_summary = classifier_summary(y_val, psvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               19748                4882
Actual Positive                2389                3425
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.542566
Recall           0.589095
Precision (pos)  0.412303
Precision (neg)  0.892081
Specificity      0.801786


This performance is a slight improvement over the baseline OneR model, however it pales slightly compared to the linear SVC classifier with balanced weights. Additionally, precision has suffered significantly versus the linear model. This could be a result of the reduced training set size, so if time permits this could be invesigated further with a larger set, perhaps in tandem with reduced dimensionality to aid in computational complexity.

##### Support Vector Classifier  - RBF

In [756]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [757]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [758]:
rsvc = SVC(kernel='rbf', class_weight='balanced')

In [759]:
rsvc_time = %timeit -n1 -r1 -o \
rsvc.fit(X_train_sample, y_train_sample)

31.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Comparable fit time to our polynomial SVC.

In [760]:
rsvc_f2_cvs = cross_val_score(estimator=rsvc, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [761]:
print_cvs(rsvc_f2_cvs, 'f2')

Mean f2 score = 0.546 (+\- 0.016)


In [762]:
rsvc_train_pred = rsvc.predict(X_train)
rsvc_val_pred = rsvc.predict(X_val)

In [763]:
rsvc_train_summary = classifier_summary(y_train, rsvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               80301               18013
Actual Positive                8953               14507
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.574035
Recall           0.618372
Precision (pos)  0.446095
Precision (neg)  0.899691
Specificity      0.816781


In [764]:
rsvc_val_summary = classifier_summary(y_val, rsvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               20156                4474
Actual Positive                2280                3534
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.565187
Recall           0.607843
Precision (pos)  0.441309
Precision (neg)  0.898378
Specificity      0.818352


RBF actually performed slightly better than the polynomial SVC. It is still not quite as succesful in terms of F2, with precision again taking a notable dip (albeit slightly less than polynomial SVC). Consequently, if additional SVC models are to be pursued (besides linear), RBF may be more worthwhile to investigate further over polynomial.

##### Decision Tree Classifier

Our OneR baseline model is already a decision tree model, but perhaps some basic tuning can provide some immediate improvements. Firstly, we will evaluate a completely unconstrained decision tree:

In [765]:
dtc = DecisionTreeClassifier()

In [778]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [767]:
dtc_f2_cvs = cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [768]:
print_cvs(dtc_f2_cvs, 'f2')

Mean f2 score = 0.539 (+\- 0.004)


Essentially identical F2 CV score to the OneR model.

In [773]:
dtc_train_pred = dtc.predict(X_train)
dtc_val_pred = dtc.predict(X_val)

In [774]:
dtc_train_summary = classifier_summary(y_train, dtc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               98314                   0
Actual Positive                   0               23460
--------------------
Accuracy Scores:
                 Score
Rate                  
F2                 1.0
Recall             1.0
Precision (pos)    1.0
Precision (neg)    1.0
Specificity        1.0


In [775]:
dtc_val_summary = classifier_summary(y_val, dtc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               21589                3041
Actual Positive                2606                3208
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.543637
Recall           0.551772
Precision (pos)  0.513362
Precision (neg)  0.892292
Specificity      0.876533


Given the difference between our training scores and CV / validation scores, this unconstrained decision tree is clearly suffering from extreme overfitting. We will see if adding some basic constraints shows some improvements:

In [779]:
dtc.tree_.max_depth

46

In [780]:
dtc_v2 = RandomForestClassifier(max_depth=30, min_samples_split=50, class_weight='balanced')
dtc_v2.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=30, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=50, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [781]:
dtc_v2_f1_cvs = cross_val_score(estimator=dtc_v2, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [783]:
print_cvs(dtc_v2_f1_cvs, 'f2')

Mean f2 score = 0.583 (+\- 0.003)


In [784]:
dtc_v2_train_pred = dtc_v2.predict(X_train)
dtc_v2_val_pred = dtc_v2.predict(X_val)

In [785]:
dtc_v2_train_summary = classifier_summary(y_train, dtc_v2_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               94018                4296
Actual Positive                5721               17739
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.765437
Recall           0.756138
Precision (pos)  0.805037
Precision (neg)  0.942640
Specificity      0.956303


In [787]:
dtc_v2_val_summary = classifier_summary(y_val, dtc_v2_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               22959                1671
Actual Positive                2560                3254
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.577339
Recall           0.559684
Precision (pos)  0.660711
Precision (neg)  0.899683
Specificity      0.932156


Whilst overfitting is clearly still an issue, adding some basic, rather arbitrary constraints already shows improvement. In tandem with grid/random search, it should be hopefully be possible to reach similar success in recall / F2 score as our linear regression and SVC models.

In [None]:
# Add model to list
model = {'model_name': 'Decision Tree (Basic)',
         'model_type':'DecisionTreeClassifier',
         'model': dtc,
         'train_score': train_scores.iloc[0,0],
         'test_score': test_scores.iloc[0,0]}

models.append(model)

##### Random Forest Classifier

Random forests are simply collections of decision trees, so we can carry forward what we have learned so far with our initial model, but will constrain our tree a little more for the ensemble:

In [809]:
rfc = RandomForestClassifier(n_estimators=20, max_depth=10, min_samples_split=100, class_weight='balanced', n_jobs=2)

In [810]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [811]:
rfc_f2_cvs = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=2)

In [812]:
print_cvs(rfc_f2_cvs, 'f2')

Mean f2 score = 0.600 (+\- 0.003)


This CV score is a promising start. Let us see how it fares in terms of train vs. validation scores:

In [813]:
rfc_train_pred = rfc.predict(X_train)
rfc_val_pred = rfc.predict(X_val)

In [814]:
rfc_train_summary = classifier_summary(y_train, rfc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               83342               14972
Actual Positive                7205               16255
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.649852
Recall           0.692882
Precision (pos)  0.520543
Precision (neg)  0.920428
Specificity      0.847712


In [815]:
rfc_val_summary = classifier_summary(y_val, rfc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               20738                3892
Actual Positive                1944                3870
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.623831
Recall           0.665635
Precision (pos)  0.498583
Precision (neg)  0.914293
Specificity      0.841981


Random forest is clearly less affected by the overfitting issues seen with the individual decision tree. Regardess of how much this is a result of the constraint parameters versus the collection of 20 estimators, random forest is definitely one of the better performing models and worth pursuing further.