### Setup

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None, 'display.max_rows', 100) 

from os import path
import pickle

import sys
sys.path.append('..')
import src.functions.my_functions as my_func

import missingno as msno
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, cross_val_predict, learning_curve
from sklearn.metrics.scorer import make_scorer

from sklearn.metrics import confusion_matrix, f1_score, recall_score, precision_score, precision_recall_curve, fbeta_score

from sklearn.dummy import DummyClassifier

from sklearn.linear_model import LogisticRegression

from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### Import

In [9]:
# Load train set
f = 'loan_train.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    train = pickle.load(file)

In [10]:
# Load train target set
f = 'loan_train_target.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    train_target = pickle.load(file)

In [11]:
# Load test set
f = 'loan_test.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    test = pickle.load(file)

In [12]:
# Load test target set
f = 'loan_test_target.p'
d = ['..', 'data', 'processed']
fp = path.join(*d, f)

with open(fp, 'rb') as file:
    test_target = pickle.load(file)

In [13]:
train.shape, train_target.shape

((152218, 47), (152218, 2))

In [14]:
test.shape, test_target.shape

((38055, 47), (38055, 2))

### Data Preperation

In [15]:
# Columns identified thus far as best for classification (during data prep, select K best)
num_attr = ['funded_amnt_q10', 'int_rate_delta', 'annual_inc_q10', 'dti', 'delinq_2yrs', 'inq_last_6mths',
            'open_acc', 'revol_bal_log', 'revol_util', 'total_acc', 'collections_12_mths_ex_med',
            'acc_now_delinq', 'rev_lim_sqrt', 'tot_cur_bal', 'tot_coll_amt', 'subgrade_p_value',
            'lti', 'rbti', 'tbti', 'cr_line_td_log', 'emp_length_val']

bin_attr = ['had_delinq', 'had_major_derog', 'had_record', 'verified', 'term_bin']

cat_attr = ['purpose', 'home_ownership']

In [16]:
num_prep = Pipeline([('custom', my_func.CustomNumAttributes()),
                     ('select', my_func.DataFrame_Selector(num_attr)), # Select num columns
                     ('sc', StandardScaler())]) # Scale data

bin_prep = Pipeline([('custom', my_func.CustomBinAttributes()),
                     ('select', my_func.DataFrame_Selector(bin_attr))]) # Select binary columns

cat_prep = Pipeline([('encode', my_func.DataFrame_DummyEncoder(cat_attr))]) # Select & encode categrocial columns

feature_prep = FeatureUnion([('num', num_prep),
                             ('bin', bin_prep),
                             ('cat', cat_prep)])

In [17]:
X_train_full = feature_prep.fit_transform(train)
y_train_full = train_target['default']

In [18]:
X_test = feature_prep.transform(test)
y_test = test_target['default']

In [19]:
X_train_full.shape, y_train_full.shape

((152218, 44), (152218,))

In [20]:
X_test.shape, y_test.shape

((38055, 44), (38055,))

We will split the training set further into a training and validation set:

In [24]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

In [25]:
X_train.shape, y_train.shape

((121774, 44), (121774,))

In [26]:
X_val.shape, y_val.shape

((30444, 44), (30444,))

### Modeling

##### Setup

In [None]:
# Initialize dict for models
models = {}

In [27]:
def classifier_summary(y_actual, y_pred, print_results=True): 
    
    #f1 = f1_score(y_actual, y_pred)
    f2 = fbeta_score(y_actual, y_pred, beta=2)
    recall = recall_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred)
    
    conf_mat = confusion_matrix(y_actual, y_pred)
    
    specificity = conf_mat[0,0] / (conf_mat[0,:].sum())
    fallout = 1 - specificity
    precision_neg = conf_mat[0,0] / (conf_mat[:,0].sum())
    
    df_cmat = pd.DataFrame(conf_mat).rename(index={0:'Actual Negative', 1:'Actual Positive'},
                                  columns={0:'Predicted Negative', 1:'Predicted Positive'})
    
    df_scores = pd.DataFrame([{'Rate': 'F2', 'Score': f2},
                              {'Rate': 'Recall', 'Score': recall},
                              {'Rate': 'Precision (pos)', 'Score': precision},
                              {'Rate': 'Precision (neg)', 'Score': precision_neg},
                              {'Rate': 'Specificity', 'Score': specificity}]).set_index('Rate')
    
    if print_results:
        print('Confusion Matrix:')
        print(df_cmat)
        print(20*'-')
        print('Accuracy Scores:')
        print(df_scores)
   
    return df_cmat, df_scores

In [28]:
def gs_score_summary(gs):
    scores = gs.scoring
    print('-'*20)
    for score in scoring:
        i = np.argmin(gs.cv_results_['rank_test_' + str(score)])
        print('Best {}:'.format(score.title()))
        print('Params: {}'.format(gs.cv_results_['params'][i]))

        for s in scores:
            print('{} = {}'.format(s.title(), gs.cv_results_['mean_test_'+str(s)][i]))
        print('-'*20)

In [29]:
def print_cvs(cvs, scoring='CV'):
    print('Mean {} score = {:.3f} (+\- {:.3f})'.format(scoring, cvs.mean(), cvs.std()))

##### Measuring Accuracy

The business problem at hand for this classification problem is to identify loans that will default. In terms of measuring the performance of our models, the question arises as to which method of scoring / accuracy we value most. In the context of investing in loans, the risks and consequences of failing to identify a default loan greatly outweight those of accidentally discarding some quality loans as default. With this premise, recall (i.e. the proportion of actual default loans identified) should be score we seek to maximize.

However, attempts to boost recall will inevitabely reduce model precision. In an extreme yet possible example, a model that is optimized to identify 90% of default loans could come at the cost of discarding 90% of non-default loans (i.e. 50% precision). From a business standpoint this would leave an unreasonable number of viable loans in the pool of non-default predictions. Consequently, a better approach for optimizing our models is to use an F beta scoring in which recall is given more weight than precision (i.e. beta > 1).

We will proceed with using F2 scoring (beta = 2), in which recall is essentially valued twice as much as precision. This should hopefully allow to us to still maximize recall to a certain degree without decreasing precision to unnacceptable levels.

In [30]:
f2_score = make_scorer(fbeta_score, beta=2)

##### Baseline Models

Dummy classifier (stratified):

In [31]:
dummy_strat = DummyClassifier(strategy='stratified')
dummy_strat.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='stratified')

In [32]:
dummy_strat_train_pred = dummy_strat.predict(X_train)
dummy_strat_val_pred = dummy_strat.predict(X_val)

In [33]:
my_func.classifier_summary(y_train, dummy_strat_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               79333               18981
Actual Positive               18952                4508
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.192109
Recall           0.192157
Precision (pos)  0.191920
Precision (neg)  0.807173
Specificity      0.806935


In [34]:
my_func.classifier_summary(y_val, dummy_strat_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               19842                4788
Actual Positive                4691                1123
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.192512
Recall           0.193154
Precision (pos)  0.189985
Precision (neg)  0.808788
Specificity      0.805603


Dummy classifier (unfirom):

In [35]:
dummy_uniform = DummyClassifier(strategy='uniform')
dummy_uniform.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=None, strategy='uniform')

In [36]:
dummy_uniform_train_pred = dummy_uniform.predict(X_train)
dummy_uniform_val_pred = dummy_uniform.predict(X_val)

In [37]:
my_func.classifier_summary(y_train, dummy_uniform_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               49269               49045
Actual Positive               11780               11680
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.377835
Recall           0.497869
Precision (pos)  0.192343
Precision (neg)  0.807040
Specificity      0.501139


In [38]:
my_func.classifier_summary(y_val, dummy_uniform_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               12247               12383
Actual Positive                2988                2826
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.367347
Recall           0.486068
Precision (pos)  0.185811
Precision (neg)  0.803873
Specificity      0.497239


These two dummy classifiers reperesent opposite extremes of our modelling potential. The stratified predictions predict defaults infrequently due to its small class proportion, resulting in high specificity but very low recall & precisons scores.

The uniform predictor, on the other hand, boosts the frequency of predicted default loans resulting in a much better recall but still with low precision.

Due to the higher recall scores with our uniform predictor we see a significant improvement in the F2 score. 

Another baseline model which should provide a better compromise between these two extremes is a one rule classifier (single level decision tree):

In [39]:
one_r = DecisionTreeClassifier(max_depth=1, class_weight='balanced')

In [40]:
one_r.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [41]:
one_r_f2_cvs = cross_val_score(estimator=one_r, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [42]:
one_r_recall_cvs = cross_val_score(estimator=one_r, X=X_train, y=y_train, cv=5, scoring='recall', n_jobs=4)

In [43]:
my_func.print_cvs(one_r_f2_cvs, 'f2')
my_func.print_cvs(one_r_recall_cvs, 'recall')

Mean f2 score = 0.525 (+/- 0.010)
Mean recall score = 0.680 (+/- 0.028)


In [44]:
one_r_train_pred = one_r.predict(X_train)
one_r_val_pred = one_r.predict(X_val)

In [45]:
my_func.classifier_summary(y_train, one_r_train_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               58462               39852
Actual Positive                7996               15464
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.518383
Recall           0.659165
Precision (pos)  0.279557
Precision (neg)  0.879683
Specificity      0.594646


In [46]:
my_func.classifier_summary(y_val, one_r_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               14564               10066
Actual Positive                2017                3797
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.511463
Recall           0.653079
Precision (pos)  0.273895
Precision (neg)  0.878355
Specificity      0.591311


This OneR model provides a more balanced baseline to compare model performance with. We can establish the following standards for deciding which models to pursue:

- F2 score above 0.5
- Recall greater than 0.60 (i.e. able to identify at least 60% of defaulted loans)
- Specificity greather than 0.5 (i.e. retain at least 50% of healthy loans)
- Precision greather than 0.25 (lower priority due to class imbalance)

##### Logistic Regression


Basic model, no tuning:

In [63]:
lr = LogisticRegression()

lr_timeit = %timeit -n1 -r1 -o \
lr.fit(X_train, y_train)

2.07 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Not a bad fit time with 4 seconds. Let's see how the base model performs with cross validation scores (f1 and recall):

In [65]:
lr_f2_cvs = cross_val_score(estimator=lr,
                             X=X_train,
                             y=y_train,
                             cv=5,
                             scoring=f2_score,
                             n_jobs=4)

In [67]:
my_func.print_cvs(lr_f2_cvs, 'f2')

Mean f2 score = 0.070 (+\- 0.004)


In [68]:
lr_train_pred = lr.predict(X_train)
lr_val_pred = lr.predict(X_val)

In [70]:
lr_train_summary = my_func.classifier_summary(y_train, lr_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               97011                1303
Actual Positive               22103                1357
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.070311
Recall           0.057843
Precision (pos)  0.510150
Precision (neg)  0.814438
Specificity      0.986747


In [71]:
lr_val_summary = my_func.classifier_summary(y_val, lr_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               24324                 306
Actual Positive                5478                 336
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.070299
Recall           0.057792
Precision (pos)  0.523364
Precision (neg)  0.816187
Specificity      0.987576


Our recall (and F2) score of interest is in dire need of improvement since we are only identifying 5% of the actual defaulted loans. It is likely that this model is suffering from the class imabalance between default and non-default loans, so we will see if weighing class accordingly provides any improvements:

In [72]:
lr_bal = LogisticRegression(class_weight='balanced')

lr_bal_timeit = %timeit -n1 -r1 -o \
lr_bal.fit(X_train, y_train)

1.88 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [73]:
lr_bal_f2_cvs = cross_val_score(estimator=lr_bal,
                             X=X_train,
                             y=y_train,
                             cv=5,
                             scoring=f2_score,
                             n_jobs=4)

In [75]:
my_func.print_cvs(lr_bal_f2_cvs, 'f2')

Mean f2 score = 0.533 (+\- 0.004)


In [76]:
lrb_train_pred = lr_bal.predict(X_train)
lrb_val_pred = lr_bal.predict(X_val)

In [77]:
lrb_train_summary = my_func.classifier_summary(y_train, lrb_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               64171               34143
Actual Positive                8167               15293
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.533690
Recall           0.651876
Precision (pos)  0.309349
Precision (neg)  0.887099
Specificity      0.652715


In [78]:
lrb_val_summary = my_func.classifier_summary(y_val, lrb_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               16135                8495
Actual Positive                2021                3793
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.533564
Recall           0.652391
Precision (pos)  0.308675
Precision (neg)  0.888687
Specificity      0.655095


Balancing the class weights yields significant improvements to the recall & F2 score, as the increased weight for our default class leads to more loans being flagged as such. However, the consequence is that precision is decreased considerably - from 0.5 to 0.3. This means that with this approach, approx. two thirds of the loans identified as default are in fact false positives. However, our false negative rate (denoted 'Precision (neg)') actually improves slightly, meaning the loans that remain as predicted negative have a higher guarantee of actually being non-default. Specificty has decreased considerably due to a larger number of false positives, but is still better than our baseline OneR score of 0.6.

All in all given the comparable F2 score and improved specificty and precision over the OneR model, logistic regression shows promise as a model worth exploring and tuning further.

In [None]:
lr_bal.coef_f_

##### Naive Bayes Classifier

In [79]:
gnb = GaussianNB()

In [80]:
gnb_time = %timeit -n1 -r1 -o \
gnb.fit(X_train, y_train)

111 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [81]:
gnb_f2_cvs = cross_val_score(estimator=gnb, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [83]:
my_func.print_cvs(gnb_f2_cvs, 'f2')

Mean f2 score = 0.410 (+\- 0.030)


In [84]:
gnb_train_pred = gnb.predict(X_train)
gnb_val_pred = gnb.predict(X_val)

In [86]:
gnb_train_summary = my_func.classifier_summary(y_train, gnb_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               78143               20171
Actual Positive               13335               10125
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.407819
Recall           0.431586
Precision (pos)  0.334203
Precision (neg)  0.854227
Specificity      0.794831


In [87]:
gnb_val_summary = my_func.classifier_summary(y_val, gnb_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               19676                4954
Actual Positive                3362                2452
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.399843
Recall           0.421741
Precision (pos)  0.331083
Precision (neg)  0.854067
Specificity      0.798863


The naive Bayes classifier performs worse than our OneR classifier in terms of F2 and recall. The class probabilities should already be accounted for by default, so it is possible this model is suffering from high dimensionality given our 40+ features. Additionally, with the large training set size of 120,000 records it is also possible the distinction between classes is muddled by the amount of overlapping data, however this seems less likely since both the k-fold cross validated and full training set scores are almost identical.

Regardless, we will see if training set size shows any improvements:

In [98]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [99]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [100]:
gnb_s = GaussianNB()

In [101]:
gnb_s_time = %timeit -n1 -r1 -o \
gnb_s.fit(X_train_sample, y_train_sample)

17.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [102]:
gnb_s_f2_cvs = cross_val_score(estimator=gnb_s, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [103]:
my_func.print_cvs(gnb_s_f2_cvs, 'f2')

Mean f2 score = 0.459 (+\- 0.069)


In [104]:
gnb_s_train_pred = gnb_s.predict(X_train)
gnb_s_val_pred = gnb_s.predict(X_val)

In [105]:
gnb_train_summary = my_func.classifier_summary(y_train, gnb_s_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               75485               22829
Actual Positive               12411               11049
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.432555
Recall           0.470972
Precision (pos)  0.326141
Precision (neg)  0.858799
Specificity      0.767795


In [106]:
gnb_val_summary = my_func.classifier_summary(y_val, gnb_s_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               19043                5587
Actual Positive                3083                2731
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.432476
Recall           0.469728
Precision (pos)  0.328324
Precision (neg)  0.860662
Specificity      0.773163


Very slight improvements are made with a smaller training size. Compared to our balanced class weight Logistic Regression and baseline OneR models, these results are still signficantly worse. However, it may be worth exploring the optimal training size further to maximize and implement GNB as via bagging.

##### KNN

In [107]:
knn = Pipeline([('norm', Normalizer()),
                ('estimator', KNeighborsClassifier(n_jobs=2))])

In [108]:
knn_time = %timeit -n1 -r1 -o \
knn.fit(X_train, y_train)

656 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [109]:
knn_f2_cvs = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=3, scoring=f2_score, n_jobs=2)

In [111]:
my_func.print_cvs(knn_f2_cvs, 'f2')

Mean f2 score = 0.170 (+\- 0.004)


Already looking to be significantly worse than the other models tested until now, but this could also be a result of the training and/or fold sample sizes. Predictions using KNN are also likely to take some time due to its nature of having to process the entire set each time:

In [121]:
my_func.run_time(reset=True)
knn_train_pred = knn.predict(X_train)
my_func.run_time()

Time: 6min 51s 


In [118]:
my_func.run_time(reset=True)
knn_val_pred = knn.predict(X_val)
my_func.run_time()

Time: 1min 46s 


In [122]:
knn_train_summary =  my_func.classifier_summary(y_train, knn_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               94861                3453
Actual Positive               16437                7023
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.336621
Recall           0.299361
Precision (pos)  0.670389
Precision (neg)  0.852315
Specificity      0.964878


In [123]:
knn_val_summary = my_func.classifier_summary(y_val, knn_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               22943                1687
Actual Positive                4947                 867
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.167958
Recall           0.149123
Precision (pos)  0.339468
Precision (neg)  0.822625
Specificity      0.931506


KNN performs signficantly worse on the validation set. We will see if reducing the training size has a noticeable effect:

In [133]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [134]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [135]:
knn_s = KNeighborsClassifier(n_jobs=2)
knn_s.fit(X_train_sample, y_train_sample)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=2, n_neighbors=5, p=2,
           weights='uniform')

In [136]:
knn_s_f2_cvs = cross_val_score(estimator=knn_s, X=X_train_sample, y=y_train_sample, cv=3, scoring=f2_score, n_jobs=2)

In [137]:
my_func.print_cvs(knn_s_f2_cvs)

Mean CV score = 0.157 (+\- 0.012)


In [138]:
knn_s_val_pred = knn_s.predict(X_val)

In [139]:
my_func.classifier_summary(y_val, knn_s_val_pred);

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               23112                1518
Actual Positive                5010                 804
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.157166
Recall           0.138287
Precision (pos)  0.346253
Precision (neg)  0.821848
Specificity      0.938368


Reducing the sample size (although just one scenario) shows a decrease in performance. Perhaps some parameter tuning may show improvements but relative to other model performances our F2 and recall scores are sigificantly worse. Failing to exceed scores in our OneR baseline, it is unlikely KNN will be able to out perform other models even with tuning. Consequently, KNN should be a low priority model for additional tuning, but it is still worth keeping in consideration in tandem with dimensionalty reductions methods.

##### Support Vector Classifier

Support Vector Classifiers are likely to be computationally expensive due to the number of features, but we will still give it a try and see how it performs. We will begin with balanced class weight given our observations with Logistic Regression:

In [144]:
lsvc = LinearSVC(class_weight='balanced')

In [145]:
lsvc_time = %timeit -n1 -r1 -o \
lsvc.fit(X_train, y_train)

24.4 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Somewhat slower due to large number of training instances n (160k) and features m (59). (complexity $O(m\times n)$)

In [153]:
lsvc_f2_cvs = cross_val_score(estimator=lsvc, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [154]:
my_func.print_cvs(lsvc_f2_cvs)

Mean CV score = 0.533 (+\- 0.005)


In [155]:
lsvc_train_pred = lsvc.predict(X_train)
lsvc_val_pred = lsvc.predict(X_val)

In [156]:
lsvc_train_summary = my_func.classifier_summary(y_train, lsvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               64420               33894
Actual Positive                8249               15211
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.532058
Recall           0.648380
Precision (pos)  0.309765
Precision (neg)  0.886485
Specificity      0.655247


In [157]:
lsvc_val_summary = my_func.classifier_summary(y_val, lsvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               16191                8439
Actual Positive                2042                3772
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.531762
Recall           0.648779
Precision (pos)  0.308902
Precision (neg)  0.888005
Specificity      0.657369


These results are very similar to those seen with our balanced Logistic Regression model. Compared to our OneR baseline, we see slight improvements to F2 due to higher precision (and specificty). Our recall is slightly below the baseline of 0.65, but with additional parameter tuning this can hopefully be improved. Consequently, Linear SVC shows promise and should be explored further.

##### Support Vector Classifier - Polynomial

The polynomial SVC will take an exceptionally long time to fit given the size our training set. However, fitting the model on a sample of the training data may at least give some insight into how the model compares to others. Noting the improvements acheived via balanced class weights we will proceed with the same setting here:

In [160]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [161]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [162]:
psvc = SVC(kernel='poly', degree=2, class_weight='balanced')

In [163]:
psvc_time = %timeit -n1 -r1 -o \
psvc.fit(X_train_sample, y_train_sample)

22.3 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Even with 20,000 samples this is one of the slower fitting times yet.

In [164]:
psvc_f2_cvs = cross_val_score(estimator=psvc, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [165]:
my_func.print_cvs(psvc_f2_cvs, 'f2')

Mean f2 score = 0.532 (+\- 0.016)


In [166]:
my_func.run_time(reset=True)
psvc_train_pred = psvc.predict(X_train)
my_func.run_time()

Time: 1min 6s 


In [167]:
my_func.run_time(reset=True)
psvc_val_pred = psvc.predict(X_val)
my_func.run_time()

Time: 16.49s 


Even with the reduced sample these predictions take a considerable amount of time to process.

In [168]:
psvc_train_summary = my_func.classifier_summary(y_train, psvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               64484               33830
Actual Positive                8199               15261
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.533859
Recall           0.650512
Precision (pos)  0.310872
Precision (neg)  0.887195
Specificity      0.655898


In [169]:
psvc_val_summary = my_func.classifier_summary(y_val, psvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               16156                8474
Actual Positive                2084                3730
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.525945
Recall           0.641555
Precision (pos)  0.305637
Precision (neg)  0.885746
Specificity      0.655948


This performance is almost identical to our linear SVC model. Let us see how it performs using the entire training set (will take a while):

In [172]:
my_func.run_time(reset=True)
psvc.fit(X_train, y_train)
my_func.run_time()

Time: 20min 13s 


In [174]:
my_func.run_time(reset=True)
psvc_train_pred = psvc.predict(X_train)
my_func.run_time()

Time: 6min 40s 


In [173]:
my_func.run_time(reset=True)
psvc_val_pred = psvc.predict(X_val)
my_func.run_time()

Time: 1min 40s 


In [175]:
psvc_train_summary = my_func.classifier_summary(y_train, psvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               61118               37196
Actual Positive                7022               16438
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.557319
Recall           0.700682
Precision (pos)  0.306485
Precision (neg)  0.896947
Specificity      0.621661


In [176]:
psvc_val_summary = my_func.classifier_summary(y_val, psvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               15276                9354
Actual Positive                1777                4037
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.550795
Recall           0.694358
Precision (pos)  0.301471
Precision (neg)  0.895795
Specificity      0.620219


The polynomial SVC (2nd degree) is actually the best performing model yet in terms of F2 and recall, whilst still maintaining an acceptable precision and specificity. However, considering the computational complexity and the resulting time to both fit and predict, exploring this model in depth is likely to be less feasible within the constraints of this project. Thus, delving into this model further should be a lower priority.

##### Support Vector Classifier  - RBF

Continuing with SVC, we will see how the RBF kernel performs. Due to the computational costs we will start with a reduced training sample for an initial estimate on performance:

In [177]:
rand_ind = np.random.choice(np.arange(0, X_train.shape[0], 1), size=20000, replace=False)

In [178]:
X_train_sample = X_train[rand_ind]
y_train_sample = y_train.iloc[rand_ind]

In [179]:
rsvc = SVC(kernel='rbf', class_weight='balanced')

In [180]:
rsvc_time = %timeit -n1 -r1 -o \
rsvc.fit(X_train_sample, y_train_sample)

30.7 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


Comparable fit time to our polynomial SVC.

In [181]:
rsvc_f2_cvs = cross_val_score(estimator=rsvc, X=X_train_sample, y=y_train_sample, cv=5, scoring=f2_score, n_jobs=4)

In [183]:
my_func.print_cvs(rsvc_f2_cvs, 'f2')

Mean f2 score = 0.533 (+\- 0.012)


In [184]:
rsvc_train_pred = rsvc.predict(X_train_sample)
rsvc_val_pred = rsvc.predict(X_val)

In [186]:
rsvc_train_summary = my_func.classifier_summary(y_train_sample, rsvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               10649                5489
Actual Positive                1084                2778
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.585705
Recall           0.719316
Precision (pos)  0.336035
Precision (neg)  0.907611
Specificity      0.659871


In [187]:
rsvc_val_summary = my_func.classifier_summary(y_val, rsvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               15981                8649
Actual Positive                1977                3837
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.536763
Recall           0.659959
Precision (pos)  0.307304
Precision (neg)  0.889910
Specificity      0.648843


RBF performs slightly better than our polynomial SVC, albeit with a degree of underfitting with the validation set showing lower recall and precision. This may simply be a result of the sample size noting that your CV scores were lower. Given our success with polynomial SVC, we will attempt to fit the entire training set using RBF:

In [179]:
rsvc = SVC(kernel='rbf', class_weight='balanced')

In [188]:
my_func.run_time(reset=True)
rsvc.fit(X_train, y_train)
my_func.run_time()

Time: 28min 21s 


In [189]:
my_func.run_time(reset=True)
rsvc_train_pred = rsvc.predict(X_train)
my_func.run_time()

Time: 9min 51s 


In [190]:
my_func.run_time(reset=True)
rsvc_val_pred = rsvc.predict(X_val)
my_func.run_time()

Time: 2min 29s 


In [191]:
rsvc_train_summary = my_func.classifier_summary(y_train, rsvc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               63674               34640
Actual Positive                6974               16486
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.568616
Recall           0.702728
Precision (pos)  0.322458
Precision (neg)  0.901285
Specificity      0.647660


In [192]:
rsvc_val_summary = my_func.classifier_summary(y_val, rsvc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               15872                8758
Actual Positive                1879                3935
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.547303
Recall           0.676815
Precision (pos)  0.310013
Precision (neg)  0.894147
Specificity      0.644417


Comparible results to our polynomial SVC, with some slightly reduced recall and precision on the validation set. It is possible that the full training data set leads to suboptimal results. It may be worth further pursuing variations in training sizes along with parameter optimization. Additionally, whilst this model is quite computationally expensive, perhaps with feature and dimension reduction efforts both performance and accuracy can be improved.

##### Decision Tree Classifier

Our OneR baseline model is already a decision tree model, but perhaps some basic tuning can provide some immediate improvements. Firstly, we will evaluate a completely unconstrained decision tree to see how deep it extends:

In [193]:
dtc = DecisionTreeClassifier()

In [194]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [196]:
dtc.tree_.max_depth

46

The unconstrained decision tree has a depth of 46, but leaving this as is will almost certainly lead to overfitting. We will set up our decision tree with some basic contraints to limit overfitting:

In [197]:
dtc?

In [233]:
dtc = DecisionTreeClassifier(class_weight='balanced', max_depth=20, min_samples_split=100, min_samples_leaf=100)

In [234]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=20, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=100, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [235]:
dtc_f2_cvs = cross_val_score(estimator=dtc, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=4)

In [236]:
my_func.print_cvs(dtc_f2_cvs, 'f2')

Mean f2 score = 0.511 (+\- 0.006)


In [237]:
dtc_train_pred = dtc.predict(X_train)
dtc_val_pred = dtc.predict(X_val)

In [238]:
dtc_train_summary = my_func.classifier_summary(y_train, dtc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               63287               35027
Actual Positive                6258               17202
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.588831
Recall           0.733248
Precision (pos)  0.329357
Precision (neg)  0.910015
Specificity      0.643723


In [239]:
dtc_val_summary = my_func.classifier_summary(y_val, dtc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               15371                9259
Actual Positive                2108                3706
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.511582
Recall           0.637427
Precision (pos)  0.285847
Precision (neg)  0.879398
Specificity      0.624076


Given the difference between our training scores and CV / validation scores, this decision tree is clearly still suffering from  overfitting. Regardless, our validation scores are still within the acceptable ranges, and considering we arbitrarily chose certin constraints there should be room for improvement with tuning.

##### Random Forest Classifier

Random forests are simply collections of decision trees, so we can carry forward what we have learned so far with our initial model, but will constrain our tree a little more for the ensemble:

In [249]:
rfc = RandomForestClassifier(n_estimators=20, max_depth=10, min_samples_split=100, min_samples_leaf=100, class_weight='balanced', n_jobs=2)

In [250]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=100,
            min_samples_split=100, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=2, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [251]:
rfc_f2_cvs = cross_val_score(estimator=rfc, X=X_train, y=y_train, cv=5, scoring=f2_score, n_jobs=2)

In [252]:
my_func.print_cvs(rfc_f2_cvs, 'f2')

Mean f2 score = 0.531 (+\- 0.006)


This CV score is a promising start. Let us see how it fares in terms of train vs. validation scores:

In [253]:
rfc_train_pred = rfc.predict(X_train)
rfc_val_pred = rfc.predict(X_val)

In [254]:
rfc_train_summary = my_func.classifier_summary(y_train, rfc_train_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               64978               33336
Actual Positive                7380               16080
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.561233
Recall           0.685422
Precision (pos)  0.325401
Precision (neg)  0.898007
Specificity      0.660923


In [255]:
rfc_val_summary = my_func.classifier_summary(y_val, rfc_val_pred)

Confusion Matrix:
                 Predicted Negative  Predicted Positive
Actual Negative               16126                8504
Actual Positive                2054                3760
--------------------
Accuracy Scores:
                    Score
Rate                     
F2               0.529279
Recall           0.646715
Precision (pos)  0.306588
Precision (neg)  0.887019
Specificity      0.654730


Random forest shows slight improvement over individual decisions trees in terms of overfitting. Whilst Logistic Regression and SVC have peformed better thus far, given the speed at which these models can be tested and the general robustness of random forests, Random Forests should definitely be explored further.