In [2]:
import pandas as pd
import numpy as np

from os import path
import pickle
import sys
sys.path.append('..')
import src.functions.my_functions as my_func

import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.metrics.scorer import make_scorer
from sklearn.metrics import f1_score, recall_score, precision_score, fbeta_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier

from sklearn.feature_selection import RFE, f_classif

In [3]:
# Re-import the local helper module so that edits made to my_functions.py are
# picked up without restarting the kernel.
# `importlib.reload` is used instead of `imp.reload`: the `imp` module has been
# deprecated since Python 3.4 and was removed in Python 3.12.
from importlib import reload
reload(my_func)

<module 'src.functions.my_functions' from '..\\src\\functions\\my_functions.py'>

### Data Import

In [5]:
# The pickled splits are read from ../data/processed/loan_<split>.p
d = ['..', 'data', 'processed']
ds = ['train', 'train_target', 'test', 'test_target']

# Load every split into `data`, keyed by its split name.
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point this at trusted files.
data = {}
for s in ds:
    fn = 'loan_' + s + '.p'
    fp = path.join(*d, fn)
    with open(fp, 'rb') as file:
        data[s] = pickle.load(file)

### Data Prep

In [7]:
# Columns identified so far as the most useful for classification
# (chosen during data prep via select-K-best).

# Numeric columns (scaled in the numeric preparation pipeline).
num_attr = [
    'funded_amnt_q10',
    'int_rate_delta',
    'annual_inc_q10',
    'dti',
    'delinq_2yrs',
    'inq_last_6mths',
    'open_acc',
    'revol_bal_log',
    'revol_util',
    'total_acc',
    'collections_12_mths_ex_med',
    'acc_now_delinq',
    'rev_lim_sqrt',
    'tot_cur_bal',
    'tot_coll_amt',
    'subgrade_p_value',
    'lti',
    'rbti',
    'tbti',
    'cr_line_td_log',
    'emp_length_val',
]

# Binary flag columns.
bin_attr = [
    'had_delinq',
    'had_major_derog',
    'had_record',
    'verified',
    'term_bin',
]

# Categorical columns (dummy-encoded in the categorical preparation pipeline).
cat_attr = [
    'purpose',
    'home_ownership',
]

In [8]:
# Numeric branch: create the custom numeric attributes, keep only the
# `num_attr` columns, then standardise them.
num_prep = Pipeline(steps=[
    ('custom', my_func.CustomNumAttributes()),
    ('select', my_func.DataFrame_Selector(num_attr)),
    ('sc', StandardScaler()),
])

# Binary branch: create the custom binary attributes, then keep only the
# `bin_attr` columns (no scaling).
bin_prep = Pipeline(steps=[
    ('custom', my_func.CustomBinAttributes()),
    ('select', my_func.DataFrame_Selector(bin_attr)),
])

# Categorical branch: select and dummy-encode the `cat_attr` columns.
cat_prep = Pipeline(steps=[
    ('encode', my_func.DataFrame_DummyEncoder(cat_attr)),
])

# Combine the three branches into a single feature-preparation transformer.
feature_prep = FeatureUnion(transformer_list=[
    ('num', num_prep),
    ('bin', bin_prep),
    ('cat', cat_prep),
])

In [9]:
# Targets: the 'default' entry of each target object.
y_train = data['train_target']['default']
y_test = data['test_target']['default']

# Features: fit the preparation transformer on the training split only, then
# apply the already-fitted transformer to the test split.
X_train = feature_prep.fit_transform(data['train'])
X_test = feature_prep.transform(data['test'])

### Scoring

In [11]:
# Scorer built from fbeta_score with beta=2 (the F2 score).
# Note: despite its name this is the object returned by make_scorer, not a
# plain metric function or a numeric score.
f2_score = make_scorer(fbeta_score, beta=2)

### Final Models

In [15]:
# Registry of the final candidate models, keyed by a short model name.
# Re-running this cell resets the registry to empty.
models = {}

In [25]:
# Logistic Regression
# Step 1: RFE driven by a class-balanced logistic regression keeps 28 features.
# Step 2: a class-balanced logistic regression with C=1e-8 is fit on them.
lr = Pipeline(steps=[
    ('rfe', RFE(
        estimator=LogisticRegression(class_weight='balanced'),
        n_features_to_select=28,
    )),
    ('estimator', LogisticRegression(class_weight='balanced', C=1e-8)),
])

models['lr'] = lr

# Bagged Logistic Regression
# Same RFE front-end as the plain logistic-regression pipeline (28 features),
# followed by a bagging ensemble of 10 class-balanced logistic regressions.
# Both rows and features are drawn with replacement (bootstrap=True,
# bootstrap_features=True).
lr_bag = Pipeline(steps=[
    ('rfe', RFE(
        estimator=LogisticRegression(class_weight='balanced'),
        n_features_to_select=28,
    )),
    ('estimator', BaggingClassifier(
        base_estimator=LogisticRegression(class_weight='balanced', C=1e-8),
        n_estimators=10,
        max_samples=0.2,
        max_features=0.8,
        bootstrap=True,
        bootstrap_features=True,
        random_state=998,
    )),
])

models['lr_bag'] = lr_bag

In [26]:
# Linear SVC
# Step 1: RFE driven by a class-balanced LinearSVC keeps 31 features.
# Step 2: a class-balanced LinearSVC with C=1e-7 is fit on them.
lsvc = Pipeline(steps=[
    ('rfe', RFE(
        estimator=LinearSVC(class_weight='balanced', dual=False),
        n_features_to_select=31,
    )),
    ('estimator', LinearSVC(class_weight='balanced', dual=False, C=1e-7)),
])

models['lsvc'] = lsvc

# Bagged Linear SVC
# Bagging ensemble of 10 class-balanced LinearSVC models; rows and features are
# drawn without replacement (bootstrap=False, bootstrap_features=False).
# Unlike the plain LinearSVC pipeline, there is no RFE step here.
lsvc_bag = BaggingClassifier(
    base_estimator=LinearSVC(class_weight='balanced', dual=False, C=1e-7),
    n_estimators=10,
    max_samples=0.2,
    max_features=0.8,
    bootstrap=False,
    bootstrap_features=False,
    random_state=521,
)

models['lsvc_bag'] = lsvc_bag

In [27]:
# Decision Tree
# Shallow (max_depth=4), class-balanced tree with max_features=0.4 and a
# fixed seed.
dtc = DecisionTreeClassifier(
    class_weight='balanced',
    max_depth=4,
    max_features=0.4,
    random_state=356,
)

models['dtc'] = dtc

# Bagged Decision Tree
# Bagging ensemble of 10 class-balanced depth-1 trees; rows and features are
# drawn without replacement (bootstrap=False, bootstrap_features=False).
dtc_bag = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(class_weight='balanced', max_depth=1),
    n_estimators=10,
    max_samples=0.2,
    max_features=0.7,
    bootstrap=False,
    bootstrap_features=False,
    random_state=321,
)

models['dtc_bag'] = dtc_bag

In [28]:
# Random Forest
# Class-balanced forest of 10 trees with max_features=0.5,
# min_samples_leaf=0.095 and a fixed seed.
rfc = RandomForestClassifier(
    n_estimators=10,
    class_weight='balanced',
    max_features=0.5,
    min_samples_leaf=0.095,
    random_state=808,
)

models['rfc'] = rfc

# Bagged Random Forest
# Bagging ensemble of 10 class-balanced random forests (max_depth=5,
# min_samples_leaf=0.05). Only the outer ensemble sets random_state; the inner
# forest does not specify one.
rfc_bag = BaggingClassifier(
    base_estimator=RandomForestClassifier(
        class_weight='balanced',
        max_depth=5,
        min_samples_leaf=0.05,
    ),
    n_estimators=10,
    max_samples=0.3,
    max_features=0.2,
    random_state=748,
)

models['rfc_bag'] = rfc_bag

In [29]:
# Bagged Gaussian Naive Bayes
# Bagging ensemble of 50 GaussianNB models with max_samples=0.2 and
# max_features=0.5.
# random_state is pinned so the row/feature draws (and therefore the fitted
# ensemble and its scores) are reproducible across kernel restarts; this was
# the only bagging ensemble among the final models without a fixed seed.
gnb_bag = BaggingClassifier(
    base_estimator=GaussianNB(),
    n_estimators=50,
    max_samples=0.2,
    max_features=0.5,
    random_state=613,
)

models['gnb_bag'] = gnb_bag

In [30]:
# Hard Voting Classifier
# Each row is (label, estimator, vote weight); keeping the weight next to its
# estimator makes the pairing explicit. The labels are the names used inside
# the VotingClassifier and intentionally reproduce the existing ones.
vc_hard_members = [
    ('lr', lr, 1),
    ('lsvc', lsvc, 0.5),
    ('lsvc_bag', lsvc_bag, 0.5),
    ('dtc', dtc, 0.5),
    ('dtc_bag_best', dtc_bag, 1),
    ('rfc_bag', rfc_bag, 0.5),
    ('gnb', gnb_bag, 0.5),
]

# Split the table into the two arguments VotingClassifier expects.
vc_hard_estimators = [(label, est) for label, est, _ in vc_hard_members]
vc_hard_w = tuple(weight for _, _, weight in vc_hard_members)

vc_hard = VotingClassifier(
    estimators=vc_hard_estimators,
    voting='hard',
    weights=vc_hard_w,
    n_jobs=4,
)

models['vc_hard'] = vc_hard

In [31]:
# Soft Voting Classifier
# Each row is (label, estimator, vote weight). The LinearSVC-based models are
# not part of this ensemble -- presumably because soft voting needs class
# probabilities; confirm before adding them.
vc_soft_members = [
    ('lr', lr, 1),
    ('lr_bag', lr_bag, 0.5),
    ('dtc', dtc, 1),
    ('dtc_bag_best', dtc_bag, 0.5),
    ('rfc_bag', rfc_bag, 1),
    ('gnb', gnb_bag, 0.5),
]

# Split the table into the two arguments VotingClassifier expects.
vc_soft_estimators = [(label, est) for label, est, _ in vc_soft_members]
vc_soft_w = tuple(weight for _, _, weight in vc_soft_members)

vc_soft = VotingClassifier(
    estimators=vc_soft_estimators,
    voting='soft',
    weights=vc_soft_w,
    n_jobs=4,
)

models['vc_soft'] = vc_soft

In [32]:
# Hard Voting Classifier (Random)

In [33]:
# Soft Voting Classifier (Random)

### Test Performance