In [1]:
import pandas as pd
import numpy as np

In [2]:
lc_df = pd.read_csv('../data/lc_cleaned.csv').drop(['title','state','policy_code','division'], axis = 1)
lc_eo2012 = lc_df[lc_df['year']<=2012]
lc_eo2012 = lc_eo2012.sort_values(['year','length_of_emp'], ascending = [True, False]).\
            reset_index().drop(['index'], axis = 1)

In [3]:
lc_eo2012.isnull().sum()

amount                      0
risk_score              23927
debt_to_income_ratio        0
length_of_emp           11167
year                        0
status                      0
region                      0
dtype: int64

In [4]:
lc_eo2012[lc_eo2012['length_of_emp'].isnull()].describe()

Unnamed: 0,amount,risk_score,debt_to_income_ratio,year,status
count,11167.0,11046.0,11167.0,11167.0,11167.0
mean,11919.244202,673.668749,10.104746,2011.224859,0.271962
std,9297.179228,137.02529,140.321002,0.845084,0.44499
min,1000.0,0.0,-0.01,2008.0,0.0
25%,5000.0,664.0,0.05435,2010.0,0.0
50%,10000.0,695.0,0.1305,2011.0,0.0
75%,16250.0,736.0,0.2107,2012.0,1.0
max,35000.0,850.0,1999.98,2012.0,1.0


In [5]:
lc_eo2012.dropna(subset = ['length_of_emp'], inplace = True)

In [6]:
lc_eo2012[lc_eo2012['risk_score'].isnull()].describe()

Unnamed: 0,amount,risk_score,debt_to_income_ratio,year,status
count,23806.0,0.0,23806.0,23806.0,23806.0
mean,11726.823266,,107.678587,2009.996051,0.0
std,9378.244384,,312.447729,1.251819,0.0
min,500.0,,-0.01,2007.0,0.0
25%,4000.0,,-0.01,2009.0,0.0
50%,10000.0,,-0.01,2010.0,0.0
75%,18700.0,,0.0985,2011.0,0.0
max,90000.0,,1999.98,2012.0,0.0


In [7]:
lc_eo2012.loc[lc_eo2012['risk_score'].isnull(), 'risk_score'] = 0

In [8]:
lc_eo2012.isnull().sum()

amount                  0
risk_score              0
debt_to_income_ratio    0
length_of_emp           0
year                    0
status                  0
region                  0
dtype: int64

dummification

In [9]:
lc_dummies = pd.get_dummies(lc_eo2012, drop_first = True)
lc_dummies.columns

Index(['amount', 'risk_score', 'debt_to_income_ratio', 'year', 'status',
       'length_of_emp_10+ years', 'length_of_emp_2 years',
       'length_of_emp_3 years', 'length_of_emp_4 years',
       'length_of_emp_5 years', 'length_of_emp_6 years',
       'length_of_emp_7 years', 'length_of_emp_8 years',
       'length_of_emp_9 years', 'length_of_emp_< 1 year', 'region_Northeast',
       'region_South', 'region_West'],
      dtype='object')

train test split

In [10]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [11]:
features = lc_dummies.drop(['status'], axis = 1)
target = lc_dummies['status']

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                                    test_size=0.2, stratify = target,
                                                    random_state = 13)

In [13]:
y_train.value_counts()

0    597872
1     74292
Name: status, dtype: int64

In [14]:
X_train.shape

(672164, 17)

random undersampling

In [15]:
from imblearn.under_sampling import RandomUnderSampler

In [16]:
rus = RandomUnderSampler(random_state = 13)
X_train, y_train = rus.fit_sample(X_train, y_train)
print(f'RandomUnderSampler: {X_train.shape}')

RandomUnderSampler: (148584, 17)


In [17]:
y_train.value_counts()

1    74292
0    74292
Name: status, dtype: int64

### linear models

checking VIF

In [18]:
X_vif = pd.DataFrame(X_train, dtype=float)

from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

                    feature        VIF
0                    amount   2.955938
1                risk_score  17.174929
2      debt_to_income_ratio   1.008414
3                      year  40.358270
4   length_of_emp_10+ years   3.842012
5     length_of_emp_2 years   2.222821
6     length_of_emp_3 years   2.021623
7     length_of_emp_4 years   1.920220
8     length_of_emp_5 years   1.933349
9     length_of_emp_6 years   1.714004
10    length_of_emp_7 years   1.568887
11    length_of_emp_8 years   1.464795
12    length_of_emp_9 years   1.377927
13   length_of_emp_< 1 year   8.779068
14         region_Northeast   2.437400
15             region_South   3.306269
16              region_West   2.704959


simple linear regression

In [19]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [20]:
# logistic regression with no regularization or tuning
logit = LogisticRegression(solver='liblinear', random_state = 13)
logit.fit(X_train, y_train)

LogisticRegression(random_state=13, solver='liblinear')

In [21]:
print('accuracy:')
print(f'train: {logit.score(X_train, y_train)}')
print(f'test: {logit.score(X_test, y_test)}')
print('--'*40)
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)
print('confusion matrix:')
print('train:')
print(confusion_matrix(y_train, y_train_pred))
print('test:')
print(confusion_matrix(y_test, y_test_pred))
print('--'*40)
print('AUC-ROC:')
print(f'train: {roc_auc_score(y_train, y_train_pred)}')
print(f'test: {roc_auc_score(y_test, y_test_pred)}')

accuracy:
train: 0.8505828352985516
test: 0.8180087002576752
--------------------------------------------------------------------------------
confusion matrix:
train:
[[60030 14262]
 [ 7939 66353]]
test:
[[120896  28572]
 [  2010  16563]]
--------------------------------------------------------------------------------
AUC-ROC:
train: 0.8505828352985516
test: 0.8503102071847372


linear regression with regularization and CV

In [22]:
logit = LogisticRegression(random_state = 13)
logit.fit(X_train, y_train)

logit_grid_params = [{
    'C': np.linspace(1e-3,1,30),
    'penalty':['l1','l2'],
    'solver': ['liblinear'],
    'class_weight':[None],
    'random_state':[13]
}]

logit_grid_search = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=3, verbose = 1, n_jobs=-1)
%time logit_grid_search.fit(X_train, y_train)

print(logit_grid_search.best_params_)
print(logit_grid_search.best_estimator_)

Fitting 3 folds for each of 60 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  4.1min finished


Wall time: 4min 6s
{'C': 0.44882758620689656, 'class_weight': None, 'penalty': 'l2', 'random_state': 13, 'solver': 'liblinear'}
LogisticRegression(C=0.44882758620689656, random_state=13, solver='liblinear')


In [23]:
print('accuracy:')
print(logit_grid_search.best_estimator_.score(X_train, y_train))
print(logit_grid_search.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = logit_grid_search.best_estimator_.predict(X_train)
y_test_pred = logit_grid_search.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.8499232757228234
0.8156937890157759
------------------------------------------------------------
confusion matrix:
[[59822 14470]
 [ 7829 66463]]
[[120502  28966]
 [  2005  16568]]
------------------------------------------------------------
AUC-ROC:
0.8499232757228234
0.8491268033118788


linear with lasso

In [37]:
logit = LogisticRegression(random_state = 13)
logit.fit(X_train, y_train)

logit_grid_params = [{
    'C': np.linspace(1e-3,0.1,30), #np.linspace(1e-3,1,30),
    'penalty':['l1'],
    'solver': ['liblinear'],
    'class_weight':[None],
    'random_state':[13]
}]

logit_grid_search = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=3, verbose = 1, n_jobs=-1)
%time logit_grid_search.fit(X_train, y_train)

print(logit_grid_search.best_params_)
print(logit_grid_search.best_estimator_)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  8.4min finished


Wall time: 8min 38s
{'C': 0.06244827586206897, 'class_weight': None, 'penalty': 'l1', 'random_state': 13, 'solver': 'liblinear'}
LogisticRegression(C=0.06244827586206897, penalty='l1', random_state=13,
                   solver='liblinear')




In [38]:
print('accuracy:')
print(logit_grid_search.best_estimator_.score(X_train, y_train))
print(logit_grid_search.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = logit_grid_search.best_estimator_.predict(X_train)
y_test_pred = logit_grid_search.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.8508587734883971
0.8133431722020221
------------------------------------------------------------
confusion matrix:
[[59594 14698]
 [ 7462 66830]]
[[120003  29465]
 [  1901  16672]]
------------------------------------------------------------
AUC-ROC:
0.8508587734883972
0.8502573127893437


In [42]:
logit_grid_search.best_estimator_.coef_

array([[-2.01411270e-05,  5.50206712e-03, -2.62339822e+00,
        -7.00813998e-04,  6.56410720e-01,  7.28953978e-02,
         1.22703086e-01,  3.01943457e-01,  4.68129374e-01,
         4.44959834e-01,  5.45105426e-01,  3.71213616e-01,
         5.62794936e-01, -2.87227799e+00,  1.06283778e-01,
        -5.79292501e-03,  2.20708258e-01]])

In [43]:
# non-zero beta coefficients
lasso_cv_coefs = pd.DataFrame(
    {'features': X_train.columns,
     'coef': logit_grid_search.best_estimator_.coef_[0]}).sort_values('coef', ascending = False)

# non-zero coefficients
print(len(lasso_cv_coefs[lasso_cv_coefs['coef']!=0]))
lasso_cv_coefs[lasso_cv_coefs['coef']!=0]

17


Unnamed: 0,features,coef
4,length_of_emp_10+ years,0.656411
12,length_of_emp_9 years,0.562795
10,length_of_emp_7 years,0.545105
8,length_of_emp_5 years,0.468129
9,length_of_emp_6 years,0.44496
11,length_of_emp_8 years,0.371214
7,length_of_emp_4 years,0.301943
16,region_West,0.220708
6,length_of_emp_3 years,0.122703
14,region_Northeast,0.106284


stochastic gradient descent

In [45]:
sgd = SGDClassifier(random_state = 13)

In [61]:
# SGD CV
sgd_grid_params = [{
    'class_weight':[None],
    'learning_rate':['optimal'],
    'alpha': np.linspace(1,200,30),
    'random_state': [13]}]
sgd_grid_search = GridSearchCV(sgd, sgd_grid_params, scoring='roc_auc', cv=3, n_jobs=-1)
%time sgd_grid_search.fit(X_train, y_train)
print(sgd_grid_search.best_params_)

Wall time: 51.1 s
{'alpha': 151.9655172413793, 'class_weight': None, 'learning_rate': 'optimal', 'random_state': 13}


In [62]:
print('accuracy:')
print(sgd_grid_search.best_estimator_.score(X_train, y_train))
print(sgd_grid_search.best_estimator_.score(X_test, y_test))
print('--'*30)
y_train_pred = sgd_grid_search.best_estimator_.predict(X_train)
y_test_pred = sgd_grid_search.best_estimator_.predict(X_test)
print('confusion matrix:')
print(confusion_matrix(y_train, y_train_pred))
print(confusion_matrix(y_test, y_test_pred))
print('--'*30)
print('AUC-ROC:')
print(roc_auc_score(y_train, y_train_pred))
print(roc_auc_score(y_test, y_test_pred))

accuracy:
0.6267767727346147
0.8030480656506448
------------------------------------------------------------
confusion matrix:
[[63292 11000]
 [44455 29837]]
[[127520  21948]
 [ 11148   7425]]
------------------------------------------------------------
AUC-ROC:
0.6267767727346147
0.6264665349670662
