In [91]:
import pandas as pd
import numpy as np

In [92]:
lc_df = pd.read_csv('../data/lc_cleaned.csv').drop(['title','state','policy_code','division'], axis = 1)
lc_eo2012 = lc_df[lc_df['year']<=2012]
lc_eo2012 = lc_eo2012.sort_values(['year','length_of_emp'], ascending = [True, False]).\
            reset_index().drop(['index'], axis = 1)

In [93]:
lc_eo2012.isnull().sum()

amount                      0
risk_score              23927
debt_to_income_ratio        0
length_of_emp           11167
year                        0
status                      0
region                      0
dtype: int64

In [94]:
lc_eo2012[lc_eo2012['length_of_emp'].isnull()].describe()

Unnamed: 0,amount,risk_score,debt_to_income_ratio,year,status
count,11167.0,11046.0,11167.0,11167.0,11167.0
mean,11919.244202,673.668749,10.104746,2011.224859,0.271962
std,9297.179228,137.02529,140.321002,0.845084,0.44499
min,1000.0,0.0,-0.01,2008.0,0.0
25%,5000.0,664.0,0.05435,2010.0,0.0
50%,10000.0,695.0,0.1305,2011.0,0.0
75%,16250.0,736.0,0.2107,2012.0,1.0
max,35000.0,850.0,1999.98,2012.0,1.0


In [95]:
lc_eo2012.dropna(subset = ['length_of_emp'], inplace = True)

In [96]:
lc_eo2012[lc_eo2012['risk_score'].isnull()].describe()

Unnamed: 0,amount,risk_score,debt_to_income_ratio,year,status
count,23806.0,0.0,23806.0,23806.0,23806.0
mean,11726.823266,,107.678587,2009.996051,0.0
std,9378.244384,,312.447729,1.251819,0.0
min,500.0,,-0.01,2007.0,0.0
25%,4000.0,,-0.01,2009.0,0.0
50%,10000.0,,-0.01,2010.0,0.0
75%,18700.0,,0.0985,2011.0,0.0
max,90000.0,,1999.98,2012.0,0.0


In [97]:
lc_eo2012.loc[lc_eo2012['risk_score'].isnull(), 'risk_score'] = 0

In [98]:
lc_eo2012.isnull().sum()

amount                  0
risk_score              0
debt_to_income_ratio    0
length_of_emp           0
year                    0
status                  0
region                  0
dtype: int64

dummification

In [99]:
lc_dummies = pd.get_dummies(lc_eo2012, drop_first = True)
lc_dummies.columns

Index(['amount', 'risk_score', 'debt_to_income_ratio', 'year', 'status',
       'length_of_emp_10+ years', 'length_of_emp_2 years',
       'length_of_emp_3 years', 'length_of_emp_4 years',
       'length_of_emp_5 years', 'length_of_emp_6 years',
       'length_of_emp_7 years', 'length_of_emp_8 years',
       'length_of_emp_9 years', 'length_of_emp_< 1 year', 'region_Northeast',
       'region_South', 'region_West'],
      dtype='object')

train test split

In [100]:
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, train_test_split

In [101]:
features = lc_dummies.drop(['status'], axis = 1)
target = lc_dummies['status']

In [102]:
X_train, X_test, y_train, y_test = train_test_split(features, target, 
                                                    test_size=0.2, stratify = target,
                                                    random_state = 13)

In [103]:
y_train.value_counts()

0    597872
1     74292
Name: status, dtype: int64

In [104]:
X_train.shape

(672164, 17)

### linear models

checking VIF

In [105]:
X_vif = pd.DataFrame(X_train, dtype=float)

from statsmodels.stats.outliers_influence import variance_inflation_factor 
vif_data = pd.DataFrame() 
vif_data["feature"] = X_vif.columns
vif_data["VIF"] = [variance_inflation_factor(X_vif.values, i) 
                          for i in range(len(X_vif.columns))] 
print(vif_data)

                    feature        VIF
0                    amount   2.678943
1                risk_score  10.224048
2      debt_to_income_ratio   1.000480
3                      year  40.019286
4   length_of_emp_10+ years   2.945271
5     length_of_emp_2 years   2.090122
6     length_of_emp_3 years   1.875852
7     length_of_emp_4 years   1.721389
8     length_of_emp_5 years   1.695365
9     length_of_emp_6 years   1.529877
10    length_of_emp_7 years   1.400624
11    length_of_emp_8 years   1.348266
12    length_of_emp_9 years   1.277495
13   length_of_emp_< 1 year  18.904510
14         region_Northeast   2.321241
15             region_South   3.291501
16              region_West   2.482827


simple linear regression

In [106]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, roc_auc_score

In [107]:
# logistic regression with no regularization or tuning
logit = LogisticRegression(solver='liblinear', random_state = 13, class_weight = 'balanced')
logit.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', random_state=13, solver='liblinear')

In [108]:
print('accuracy:')
print(f'train: {logit.score(X_train, y_train)}')
print(f'test: {logit.score(X_test, y_test)}')
print('--'*40)
y_train_pred = logit.predict(X_train)
y_test_pred = logit.predict(X_test)
print('confusion matrix:')
print('train:')
print(confusion_matrix(y_train, y_train_pred))
print('test:')
print(confusion_matrix(y_test, y_test_pred))
print('--'*40)
print('AUC-ROC:')
print(f'train: {roc_auc_score(y_train, y_train_pred)}')
print(f'test: {roc_auc_score(y_test, y_test_pred)}')

accuracy:
train: 0.8195425521152576
test: 0.8194904814896364
--------------------------------------------------------------------------------
confusion matrix:
train:
[[484597 113275]
 [  8022  66270]]
test:
[[121177  28291]
 [  2042  16531]]
--------------------------------------------------------------------------------
AUC-ROC:
train: 0.8512785220811595
test: 0.8503887421516707


linear regression with regularization and CV

In [None]:
logit = LogisticRegression(random_state = 13, max_iter = 1000)
logit.fit(X_train, y_train)

logit_grid_params = [{
    'C': [5e-4,5e-3,5e-2,0.5,1,5,50,500,5e3,5e4],
    'penalty':['l1','l2'],
    'solver': ['liblinear'],
    'class_weight':[None,'balanced'],
    'random_state':[13]
}]

grid_search_logit = GridSearchCV(logit, logit_grid_params, scoring='roc_auc', cv=5,
                                 verbose = 1, n_jobs=-1)
%time grid_search_logit.fit(X_train, y_train)

print(grid_search_logit.best_params_)
print(grid_search_logit.best_estimator_)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
