In [1]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [21]:
train_df = pd.read_csv(Path('Resources/2019loans.csv'))
test_df = pd.read_csv(Path('Resources/2020Q1loans.csv'))

test_df.columns

Index(['loan_amnt', 'int_rate', 'installment', 'home_ownership', 'annual_inc',
       'verification_status', 'pymnt_plan', 'dti', 'delinq_2yrs',
       'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'total_acc',
       'initial_list_status', 'out_prncp', 'out_prncp_inv', 'total_pymnt',
       'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int',
       'total_rec_late_fee', 'recoveries', 'collection_recovery_fee',
       'last_pymnt_amnt', 'collections_12_mths_ex_med', 'policy_code',
       'application_type', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m',
       'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m',
       'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi',
       'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal',
       'bc_open_to_buy', 'bc_util', 'chargeoff_within_12_mths', 'delinq_amnt',
       'mo_sin_old_il_acct', 'mo_sin_old_rev_tl_op', 'm

In [16]:
#check for null values in train_df
# for column in train_df.columns:
    
#     print(f'Column {column} has {train_df[column].isnull().sum()} null values')
    

In [45]:
# Convert categorical data to numeric and separate target feature for training data
X_train = train_df.drop('target', axis=1)
X_train

X_train_dummies=pd.get_dummies(X_train)
X_train_dummies

y=pd.get_dummies(train_df['target'])
y_train=y[['high_risk']]
y_train



Unnamed: 0,high_risk
0,0
1,0
2,0
3,0
4,0
...,...
12175,1
12176,1
12177,1
12178,1


In [17]:
#check for null values in test_df
# for column in test_df.columns:
    
#     print(f'Column {column} has {test_df[column].isnull().sum()} null values')

In [46]:
# Convert categorical data to numeric and separate target feature for testing data
X_test = test_df.drop('target', axis=1)
X_test

X_test_dummies=pd.get_dummies(X_test)
X_test_dummies

y_t=pd.get_dummies(test_df['target'])
y_test=y[['high_risk']]
y_test



Unnamed: 0,high_risk
0,0
1,0
2,0
3,0
4,0
...,...
12175,1
12176,1
12177,1
12178,1


In [48]:
#Code from RL to help us find out what is missing is the testing set
train_list = X_train_dummies.columns.to_list()
test_list = X_test_dummies.columns.to_list()
# print(train_list)


# check for differences between column lists
# print(list(set(train_list).symmetric_difference(set(test_list))))

print("missing from train_list: ",list(set(test_list) - set(train_list)))
print("missing from test_list: ", list(set(train_list) - set(test_list)))

print(f'train column count: {len(train_list)}')
print(f'test column count:  {len(test_list)}')


missing from train_list:  []
missing from test_list:  ['debt_settlement_flag_Y']
train column count: 92
test column count:  91


In [36]:
# add missing dummy variables to testing set
X_test_dummies['debt_settlement_flag_Y']='0'
X_test_dummies


Unnamed: 0,loan_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,...,verification_status_Verified,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,hardship_flag_Y,debt_settlement_flag_N,debt_settlement_flag_Y
0,40000.0,0.1033,856.40,128700.0,12.47,0.0,1.0,8.0,0.0,38113.0,...,0,1,0,1,1,0,0,1,1,0
1,24450.0,0.1430,572.72,44574.0,15.05,0.0,1.0,6.0,0.0,1665.0,...,0,1,0,1,1,0,1,0,1,0
2,13500.0,0.1430,316.23,60000.0,28.72,0.0,0.0,8.0,0.0,13857.0,...,0,1,0,1,1,0,0,1,1,0
3,10625.0,0.1774,268.31,60000.0,15.70,0.0,4.0,17.0,0.0,6216.0,...,1,1,0,1,1,0,1,0,1,0
4,6375.0,0.1862,232.46,60000.0,35.50,0.0,0.0,13.0,0.0,12681.0,...,0,1,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4697,30000.0,0.1240,673.42,140480.0,15.74,0.0,0.0,20.0,0.0,23215.0,...,0,1,1,0,1,0,1,0,1,0
4698,24000.0,0.0756,747.22,50000.0,26.81,0.0,0.0,9.0,0.0,459.0,...,0,1,0,1,1,0,1,0,1,0
4699,10000.0,0.2305,387.36,33000.0,38.51,0.0,2.0,7.0,0.0,6342.0,...,1,1,1,0,1,0,1,0,1,0
4700,8000.0,0.1862,205.86,38000.0,16.36,0.0,1.0,8.0,1.0,11636.0,...,0,1,0,1,1,0,1,0,1,0


In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score

In [None]:
# Train a Random Forest Classifier model and print the model score

In [None]:
# Scale the data

In [None]:
# Train the Logistic Regression model on the scaled data and print the model score

In [None]:
# Train a Random Forest Classifier model on the scaled data and print the model score