[View in Colaboratory](https://colab.research.google.com/github/dwy904/credit_risk_modeling_hcg/blob/master/home_credit_default_risk.ipynb)

In [0]:
# !pip install kaggle
# !pip install boto3
# !pip install pandas
# !pip install numpy

# !export PATH = "/content/.local/bin/kaggle"
# !export PATH = "/content/.local/lib/python3.6/site-packages/kaggle-1.3.12.dist-info/*"
# !export PATH = "/content/.local/lib/python3.6/site-packages/kaggle/*"
# !ln -s ~/.local/bin/kaggle /usr/bin/kaggle
# !wget https://github.com/fatenaught/home_risk/blob/master/kaggle.json
# !cp kaggle.json .kaggle 
# !chmod 600 .kaggle/kaggle.json
# !kaggle competitions download -c home-credit-default-risk


In [0]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from google.colab import files

# Functions

In [0]:
def accuracy_check(score, cm):
  """Draw a confusion matrix as a heatmap, titled with the given score.

  Parameters
  ----------
  score : value interpolated into the title ('Accuracy Score: <score>').
  cm : the confusion matrix handed to seaborn's heatmap; the y axis is
    labelled as the actual label and the x axis as the predicted label.

  Returns
  -------
  None. The plot is drawn on a newly created 9x9 pyplot figure.
  """
  heatmap_options = dict(annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
  title_text = 'Accuracy Score: {0}'.format(score)

  plt.figure(figsize=(9,9))
  sns.heatmap(cm, **heatmap_options)
  plt.xlabel('Predicted label')
  plt.ylabel('Actual label')
  plt.title(title_text, size=15)

# Data Cleaning And Exploration

In [0]:
# Load every raw table as a DataFrame directly from the "home-risk" S3 bucket.
# Each read downloads a CSV over HTTPS, so this cell needs network access.

# Main application tables (train is the table that carries the TARGET label).
application_train_original = pd.read_csv("https://s3.amazonaws.com/home-risk/application_train.csv")
application_test = pd.read_csv("https://s3.amazonaws.com/home-risk/application_test.csv")
# Credit bureau records and their monthly balance history.
bureau = pd.read_csv("https://s3.amazonaws.com/home-risk/bureau.csv")
bureau_balance = pd.read_csv("https://s3.amazonaws.com/home-risk/bureau_balance.csv")
# Remaining supplementary tables (credit card, instalments, previous applications, POS/cash).
credit_card_balance = pd.read_csv("https://s3.amazonaws.com/home-risk/credit_card_balance.csv")
install_payments = pd.read_csv("https://s3.amazonaws.com/home-risk/installments_payments.csv")
previous_applications = pd.read_csv("https://s3.amazonaws.com/home-risk/previous_application.csv")
pos_cash = pd.read_csv("https://s3.amazonaws.com/home-risk/POS_CASH_balance.csv")

In [172]:
# Preview the first three rows of the raw training table.
application_train_original.head(3)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Label distribution of the raw training table (printed as a Counter).
print(Counter(application_train_original['TARGET'])) #imbalanced issue

# Convert the categorical columns into indicator columns.
application_train = pd.get_dummies(application_train_original)

# Before using 10000000 as the fill value, check that every column containing
# missing values stays below it: prints True when the largest observed value in
# those columns is smaller than the fill value.
columns_with_gaps = application_train.isnull().any()
largest_observed = application_train.loc[:, columns_with_gaps].max().max()
print(largest_observed < 10000000)

# Replace every missing value with the out-of-range fill value.
application_train = application_train.fillna(10000000)

# print(application_train.isnull().values.any())
# print(application_train.shape)
# application_train = application_train.fillna(application_train.mean()['SK_ID_CURR':'AMT_REQ_CREDIT_BUREAU_YEAR'])


# print(application_train.dtypes)

Counter({0: 282686, 1: 24825})
True


# Main Table Modeling

## Logistic Regression

In [0]:
# Rebuild the modelling frame from the raw table: indicator-encode the
# categorical columns, then fill missing values with column means.
application_train = pd.get_dummies(application_train_original)
column_means = application_train.mean()
# Only the means whose labels fall in the SK_ID_CURR..AMT_REQ_CREDIT_BUREAU_YEAR
# slice are used as fill values.
# NOTE(review): the means are computed on the full table before the split below,
# so the held-out rows contribute to the imputation values — confirm this is acceptable.
application_train = application_train.fillna(
    column_means['SK_ID_CURR':'AMT_REQ_CREDIT_BUREAU_YEAR']
)

# Label vector, and the feature matrix without the row id and the label.
application_target = application_train['TARGET']
application_data = application_train.drop(columns=['SK_ID_CURR', 'TARGET'])

# Hold out 25% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    application_data,
    application_target,
    test_size=0.25,
    random_state=0,
)

In [0]:
# Class-weighted logistic regression baseline on the main application table.
model_logit = LogisticRegression(class_weight = 'balanced')
model_logit.fit(x_train, y_train)

# ROC AUC is a ranking metric, so it is computed from the predicted probability
# of the positive class (column 1 of predict_proba) rather than from hard 0/1
# predictions. Train and test scores are stored under separate names; previously
# the test score overwrote the train score.
logit_score_train = metrics.roc_auc_score(y_train, model_logit.predict_proba(x_train)[:, 1])
logit_auc_test = metrics.roc_auc_score(y_test, model_logit.predict_proba(x_test)[:, 1])

# Accuracy and confusion matrix on the held-out split, from hard predictions.
logit_score_test = model_logit.score(x_test, y_test)
logit_cm_test = metrics.confusion_matrix(y_test, model_logit.predict(x_test))

# Plot the test confusion matrix, titled with the test accuracy. The names passed
# here are the ones defined above (the old call referenced undefined variables).
accuracy_check(logit_score_test, logit_cm_test)

## Random Forest


**Model V1**


**Features:** main table<br>
**Optimal tuning:**

* max_depth: 10, n_tree: 20, max_feature: None, train_roc: 0.816, val_roc: 0.733
* max_depth: 10, n_tree: 70, max_feature: 48, train_roc: 0.816, val_roc: 0.741







In [0]:
# Step 1: set aside 10% of the rows as the final test set.
x_train_ori, x_test, y_train_ori, y_test = train_test_split(
    application_data,
    application_target,
    test_size=0.1,
    random_state=1,
)

# Step 2: split the remaining rows 75/25 into training and validation sets.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_ori,
    y_train_ori,
    test_size=0.25,
    random_state=1,
)

In [0]:
# Tuning log: one entry per fitted configuration, kept in parallel lists that a
# later cell assembles into a table. The fitted models themselves are kept too,
# so the best one can be evaluated on the test set afterwards.
model_rf_version = []
para_max_depth = []
para_n_tree = []
para_max_feature = []
metric_roc_val = []
metric_roc_train = []

# Every combination to try, with n_tree varying slowest (same order as nested loops).
rf_grid = [
  (value_n_tree, value_max_depth, value_max_feature)
  for value_n_tree in [90, 120, 150]
  for value_max_depth in [10]
  for value_max_feature in [50]
]

for i, (value_n_tree, value_max_depth, value_max_feature) in enumerate(rf_grid, start=1):

  # random_state pins the bootstrap samples and feature subsets, so differences
  # between runs come from the hyper-parameters and the log can be reproduced.
  model_rf = \
    RandomForestClassifier(class_weight = 'balanced', verbose = 0, n_jobs = -1, n_estimators = value_n_tree,
                           max_depth = value_max_depth, max_features = value_max_feature,
                           random_state = 1)
  model_rf.fit(x_train, y_train)

  # Score with the positive-class probability (column 1). The built-in round()
  # works whether the metric returns a NumPy scalar or a plain float.
  roc_val = round(metrics.roc_auc_score(y_val, model_rf.predict_proba(x_val)[:, 1]), 3)
  roc_train = round(metrics.roc_auc_score(y_train, model_rf.predict_proba(x_train)[:, 1]), 3)

  model_rf_version.append(model_rf)
  para_max_depth.append(value_max_depth)
  para_n_tree.append(value_n_tree)
  # Stored as text so that a non-numeric setting (e.g. None) can share the column.
  para_max_feature.append(str(value_max_feature))
  metric_roc_val.append(roc_val)
  metric_roc_train.append(roc_train)

  print('iter', i, '=>', 'max_depth:', value_max_depth, 'n_tree:', value_n_tree, 'max_feature:', value_max_feature,
        'train_roc:', roc_train, 'val_roc:', roc_val)

In [0]:
# Assemble the tuning log into a single table; `columns` fixes the column order.
tuning_columns = ['max_depth', 'n_tree', 'max_feature', 'roc_val', 'roc_train']
model_rf_tuning = pd.DataFrame(
    {
        'max_depth': para_max_depth,
        'n_tree': para_n_tree,
        'max_feature': para_max_feature,
        'roc_val': metric_roc_val,
        'roc_train': metric_roc_train,
    },
    columns=tuning_columns,
)

# Write the table without its index, then hand the file to the browser via Colab.
tuning_csv_name = 'model_rf_tuning_indepth.csv'
model_rf_tuning.to_csv(tuning_csv_name, index = False, index_label = False)
files.download(tuning_csv_name)

In [196]:
# Importance of each feature according to `model_rf`, i.e. the forest fitted last
# by the tuning loop.
# Rank on the full-precision importances and round only afterwards: rounding to
# two decimals first turns many features into ties (e.g. 0.02) whose relative
# order, and therefore the top-10 cut, would be arbitrary.
feature_importance = (
    pd.DataFrame({
        'feature_name': x_train.columns,
        'importance': model_rf.feature_importances_,
    }, columns=['feature_name', 'importance'])
    .sort_values(by = 'importance', ascending = False)
    .round({'importance': 2})
)
feature_importance.head(10)

Unnamed: 0,feature_name,importance
28,EXT_SOURCE_2,0.2
29,EXT_SOURCE_3,0.18
27,EXT_SOURCE_1,0.05
6,DAYS_BIRTH,0.04
7,DAYS_EMPLOYED,0.04
9,DAYS_ID_PUBLISH,0.02
129,NAME_EDUCATION_TYPE_Higher education,0.02
2,AMT_CREDIT,0.02
3,AMT_ANNUITY,0.02
4,AMT_GOODS_PRICE,0.02


In [218]:
# Evaluate the forest with the highest validation ROC AUC on the held-out test set.
# The probabilities are passed to the metric unrounded: rounding them first would
# create artificial ties and distort the ranking the AUC measures. Only the final
# score is rounded for display.
best_rf = model_rf_version[int(np.argmax(metric_roc_val))]
round(metrics.roc_auc_score(y_test, best_rf.predict_proba(x_test)[:, 1]), 3)

0.749

In [0]:
# Final random forest with hand-picked hyper-parameters, fitted on the training split.
# random_state makes the fitted forest repeatable across kernel restarts; the seed
# matches the one used for the train/validation/test split.
model_rf_ultimate = RandomForestClassifier(
    class_weight = 'balanced',
    verbose = 0,
    n_jobs = -1,
    n_estimators = 50,
    max_depth = 10,
    max_features = 65,
    random_state = 1,
)
model_rf_ultimate.fit(x_train, y_train)

# Merge Bureau Table

In [22]:
# Column names of the bureau table.
print(bureau.columns.tolist())
#print(bureau.groupby(['SK_ID_BUREAU','CREDIT_ACTIVE']).size().unstack())

# First rows of the monthly bureau balance table.
print(bureau_balance.head(3))

# Row counts per (SK_ID_BUREAU, MONTHS_BALANCE), pivoted so that each month is a
# column; combinations without any row show up as NaN.
bureau_month_counts = bureau_balance.groupby(['SK_ID_BUREAU','MONTHS_BALANCE']).size()
print(bureau_month_counts.unstack())


['SK_ID_CURR', 'SK_ID_BUREAU', 'CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE', 'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_TYPE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY']
   SK_ID_BUREAU  MONTHS_BALANCE STATUS
0       5715448               0      C
1       5715448              -1      C
2       5715448              -2      C
MONTHS_BALANCE  -96  -95  -94  -93  -92  -91  -90  -89  -88  -87 ...   -9   \
SK_ID_BUREAU                                                     ...         
5001709         1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0  1.0 ...   1.0   
5001710         NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   1.0   
5001711         NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   NaN   
5001712         NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN ...   1.0   
5001713         NaN  NaN  NaN  NaN  N