In [21]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

mort = pd.read_csv('../data/data.csv', header=0) # data about mortgage
mort.head()

Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,orig_apprd_val_amt,pur_prc_amt,DTI Ratio,Status,OUTCOME,State,Median_state_inc,UPB>Appraisal
0,37,75650.0,85,669,N,1707,6000,169000,160000,0.2845,Active,non-default,FL,40171,0
1,46,390775.0,102,684,N,0,5025,310000,309000,0.0,Pay-off,non-default,NY,44228,1
2,30,112500.0,90,662,Y,1812,4800,177000,176450,0.3775,Pay-off,non-default,CA,49894,0
3,24,85250.0,97,647,N,3395,6934,113000,110000,0.489616,Pay-off,non-default,GA,43217,0
4,35,114000.0,100,791,N,3801,5504,103000,103000,0.690589,Pay-off,non-default,NH,57352,1


## 전처리 과정

In [22]:
## First_home 문자형 데이터 변환
mort['First_home'] = np.where(mort['First_home']=='Y',1,0)

## OUTCOME 데이터 변환
mort['OUTCOME'] = np.where(mort['OUTCOME']=='non-default',0,1) # non-default = 0, default = 1

## string 타입을 정수로 변환하기
for k in range(0,10607):
    mort.Median_state_inc[k] = np.float64(mort.Median_state_inc[k].replace(",", ""))
    
## Separating 'target' feature
outcome = mort.OUTCOME
mort.drop(['OUTCOME'], axis='columns', inplace=True)

## Delete feature
mort = mort.drop('Status', axis=1)
mort = mort.drop('State', axis=1)
mort = mort.drop('orig_apprd_val_amt', axis=1)
mort = mort.drop('UPB>Appraisal', axis=1)

mort

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,pur_prc_amt,DTI Ratio,Median_state_inc
0,37,75650.0,85,669,0,1707,6000,160000,0.284500,40171
1,46,390775.0,102,684,0,0,5025,309000,0.000000,44228
2,30,112500.0,90,662,1,1812,4800,176450,0.377500,49894
3,24,85250.0,97,647,0,3395,6934,110000,0.489616,43217
4,35,114000.0,100,791,0,3801,5504,103000,0.690589,57352
...,...,...,...,...,...,...,...,...,...,...
10602,37,280200.0,95,652,1,727,1827,80000,0.397920,42590
10603,30,57000.0,94,589,0,1703,3941,108000,0.432124,45787
10604,33,153200.0,95,675,1,1726,3707,154000,0.465606,43217
10605,38,158850.0,100,786,1,1364,2500,92000,0.545600,40171


In [42]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mort, outcome, test_size=0.25, random_state=1)

scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='auto', kind='regular')
X_resampled, y_resampled = sm.fit_sample(X_std, y_train)

pd.Series(y_resampled).value_counts()

1    7723
0    7723
dtype: int64

In [43]:
## 특징 선택
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k = 7).fit(X_resampled, y_resampled)
X_resampled_f = selector.transform(X_resampled)
X_test_std_f = selector.transform(X_test_std)

In [25]:
forest = RandomForestClassifier(criterion='gini', n_estimators=500, max_leaf_nodes=16, random_state=1)
forest.fit(X_resampled, y_resampled)

pred = forest.predict(X_test_std)

In [26]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.78      0.87      2602
           1       0.04      0.50      0.08        50

    accuracy                           0.77      2652
   macro avg       0.51      0.64      0.47      2652
weighted avg       0.97      0.77      0.85      2652



In [27]:
confusion_matrix(y_test, pred)

array([[2019,  583],
       [  25,   25]], dtype=int64)

In [28]:
from sklearn.metrics import f1_score
f1_score(y_test, pred)

0.07598784194528875

## 특징 선택 적용

In [29]:
forest = RandomForestClassifier(criterion='gini', n_estimators=500, max_leaf_nodes=16, random_state=1)
forest.fit(X_resampled_f, y_resampled)

pred = forest.predict(X_test_std_f)

In [30]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.99      0.77      0.87      2602
           1       0.04      0.50      0.07        50

    accuracy                           0.77      2652
   macro avg       0.51      0.64      0.47      2652
weighted avg       0.97      0.77      0.85      2652



In [31]:
confusion_matrix(y_test, pred)

array([[2008,  594],
       [  25,   25]], dtype=int64)

## GridSearchCV

In [15]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mort, outcome, test_size=0.25, random_state=1)

scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='auto', kind='regular')
X_resampled, y_resampled = sm.fit_sample(X_std, y_train)

pd.Series(y_resampled).value_counts()

1    7723
0    7723
dtype: int64

In [44]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'n_estimators':[800],
    'max_leaf_nodes':[64],
    'max_depth': [20]
}
forest_g = RandomForestClassifier(criterion='gini', random_state=1, n_jobs=-1)
gs = GridSearchCV(forest_g, param_grid, cv=5).fit(X_resampled_f, y_resampled)

In [45]:
gs.best_params_

{'max_depth': 20, 'max_leaf_nodes': 64, 'n_estimators': 800}

In [46]:
pred_gs = gs.predict(X_test_std_f)

In [47]:
print(classification_report(y_test, pred_gs))

              precision    recall  f1-score   support

           0       0.99      0.83      0.90      2602
           1       0.04      0.42      0.08        50

    accuracy                           0.82      2652
   macro avg       0.52      0.62      0.49      2652
weighted avg       0.97      0.82      0.88      2652



In [48]:
confusion_matrix(y_test, pred_gs)

array([[2152,  450],
       [  29,   21]], dtype=int64)