In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
mort = pd.read_csv('../data/data_active.csv', header=0) # data about mortgage
mort.head()

Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,orig_apprd_val_amt,pur_prc_amt,DTI Ratio,Status,OUTCOME,State,Median_state_inc,UPB>Appraisal
0,18,154755.0,95,801,N,1950,57390,265000,265000,0.033978,Active,non-default,CA,49894,0
1,18,79500.0,89,721,N,1945,7000,283000,283000,0.277857,Pay-off,non-default,CA,49894,0
2,18,104800.0,80,684,N,0,3000,153000,153000,0.0,Active,non-default,LA,35523,0
3,18,238400.0,80,621,N,14277,51250,525000,525000,0.278576,Active,non-default,OH,44160,0
4,18,67900.0,97,660,N,0,6000,179000,178257,0.0,Active,non-default,NV,46984,0


In [3]:
mort['OUTCOME'].value_counts()

non-default    9948
default         659
Name: OUTCOME, dtype: int64

In [4]:
# mort = pd.read_csv('../data_active.csv', header=0) # data about mortgage
# mort.head()

## First_home 문자형 데이터 변환
mort['First_home'] = np.where(mort['First_home']=='Y',1,0)
mort['First_home']

## OUTCOME 데이터 변환
mort['OUTCOME'] = np.where(mort['OUTCOME']=='non-default',0,1) # non-default = 0, default = 1

## string 타입을 정수로 변환하기
for k in range(0,10607):
    mort.Median_state_inc[k] = np.float64(mort.Median_state_inc[k].replace(",", ""))

## Separating 'target' feature
outcome = mort.OUTCOME
mort.drop(['OUTCOME'], axis='columns', inplace=True)

mort = mort.drop('Status', axis=1)
mort = mort.drop('State', axis=1)
mort = mort.drop('orig_apprd_val_amt', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [5]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mort, outcome, test_size=0.25, random_state=1)

scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [6]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='auto', kind='regular')
X_resampled, y_resampled = sm.fit_sample(X_std, y_train)

In [7]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1).fit(X_resampled, y_resampled)
pred = nn.predict(X_test_std)

In [8]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.98      0.86      0.92      2478
           1       0.26      0.69      0.38       174

    accuracy                           0.85      2652
   macro avg       0.62      0.78      0.65      2652
weighted avg       0.93      0.85      0.88      2652



In [9]:
confusion_matrix(y_test, pred)

array([[2138,  340],
       [  54,  120]], dtype=int64)

## No Oversampling

In [10]:
mort_ = pd.read_csv('../data/data_active.csv', header=0) # data about mortgage
mort_.head()

Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,orig_apprd_val_amt,pur_prc_amt,DTI Ratio,Status,OUTCOME,State,Median_state_inc,UPB>Appraisal
0,18,154755.0,95,801,N,1950,57390,265000,265000,0.033978,Active,non-default,CA,49894,0
1,18,79500.0,89,721,N,1945,7000,283000,283000,0.277857,Pay-off,non-default,CA,49894,0
2,18,104800.0,80,684,N,0,3000,153000,153000,0.0,Active,non-default,LA,35523,0
3,18,238400.0,80,621,N,14277,51250,525000,525000,0.278576,Active,non-default,OH,44160,0
4,18,67900.0,97,660,N,0,6000,179000,178257,0.0,Active,non-default,NV,46984,0


In [11]:
# mort = pd.read_csv('../data_active.csv', header=0) # data about mortgage
# mort.head()

## First_home 문자형 데이터 변환
mort_['First_home'] = np.where(mort_['First_home']=='Y',1,0)

## OUTCOME 데이터 변환
mort_['OUTCOME'] = np.where(mort_['OUTCOME']=='non-default',0,1) # non-default = 0, default = 1

## string 타입을 정수로 변환하기
for k in range(0,10607):
    mort_.Median_state_inc[k] = np.float64(mort_.Median_state_inc[k].replace(",", ""))

## Separating 'target' feature
outcome_ = mort_.OUTCOME
mort_.drop(['OUTCOME'], axis='columns', inplace=True)

mort_ = mort_.drop('Status', axis=1)
mort_ = mort_.drop('State', axis=1)
mort_ = mort_.drop('orig_apprd_val_amt', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [12]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train_, X_test_, y_train_, y_test_ = train_test_split(mort_, outcome_, test_size=0.25, random_state=1)

scaler = StandardScaler()
X_std_ = scaler.fit_transform(X_train_)
X_test_std_ = scaler.transform(X_test_)

In [13]:
nn = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1).fit(X_std_, y_train_)
pred_ = nn.predict(X_test_std_)

In [14]:
print(classification_report(y_test_, pred_))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2478
           1       0.63      0.40      0.49       174

    accuracy                           0.94      2652
   macro avg       0.79      0.69      0.73      2652
weighted avg       0.94      0.94      0.94      2652



In [15]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_, pred_)

array([[2437,   41],
       [ 105,   69]], dtype=int64)