In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
mort = pd.read_csv('./newdata.csv', header=0) # data about mortgage
mort.head()

Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,pur_prc_amt,DTI Ratio,OUTCOME,Median_state_inc
0,37,75650.0,85,669,N,1707,6000,160000,0.2845,non-default,40171
1,46,390775.0,102,684,N,0,5025,309000,0.0,non-default,44228
2,30,112500.0,90,662,Y,1812,4800,176450,0.3775,non-default,49894
3,24,85250.0,97,647,N,3395,6934,110000,0.489616,non-default,43217
4,35,114000.0,100,791,N,3801,5504,103000,0.690589,non-default,57352


### Preprocessing

In [3]:
## First_home 문자형 데이터 변환
mort['First_home'] = np.where(mort['First_home']=='Y',1,0)
mort['First_home']

## OUTCOME 데이터 변환
mort['OUTCOME'] = np.where(mort['OUTCOME']=='non-default',0,1) # non-default = 0, default = 1

## string 타입을 정수로 변환하기
for k in range(0,10607):
    mort.Median_state_inc[k] = np.float64(mort.Median_state_inc[k].replace(",", ""))

outcome = mort.OUTCOME
mort.drop(['OUTCOME'], axis='columns', inplace=True)

mort

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,pur_prc_amt,DTI Ratio,Median_state_inc
0,37,75650.0,85,669,0,1707,6000,160000,0.284500,40171
1,46,390775.0,102,684,0,0,5025,309000,0.000000,44228
2,30,112500.0,90,662,1,1812,4800,176450,0.377500,49894
3,24,85250.0,97,647,0,3395,6934,110000,0.489616,43217
4,35,114000.0,100,791,0,3801,5504,103000,0.690589,57352
...,...,...,...,...,...,...,...,...,...,...
10602,37,280200.0,95,652,1,727,1827,80000,0.397920,42590
10603,30,57000.0,94,589,0,1703,3941,108000,0.432124,45787
10604,33,153200.0,95,675,1,1726,3707,154000,0.465606,43217
10605,38,158850.0,100,786,1,1364,2500,92000,0.545600,40171


In [4]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mort, outcome, test_size=0.2, random_state=0)

In [5]:
## Standardization
stdsc = StandardScaler()
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.fit_transform(X_test)

In [6]:
## GridSearchCV & Cross Validation
from sklearn.model_selection import KFold, GridSearchCV

In [7]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='auto', kind='regular')
X_resampled, y_resampled = sm.fit_sample(X_train_std, y_train)

## RandomForest

### - max_depth : integer or None, optional (default=None)
### The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.
### - random_state : int, RandomState instance or None, optional (default=None)
### If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; If None, the random number generator is the RandomState instance used by np.random.

#### https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html

In [11]:
from sklearn.ensemble import RandomForestRegressor

# X_train_std, X_test_std = make_regression(n_features=4, n_informative=2, random_state=0, shuffle=False)
rf = RandomForestRegressor(random_state=1, n_estimators=100)

In [12]:
# train
rf.fit(X_resampled, y_resampled)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=6,
                      oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [31]:
pred = rf.predict(X_test)
print(pred)

[0.07 0.07 0.07 ... 0.07 0.19 0.12]


In [29]:
from sklearn.metrics import accuracy_score
import sklearn.metrics as metrics

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics    
 
    # 학습 진행

forest = RandomForestClassifier(criterion='gini', n_estimators=500, max_leaf_nodes=16, random_state=1)
forest.fit(X_resampled, y_resampled)
 
    # 예측
pred = forest.predict(X_test)
print(pred)

[0 0 0 ... 0 0 0]


In [30]:
print(metrics.accuracy_score(y_test, pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [28]:
from sklearn.metrics import classification_report, accuracy_score
errors = abs(pred - y_test)
errors

4579    0.07
2359    0.07
4558    0.07
7125    0.07
9193    0.07
        ... 
3601    0.19
667     0.19
6496    0.07
760     0.19
2029    0.12
Name: OUTCOME, Length: 2122, dtype: float64

In [25]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 0.12 degrees.


In [26]:
acc_sco = accuracy_score(y_test, pred)
acc_sco

ValueError: Classification metrics can't handle a mix of binary and continuous targets

In [15]:
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_test, pred))

ValueError: Classification metrics can't handle a mix of binary and continuous targets