In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics

In [2]:
mort = pd.read_csv('../data/data_active.csv', header=0) # data about mortgage
mort.head()

Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,orig_apprd_val_amt,pur_prc_amt,DTI Ratio,Status,OUTCOME,State,Median_state_inc,UPB>Appraisal
0,37,75650.0,85,669,N,1707,6000,169000,160000,0.2845,Active,non-default,FL,40171,0
1,46,390775.0,102,684,N,0,5025,310000,309000,0.0,Pay-off,non-default,NY,44228,1
2,30,112500.0,90,662,Y,1812,4800,177000,176450,0.3775,Pay-off,non-default,CA,49894,0
3,24,85250.0,97,647,N,3395,6934,113000,110000,0.489616,Pay-off,non-default,GA,43217,0
4,35,114000.0,100,791,N,3801,5504,103000,103000,0.690589,Pay-off,non-default,NH,57352,1


## 전처리 과정

In [3]:
## First_home 문자형 데이터 변환
mort['First_home'] = np.where(mort['First_home']=='Y',1,0)

## OUTCOME 데이터 변환
mort['OUTCOME'] = np.where(mort['OUTCOME']=='non-default',0,1) # non-default = 0, default = 1

## string 타입을 정수로 변환하기
for k in range(0,10607):
    mort.Median_state_inc[k] = np.float64(mort.Median_state_inc[k].replace(",", ""))
    
## Separating 'target' feature
outcome = mort.OUTCOME
mort.drop(['OUTCOME'], axis='columns', inplace=True)

## Delete feature
mort = mort.drop('Status', axis=1)
mort = mort.drop('State', axis=1)
mort = mort.drop('orig_apprd_val_amt', axis=1)
mort = mort.drop('UPB>Appraisal', axis=1)

mort

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,Bo_Age,Ln_Orig,Orig_LTV_Ratio_Pct,Credit_score,First_home,Tot_mthly_debt_exp,Tot_mthly_incm,pur_prc_amt,DTI Ratio,Median_state_inc
0,37,75650.0,85,669,0,1707,6000,160000,0.284500,40171
1,46,390775.0,102,684,0,0,5025,309000,0.000000,44228
2,30,112500.0,90,662,1,1812,4800,176450,0.377500,49894
3,24,85250.0,97,647,0,3395,6934,110000,0.489616,43217
4,35,114000.0,100,791,0,3801,5504,103000,0.690589,57352
...,...,...,...,...,...,...,...,...,...,...
10602,37,280200.0,95,652,1,727,1827,80000,0.397920,42590
10603,30,57000.0,94,589,0,1703,3941,108000,0.432124,45787
10604,33,153200.0,95,675,1,1726,3707,154000,0.465606,43217
10605,38,158850.0,100,786,1,1364,2500,92000,0.545600,40171


In [4]:
# X, y = mort, outcome
## training data와 test data 나누기
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(mort, outcome, test_size=0.25, random_state=1)

scaler = StandardScaler()
X_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)

In [5]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio='auto', kind='regular')
X_resampled, y_resampled = sm.fit_sample(X_std, y_train)

In [6]:
pd.Series(y_resampled).value_counts()

1    7452
0    7452
dtype: int64

In [7]:
## 특징 선택
from sklearn.feature_selection import SelectKBest, f_classif
selector = SelectKBest(f_classif, k = 10).fit(X_resampled, y_resampled)
X_resampled_f = selector.transform(X_resampled)
X_test_std_f = selector.transform(X_test_std)

In [8]:
model_svm = svm.SVC(C=1000, kernel='rbf').fit(X_resampled, y_resampled)
pred = model_svm.predict(X_test_std)



In [9]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      2496
           1       0.26      0.40      0.31       156

    accuracy                           0.90      2652
   macro avg       0.61      0.67      0.63      2652
weighted avg       0.92      0.90      0.91      2652



In [10]:
confusion_matrix(y_test, pred)

array([[2312,  184],
       [  93,   63]], dtype=int64)

In [11]:
from sklearn.metrics import f1_score
f1_score(y_test, pred)

0.31265508684863524

## 특징 선택 적용

In [12]:
model_svm = svm.SVC(C=1000, kernel='rbf').fit(X_resampled, y_resampled)
pred = model_svm.predict(X_test_std)



In [13]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.96      0.93      0.94      2496
           1       0.26      0.40      0.31       156

    accuracy                           0.90      2652
   macro avg       0.61      0.67      0.63      2652
weighted avg       0.92      0.90      0.91      2652



In [14]:
confusion_matrix(y_test, pred)

array([[2312,  184],
       [  93,   63]], dtype=int64)

## Grid Search CV

In [15]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {
#     'C':[1, 10, 100, 1000],
#     'gamma':[0.1, 1, 10]
# }
# results = GridSearchCV(model_svm, param_grid, cv=5).fit(X_resampled, y_resampled)

KeyboardInterrupt: 

In [None]:
# results.best_params_