## 스태킹

### 실습: 위스콘신 유방암 데이터 활용

In [4]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

In [5]:
import pandas as pd
import numpy as np

In [7]:
# Load the Wisconsin breast cancer dataset bundled with scikit-learn.
dataset = load_breast_cancer()

# Put the feature matrix into a DataFrame (one column per feature name) and
# append the class labels as a trailing 'target' column for inspection.
cancer_df = pd.DataFrame(
    dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [34]:
# Split the data: 20% is held out as the test set, and the fixed seed keeps
# the split identical across runs.
x = dataset.data
y = dataset.target

X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    random_state=0)

# Create the individual (base) models of the stacking ensemble.
# Every estimator that has a random component is seeded, so re-running the
# notebook yields the same base predictions and the same final accuracy.
# (Previously only the random forest was seeded; the decision tree and
# AdaBoost were not, which made the reported score non-reproducible.)
kn_clf = KNeighborsClassifier(n_neighbors=4)
dt_clf = DecisionTreeClassifier(random_state=0)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0)
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=0)

In [35]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

In [60]:
# 개별 모델-> 메타 모델 사용할 학습/테스트용 데이터 생성하기 위한 함수

def get_stacking_base_datasets(model, X_train, X_test, y_train, n_folds):
    """Produce the meta-model inputs contributed by one base model.

    The training rows are split into ``n_folds`` consecutive folds. For each
    fold, ``model`` is fitted on the remaining folds and then used to predict
    (a) the held-out fold and (b) the full ``X_test``.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is re-fitted in place once per fold, so after the call it holds
        the fit from the last fold.
    X_train, y_train : training features and labels; both are indexed with
        integer index arrays, and ``X_train`` must expose ``.shape``.
    X_test : test features; must expose ``.shape``.
    n_folds : number of folds passed to ``KFold``.

    Returns
    -------
    (train_fold_pred, test_pred_mean)
        ``train_fold_pred`` has shape ``(X_train.shape[0], 1)`` and holds the
        held-out prediction for every training row. ``test_pred_mean`` has
        shape ``(X_test.shape[0], 1)`` and holds the per-row mean of the
        ``n_folds`` predictions made on ``X_test``.
    """
    # shuffle=False keeps the fold boundaries identical for every base model,
    # so the rows returned for different models refer to the same samples.
    splitter = KFold(n_splits=n_folds, shuffle=False)

    # Buffers that will become the meta-model's training / test features.
    n_train_rows = X_train.shape[0]
    n_test_rows = X_test.shape[0]
    oof_pred = np.zeros((n_train_rows, 1))
    per_fold_test_pred = np.zeros((n_test_rows, n_folds))
    print(model.__class__.__name__, ' model 시작')

    fold_no = 0
    for fit_idx, holdout_idx in splitter.split(X_train):
        print(f'*** 폴드세트: {fold_no} ****')
        fit_features = X_train[fit_idx]
        fit_labels = y_train[fit_idx]
        holdout_features = X_train[holdout_idx]

        # Train the base model on everything except the held-out fold.
        model.fit(fit_features, fit_labels)

        # Held-out predictions fill the matching rows of the training buffer.
        holdout_pred = model.predict(holdout_features)
        oof_pred[holdout_idx, :] = holdout_pred.reshape(-1, 1)

        # Each fold's model also predicts the untouched test set (one column per fold).
        per_fold_test_pred[:, fold_no] = model.predict(X_test)
        fold_no += 1

    # Collapse the per-fold test predictions into a single column by averaging.
    averaged_test_pred = per_fold_test_pred.mean(axis=1).reshape(-1, 1)

    return oof_pred, averaged_test_pred

In [61]:
# Run every base model through the same 7-fold procedure (KNN, random forest,
# decision tree, AdaBoost - in that order) and unpack each model's
# (train-side, test-side) prediction arrays.
(kn_train, kn_test), (rf_train, rf_test), (dt_train, dt_test), (ada_train, ada_test) = (
    get_stacking_base_datasets(base_clf, X_train, X_test, y_train, 7)
    for base_clf in (kn_clf, rf_clf, dt_clf, ada_clf)
)

KNeighborsClassifier  model 시작
*** 폴드세트: 0 ****
*** 폴드세트: 1 ****
*** 폴드세트: 2 ****
*** 폴드세트: 3 ****
*** 폴드세트: 4 ****
*** 폴드세트: 5 ****
*** 폴드세트: 6 ****
RandomForestClassifier  model 시작
*** 폴드세트: 0 ****
*** 폴드세트: 1 ****
*** 폴드세트: 2 ****
*** 폴드세트: 3 ****
*** 폴드세트: 4 ****
*** 폴드세트: 5 ****
*** 폴드세트: 6 ****
DecisionTreeClassifier  model 시작
*** 폴드세트: 0 ****
*** 폴드세트: 1 ****
*** 폴드세트: 2 ****
*** 폴드세트: 3 ****
*** 폴드세트: 4 ****
*** 폴드세트: 5 ****
*** 폴드세트: 6 ****
AdaBoostClassifier  model 시작
*** 폴드세트: 0 ****
*** 폴드세트: 1 ****
*** 폴드세트: 2 ****
*** 폴드세트: 3 ****
*** 폴드세트: 4 ****
*** 폴드세트: 5 ****
*** 폴드세트: 6 ****


In [62]:
# Build the final training set for the meta-model: each base model contributes
# one column of predictions, concatenated side by side.
stack_final_X_train = np.concatenate([kn_train, rf_train, dt_train, ada_train], axis=1)
# Shape check: original features, one base model's column, stacked features.
# (Uses kn_train - the name `knn_train` is never defined and raised NameError.)
print(X_train.shape, kn_train.shape, stack_final_X_train.shape)

# Same layout for the test set, built from each base model's averaged test predictions.
stack_final_X_test = np.concatenate([kn_test, rf_test, dt_test, ada_test], axis=1)
print(X_test.shape, kn_test.shape, stack_final_X_test.shape)

(455, 30) (455, 1) (455, 4)
(114, 30) (114, 1) (114, 4)


In [63]:
# Meta-model: a logistic regression trained on the stacked base-model
# predictions (fit() returns the estimator itself, so it can be chained).
lr_clf = LogisticRegression().fit(stack_final_X_train, y_train)

# Predict the test labels from the stacked test features.
stack_final = lr_clf.predict(stack_final_X_test)

# Evaluate: share of test samples whose predicted label matches the true one.
accuracy = accuracy_score(y_test, stack_final)
print(f'정확도: {accuracy:.4f}')

정확도: 0.9825
