# Modeling10: 모델 Stacking

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 데이터 불러오기
data_origin = pd.read_csv("https://raw.githubusercontent.com/agtechresearch/LectureAlgorithm/main/csv/married_full.csv")
data_origin

Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,...,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,...,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,...,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,...,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,...,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,...,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8357,male,25.0,26.0,1.0,10.0,10.0,30.0,20.0,10.0,15.0,...,5.0,5.0,5.0,,,0.64,10.0,2.0,5.0,0
8358,male,25.0,24.0,1.0,50.0,20.0,10.0,5.0,10.0,5.0,...,6.0,8.0,4.0,4.0,,0.71,10.0,4.0,4.0,0
8359,male,25.0,29.0,1.0,40.0,10.0,30.0,10.0,10.0,,...,7.0,8.0,8.0,8.0,,-0.46,10.0,6.0,5.0,0
8360,male,25.0,22.0,1.0,10.0,25.0,25.0,10.0,10.0,20.0,...,6.0,5.0,4.0,,5.0,0.62,10.0,5.0,5.0,0


In [3]:
# 전처리를 위한 원본 데이터 복사
data = data_origin.copy()

In [4]:
# 메모리를 효율적으로 사용하기 위한 downcast 함수 정의
def downcast(df, verbose=True):     # verbose 옵션 추가: (True)인 경우 몇 퍼센트 압축됐는지 출력
    start_mem = df.memory_usage().sum() / 1024**2   # 초기 메모리 사용량
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f'{(100*(start_mem - end_mem) / start_mem):.1f}% 압축됨')

    return df

In [5]:
downcast(data)

49.6% 압축됨


Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,...,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,...,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,...,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,...,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,...,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,...,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8357,male,25.0,26.0,1.0,10.0,10.0,30.0,20.0,10.0,15.0,...,5.0,5.0,5.0,,,0.64,10.0,2.0,5.0,0
8358,male,25.0,24.0,1.0,50.0,20.0,10.0,5.0,10.0,5.0,...,6.0,8.0,4.0,4.0,,0.71,10.0,4.0,4.0,0
8359,male,25.0,29.0,1.0,40.0,10.0,30.0,10.0,10.0,,...,7.0,8.0,8.0,8.0,,-0.46,10.0,6.0,5.0,0
8360,male,25.0,22.0,1.0,10.0,25.0,25.0,10.0,10.0,20.0,...,6.0,5.0,4.0,,5.0,0.62,10.0,5.0,5.0,0


In [6]:
data_OHE = pd.get_dummies(data, columns=['gender'], drop_first=True)

> Train/Test data split

In [7]:
# 단순 랜덤 샘플링
#from sklearn.model_selection import train_test_split

#train_set, test_set = train_test_split(data, test_size = 0.2, random_state=42)


# 계층적 샘플링
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_OHE, data_OHE['married']):
    sss_train_set = data_OHE.loc[train_index]
    sss_test_set = data_OHE.loc[test_index]

In [8]:
X_train = sss_train_set.drop("married", axis=1)
y_train = sss_train_set["married"].copy()

X_test = sss_test_set.drop("married", axis=1)
y_test = sss_test_set["married"].copy()

# Preprocessing

> 전처리 process 정리
> 1. 결측치 채우기(임의로 중앙값)
> 2. 수치형 변수 표준화(Standardization)
> 3. 범주형 변수 OneHotEncoding
> 4. SMOTE-Tomek 오버샘플링

In [10]:
# KNN 결측치 대체
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)

X_test_imputed = imputer.transform(X_test)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns)

### Scaling

In [16]:
# scaling
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)

X_test_scaled = scaler.transform(X_test_imputed)

In [17]:
X_train_scaled

array([[ 0.4665692 , -0.6502732 ,  1.9189428 , ..., -1.1789867 ,
        -1.538104  ,  0.9900332 ],
       [-0.37945798,  1.8789358 ,  1.5611063 , ..., -0.07885458,
        -0.58333707,  0.9900332 ],
       [ 0.4665692 , -0.93129647,  0.8454334 , ..., -0.07885458,
        -0.10595358,  0.9900332 ],
       ...,
       [ 0.18456015, -1.2123196 , -0.22807595, ...,  0.47121146,
         0.84881335, -1.0100671 ],
       [-0.37945798,  2.159959  , -0.9437489 , ..., -0.07885458,
         0.3714299 ,  0.9900332 ],
       [ 0.18456015, -0.93129647,  1.5611063 , ..., -0.07885458,
        -0.58333707,  0.9900332 ]], dtype=float32)

### Over-Sampling

In [156]:
# SMOTE-Tomek 샘플링
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

smoteto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X_resampled, y_resampled = smoteto.fit_resample(X_train_scaled, y_train)

In [157]:
X_resampled.shape, y_resampled.shape

((10953, 32), (10953,))

> 기본적인 데이터셋 준비 완료

# Model Training

### Model Stacking

In [65]:
# 다양한 분류 모델 import
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [66]:
from vecstack import stacking

models = [ 
    LogisticRegression(random_state=42),
    SVC(kernel='rbf', C=1.0, gamma=0.1, class_weight='balanced', random_state=42),
    RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42, class_weight='balanced'),
    XGBClassifier(seed = 42, n_jobs = -1, learning_rate = 0.1, n_estimators = 100, max_depth = 6),
    LGBMClassifier(objective='binary', metric='accuracy', random_state=42, verbose=0, n_estimators=300, max_depth=6),
    ]

In [67]:
S_train, S_test = stacking(models, 
                           X_train_scaled, y_train, X_test_scaled, 
                           regression = False,  # 분류문제라는 의미
                           metric = accuracy_score, 
                           n_folds = 5, stratified = True, shuffle = True,
                           random_state = 42, verbose = 2)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [LogisticRegression]
    fold  0:  [0.86248132]
    fold  1:  [0.85799701]
    fold  2:  [0.85127055]
    fold  3:  [0.84977578]
    fold  4:  [0.84218399]
    ----
    MEAN:     [0.85274173] + [0.00699864]
    FULL:     [0.85274331]

model  1:     [SVC]


  S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel()


    fold  0:  [0.83482810]
    fold  1:  [0.83781764]
    fold  2:  [0.83408072]
    fold  3:  [0.83408072]
    fold  4:  [0.80852655]
    ----
    MEAN:     [0.82986675] + [0.01075872]
    FULL:     [0.82986994]

model  2:     [RandomForestClassifier]


  S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel()


    fold  0:  [0.84678625]
    fold  1:  [0.85351271]
    fold  2:  [0.83333333]
    fold  3:  [0.84230194]
    fold  4:  [0.81675393]
    ----
    MEAN:     [0.83853763] + [0.01271281]
    FULL:     [0.83854089]

model  3:     [XGBClassifier]


  S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel()


    fold  0:  [0.86547085]
    fold  1:  [0.86920777]
    fold  2:  [0.85799701]
    fold  3:  [0.86322870]
    fold  4:  [0.84667165]


  S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel()


    ----
    MEAN:     [0.86051520] + [0.00781623]
    FULL:     [0.86051727]

model  4:     [LGBMClassifier]
    fold  0:  [0.86322870]
    fold  1:  [0.86397608]
    fold  2:  [0.85650224]
    fold  3:  [0.86696562]
    fold  4:  [0.85938669]
    ----
    MEAN:     [0.86201187] + [0.00366404]
    FULL:     [0.86201226]



  S_test[:, model_counter] = st.mode(S_test_temp, axis = 1)[0].ravel()


In [None]:
# stacked_model = LGBMClassifier(**params, objective='binary', metric='accuracy', random_state=42, verbose=0)
# stacked_model = stacked_model.fit(S_train, y_resampled)
# y_pred = stacked_model.predict(S_test)
# print('Final prediction score: [%.8f]' % accuracy_score(y_test, y_pred))

In [69]:
# 베이지안 최적화
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

param_bounds = {
    'n_estimators': (100, 500),
    'max_depth': (3, 10),
    'num_leaves': (20, 100),
    'min_child_samples': (10, 30),
    'learning_rate': (0.01, 0.3),
    'scale_pos_weight': (1, 2),
}

def eval_function(n_estimators, max_depth, num_leaves, min_child_samples, learning_rate, scale_pos_weight):
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'min_child_samples': int(min_child_samples),
        'learning_rate': learning_rate,
        'scale_pos_weight': scale_pos_weight,
    }
    lgbm = LGBMClassifier(**params, objective='binary', metric='accuracy', random_state=42, verbose=0)

    lgbm.fit(S_train, y_train)
    y_pred = lgbm.predict(S_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [71]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... | scale_... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.8458   [0m | [0m0.1186   [0m | [0m9.655    [0m | [0m24.64    [0m | [0m339.5    [0m | [0m32.48    [0m | [0m1.156    [0m |
| [0m2        [0m | [0m0.8368   [0m | [0m0.02684  [0m | [0m9.063    [0m | [0m22.02    [0m | [0m383.2    [0m | [0m21.65    [0m | [0m1.97     [0m |
| [95m3        [0m | [95m0.8482   [0m | [95m0.2514   [0m | [95m4.486    [0m | [95m13.64    [0m | [95m173.4    [0m | [95m44.34    [0m | [95m1.525    [0m |
| [0m4        [0m | [0m0.8458   [0m | [0m0.1353   [0m | [0m5.039    [0m | [0m22.24    [0m | [0m155.8    [0m | [0m43.37    [0m | [0m1.366    [0m |
| [0m5        [0m | [0m0.8458   [0m | [0m0.1423   [0m | [0m8.496    [0m | [0m13.99    [0m | [0m305.7    [0m | [0m67.39    [0m | [0m1.

In [72]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'num_leaves': round(optimizer.max['params']['num_leaves']),
    'min_child_samples': round(optimizer.max['params']['min_child_samples']),
    'learning_rate': optimizer.max['params']['learning_rate'],
    'scale_pos_weight': optimizer.max['params']['scale_pos_weight'],
}

best_lgbm = LGBMClassifier(**best_params, objective='binary', metric='accuracy', random_state=42, verbose=0)
best_lgbm.fit(S_train, y_train)



In [73]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Train data 성능 평가
y_train_pred = best_lgbm.predict(S_train)
print("<<Train Set Performance>>")
print("Accuracy: ", accuracy_score(y_train, y_train_pred))
print("F1 Score: ", f1_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

# Test data 성능 평가
y_test_pred = best_lgbm.predict(S_test)
print("<<Test Set Performance>>")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("F1 Score: ", f1_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

<<Train Set Performance>>
Accuracy:  0.8642547465988937
F1 Score:  0.5575048732943471
[[5209  270]
 [ 638  572]]
<<Test Set Performance>>
Accuracy:  0.8475791990436342
F1 Score:  0.4910179640718563
[[1295   75]
 [ 180  123]]
