# Modeling9: 다양한 모델의 베이지안 최적화

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# 데이터 불러오기
data_origin = pd.read_csv("https://raw.githubusercontent.com/agtechresearch/LectureAlgorithm/main/csv/married_full.csv")
data_origin

Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,...,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,...,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,...,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,...,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,...,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,...,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8357,male,25.0,26.0,1.0,10.0,10.0,30.0,20.0,10.0,15.0,...,5.0,5.0,5.0,,,0.64,10.0,2.0,5.0,0
8358,male,25.0,24.0,1.0,50.0,20.0,10.0,5.0,10.0,5.0,...,6.0,8.0,4.0,4.0,,0.71,10.0,4.0,4.0,0
8359,male,25.0,29.0,1.0,40.0,10.0,30.0,10.0,10.0,,...,7.0,8.0,8.0,8.0,,-0.46,10.0,6.0,5.0,0
8360,male,25.0,22.0,1.0,10.0,25.0,25.0,10.0,10.0,20.0,...,6.0,5.0,4.0,,5.0,0.62,10.0,5.0,5.0,0


In [3]:
# 전처리를 위한 원본 데이터 복사
data = data_origin.copy()

In [4]:
# 메모리를 효율적으로 사용하기 위한 downcast 함수 정의
def downcast(df, verbose=True):     # verbose 옵션 추가: (True)인 경우 몇 퍼센트 압축됐는지 출력
    start_mem = df.memory_usage().sum() / 1024**2   # 초기 메모리 사용량
    for col in df.columns:
        dtype_name = df[col].dtype.name
        if dtype_name == 'object':
            pass
        elif dtype_name == 'bool':
            df[col] = df[col].astype('int8')
        elif dtype_name.startswith('int') or (df[col].round() == df[col]).all():
            df[col] = pd.to_numeric(df[col], downcast='integer')
        else:
            df[col] = pd.to_numeric(df[col], downcast='float')
    end_mem = df.memory_usage().sum() / 1024**2

    if verbose:
        print(f'{(100*(start_mem - end_mem) / start_mem):.1f}% 압축됨')

    return df

In [5]:
downcast(data)

49.6% 압축됨


Unnamed: 0,gender,age,age_partner,importance_same_religion,pref_of_partner_attractive,pref_of_partner_sincere,pref_of_partner_intelligence,pref_of_partner_funny,pref_of_partner_ambitious,pref_of_partner_shared_interests,...,my_eval_sincere,my_eval_intelligence,my_eval_funny,my_eval_ambition,my_eval_shared_interests,interests_correlate,expected_happy_with_couple_match,how_much_i_liked,guess_prob_liked,married
0,female,21.0,27.0,4.0,35.0,20.0,20.0,20.0,0.0,5.0,...,9.0,7.0,7.0,6.0,5.0,0.14,3.0,7.0,6.0,0
1,female,21.0,22.0,4.0,60.0,0.0,0.0,40.0,0.0,0.0,...,8.0,7.0,8.0,5.0,6.0,0.54,3.0,7.0,5.0,0
2,female,21.0,22.0,4.0,19.0,18.0,19.0,18.0,14.0,12.0,...,8.0,9.0,8.0,5.0,7.0,0.16,3.0,7.0,5.0,1
3,female,21.0,23.0,4.0,30.0,5.0,15.0,40.0,5.0,5.0,...,6.0,8.0,7.0,6.0,8.0,0.61,3.0,7.0,6.0,1
4,female,21.0,24.0,4.0,30.0,10.0,20.0,10.0,10.0,20.0,...,6.0,7.0,7.0,6.0,6.0,0.21,3.0,6.0,6.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8357,male,25.0,26.0,1.0,10.0,10.0,30.0,20.0,10.0,15.0,...,5.0,5.0,5.0,,,0.64,10.0,2.0,5.0,0
8358,male,25.0,24.0,1.0,50.0,20.0,10.0,5.0,10.0,5.0,...,6.0,8.0,4.0,4.0,,0.71,10.0,4.0,4.0,0
8359,male,25.0,29.0,1.0,40.0,10.0,30.0,10.0,10.0,,...,7.0,8.0,8.0,8.0,,-0.46,10.0,6.0,5.0,0
8360,male,25.0,22.0,1.0,10.0,25.0,25.0,10.0,10.0,20.0,...,6.0,5.0,4.0,,5.0,0.62,10.0,5.0,5.0,0


In [6]:
data_OHE = pd.get_dummies(data, columns=['gender'], drop_first=True)

> Train/Test data split

In [7]:
# 단순 랜덤 샘플링
#from sklearn.model_selection import train_test_split

#train_set, test_set = train_test_split(data, test_size = 0.2, random_state=42)


# 계층적 샘플링
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(data_OHE, data_OHE['married']):
    sss_train_set = data_OHE.loc[train_index]
    sss_test_set = data_OHE.loc[test_index]

In [8]:
X_train = sss_train_set.drop("married", axis=1)
y_train = sss_train_set["married"].copy()

X_test = sss_test_set.drop("married", axis=1)
y_test = sss_test_set["married"].copy()

# Preprocessing

In [9]:
# KNN 결측치 대체
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=3)

X_train_imputed = imputer.fit_transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)

X_test_imputed = imputer.transform(X_test)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns)

In [10]:
# min-max scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)

X_test_scaled = scaler.transform(X_test_imputed)

In [11]:
X_train_scaled

array([[0.2702703 , 0.16216218, 0.8888889 , ..., 0.4       , 0.2       ,
        1.        ],
       [0.1891892 , 0.4054054 , 0.7777778 , ..., 0.6       , 0.4       ,
        1.        ],
       [0.2702703 , 0.13513511, 0.5555556 , ..., 0.6       , 0.5       ,
        1.        ],
       ...,
       [0.24324328, 0.1081081 , 0.22222224, ..., 0.7       , 0.7       ,
        0.        ],
       [0.1891892 , 0.43243247, 0.        , ..., 0.6       , 0.6       ,
        1.        ],
       [0.24324328, 0.13513511, 0.7777778 , ..., 0.6       , 0.4       ,
        1.        ]], dtype=float32)

### Over-Sampling

In [12]:
# SMOTE-Tomek 샘플링
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks

smoteto = SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X_resampled, y_resampled = smoteto.fit_resample(X_train_scaled, y_train)

In [13]:
X_resampled.shape, y_resampled.shape

((10956, 32), (10956,))

> 기본적인 데이터셋 준비 완료

# Model Training

### LighGBM

In [14]:
# 베이지안 최적화
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

param_bounds = {
    'n_estimators': (100, 500),
    'max_depth': (3, 10),
    'num_leaves': (20, 100),
    'min_child_samples': (10, 30),
    'learning_rate': (0.01, 0.3),
    'scale_pos_weight': (1, 2),
}

def eval_function(n_estimators, max_depth, num_leaves, min_child_samples, learning_rate, scale_pos_weight):
    params = {
        'n_estimators': int(n_estimators),
        'max_depth': int(max_depth),
        'num_leaves': int(num_leaves),
        'min_child_samples': int(min_child_samples),
        'learning_rate': learning_rate,
        'scale_pos_weight': scale_pos_weight,
    }
    lgbm = LGBMClassifier(**params, objective='binary', metric='accuracy', random_state=42, verbose=0)

    lgbm.fit(X_train_scaled, y_train)
    y_pred = lgbm.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [15]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=10)

|   iter    |  target   | learni... | max_depth | min_ch... | n_esti... | num_le... | scale_... |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.8506   [0m | [0m0.1186   [0m | [0m9.655    [0m | [0m24.64    [0m | [0m339.5    [0m | [0m32.48    [0m | [0m1.156    [0m |
| [0m2        [0m | [0m0.8344   [0m | [0m0.02684  [0m | [0m9.063    [0m | [0m22.02    [0m | [0m383.2    [0m | [0m21.65    [0m | [0m1.97     [0m |
| [95m3        [0m | [95m0.8553   [0m | [95m0.2514   [0m | [95m4.486    [0m | [95m13.64    [0m | [95m173.4    [0m | [95m44.34    [0m | [95m1.525    [0m |
| [0m4        [0m | [0m0.8428   [0m | [0m0.1353   [0m | [0m5.039    [0m | [0m22.24    [0m | [0m155.8    [0m | [0m43.37    [0m | [0m1.366    [0m |
| [95m5        [0m | [95m0.8583   [0m | [95m0.1423   [0m | [95m8.496    [0m | [95m13.99    [0m | [95m305.7    [0m | [95m67.39    [0m |

In [16]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'num_leaves': round(optimizer.max['params']['num_leaves']),
    'min_child_samples': round(optimizer.max['params']['min_child_samples']),
    'learning_rate': optimizer.max['params']['learning_rate'],
    'scale_pos_weight': optimizer.max['params']['scale_pos_weight'],
}

best_lgbm = LGBMClassifier(**best_params, objective='binary', metric='accuracy', random_state=42, verbose=0)
best_lgbm.fit(X_train_scaled, y_train)





In [17]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Train data 성능 평가
y_train_pred = best_lgbm.predict(X_train_scaled)
print("<<Train Set Performance>>")
print("Accuracy: ", accuracy_score(y_train, y_train_pred))
print("F1 Score: ", f1_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

# Test data 성능 평가
y_test_pred = best_lgbm.predict(X_test_scaled)
print("<<Test Set Performance>>")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("F1 Score: ", f1_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

<<Train Set Performance>>
Accuracy:  1.0
F1 Score:  1.0
[[5479    0]
 [   0 1210]]
<<Test Set Performance>>
Accuracy:  0.8595337716676629
F1 Score:  0.532803180914513
[[1304   66]
 [ 169  134]]


### XGBoost

In [18]:
# 베이지안 최적화
from xgboost import XGBClassifier

param_bounds = {
    'eta' : (0.01, 0.3),
    'max_leaves': (2, 1024),
    'n_estimators': (100, 1000),
    'gamma': (0, 10),
    'max_depth': (3, 15),
    'min_child_weight': (1, 10),
    'scale_pos_weight': (1, 3),
    }

def eval_function(max_leaves, eta, n_estimators, gamma, max_depth, min_child_weight, scale_pos_weight):
    params = {
        'eta' : eta, 
        'max_leaves': int(max_leaves),
        'n_estimators': int(n_estimators),
        'gamma': gamma,
        'max_depth': int(max_depth),
        'min_child_weight': min_child_weight,
        'scale_pos_weight': scale_pos_weight,
    }

    xgb_model = XGBClassifier(**params, objective='binary:logistic', random_state=42)
    xgb_model.fit(X_train_scaled, y_train)
    y_pred = xgb_model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [19]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=5, n_iter=50)

|   iter    |  target   |    eta    |   gamma   | max_depth | max_le... | min_ch... | n_esti... | scale_... |
-------------------------------------------------------------------------------------------------------------


| [0m1        [0m | [0m0.8332   [0m | [0m0.1186   [0m | [0m9.507    [0m | [0m11.78    [0m | [0m613.8    [0m | [0m2.404    [0m | [0m240.4    [0m | [0m1.116    [0m |
| [95m2        [0m | [95m0.8362   [0m | [95m0.2612   [0m | [95m6.011    [0m | [95m11.5     [0m | [95m23.04    [0m | [95m9.729    [0m | [95m849.2    [0m | [95m1.425    [0m |
| [95m3        [0m | [95m0.8386   [0m | [95m0.06273  [0m | [95m1.834    [0m | [95m6.651    [0m | [95m538.3    [0m | [95m4.888    [0m | [95m362.1    [0m | [95m2.224    [0m |
| [95m4        [0m | [95m0.8404   [0m | [95m0.05045  [0m | [95m2.921    [0m | [95m7.396    [0m | [95m468.1    [0m | [95m8.067    [0m | [95m279.7    [0m | [95m2.028    [0m |
| [95m5        [0m | [95m0.8476   [0m | [95m0.1818   [0m | [95m0.4645   [0m | [95m10.29    [0m | [95m176.3    [0m | [95m1.585    [0m | [95m954.0    [0m | [95m2.931    [0m |
| [95m6        [0m | [95m0.8482   [0m | [95m0.288

In [20]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
    'max_leaves': round(optimizer.max['params']['max_leaves']),
    'min_child_weight': round(optimizer.max['params']['min_child_weight']),
    'eta': optimizer.max['params']['eta'],
    'scale_pos_weight': optimizer.max['params']['scale_pos_weight'],
    'gamma': optimizer.max['params']['gamma'],
}

best_xgb = XGBClassifier(**best_params, objective='binary:logistic', random_state=42)
best_xgb.fit(X_train_scaled, y_train)

In [21]:
# Train data 성능 평가
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

y_train_pred = best_xgb.predict(X_train_scaled)
print("<<Train Set Performance>>")
print("Accuracy: ", accuracy_score(y_train, y_train_pred))
print("F1 Score: ", f1_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

# Test data 성능 평가
y_test_pred = best_xgb.predict(X_test_scaled)
print("<<Test Set Performance>>")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("F1 Score: ", f1_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

<<Train Set Performance>>
Accuracy:  0.9846015846912842
F1 Score:  0.9565217391304348
[[5453   26]
 [  77 1133]]
<<Test Set Performance>>
Accuracy:  0.8439928272564255
F1 Score:  0.4990403071017275
[[1282   88]
 [ 173  130]]


### Random Forest

In [22]:
# rf 모델 학습
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, max_depth=10, random_state=42, class_weight='balanced')
rf.fit(X_train_scaled, y_train)

In [23]:
# 베이지안 최적화
from sklearn.ensemble import RandomForestClassifier

param_bounds = {
    'n_estimators': (100, 1000),
    'max_depth': (3, 15),
    }

def eval_function(n_estimators, max_depth):
    params = {
    'n_estimators': int(n_estimators),
    'max_depth': int(max_depth),
    }

    rf = RandomForestClassifier(**params, random_state=42)
    rf.fit(X_train_scaled, y_train)
    y_pred = rf.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [24]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=2, n_iter=10)

|   iter    |  target   | max_depth | n_esti... |
-------------------------------------------------
| [0m1        [0m | [0m0.8386   [0m | [0m7.494    [0m | [0m955.6    [0m |
| [95m2        [0m | [95m0.841    [0m | [95m11.78    [0m | [95m638.8    [0m |
| [95m3        [0m | [95m0.8428   [0m | [95m13.07    [0m | [95m639.1    [0m |
| [0m4        [0m | [0m0.8416   [0m | [0m14.12    [0m | [0m639.7    [0m |
| [0m5        [0m | [0m0.8416   [0m | [0m14.06    [0m | [0m637.3    [0m |
| [0m6        [0m | [0m0.8422   [0m | [0m12.99    [0m | [0m640.0    [0m |
| [0m7        [0m | [0m0.8416   [0m | [0m12.62    [0m | [0m643.2    [0m |
| [0m8        [0m | [0m0.8428   [0m | [0m14.56    [0m | [0m644.7    [0m |
| [0m9        [0m | [0m0.8422   [0m | [0m14.94    [0m | [0m643.9    [0m |
| [0m10       [0m | [0m0.8428   [0m | [0m14.63    [0m | [0m646.3    [0m |
| [0m11       [0m | [0m0.8416   [0m | [0m12.99    [0m | [0m647.0 

In [25]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params_rf = {
    'n_estimators': round(optimizer.max['params']['n_estimators']),
    'max_depth': round(optimizer.max['params']['max_depth']),
}

best_rf = RandomForestClassifier(**best_params_rf, random_state=42)
best_rf.fit(X_train_scaled, y_train)

In [26]:
# Train data 성능 평가
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

y_train_pred = best_rf.predict(X_train_scaled)
print("<<Train Set Performance>>")
print("Accuracy: ", accuracy_score(y_train, y_train_pred))
print("F1 Score: ", f1_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

# Test data 성능 평가
y_test_pred = best_rf.predict(X_test_scaled)
print("<<Test Set Performance>>")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("F1 Score: ", f1_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

<<Train Set Performance>>
Accuracy:  0.9850500822245478
F1 Score:  0.956896551724138
[[5479    0]
 [ 100 1110]]
<<Test Set Performance>>
Accuracy:  0.8427973699940227
F1 Score:  0.3811764705882353
[[1329   41]
 [ 222   81]]


### SVC

In [27]:
# 베이지안 최적화
from sklearn.svm import SVC

param_bounds = {
    'C': (0.1, 10),
    'gamma': (0.1, 10),
    }

def eval_function(C, gamma):

    svc = SVC(kernel='rbf', C=C, gamma=gamma, class_weight='balanced', random_state=42)
    svc.fit(X_train_scaled, y_train)
    y_pred = svc.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [28]:
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(f=eval_function, pbounds=param_bounds, random_state=42)
optimizer.maximize(init_points=2, n_iter=5)

|   iter    |  target   |     C     |   gamma   |
-------------------------------------------------
| [0m1        [0m | [0m0.8231   [0m | [0m3.808    [0m | [0m9.512    [0m |
| [95m2        [0m | [95m0.8452   [0m | [95m7.347    [0m | [95m6.027    [0m |
| [0m3        [0m | [0m0.8452   [0m | [0m7.405    [0m | [0m6.015    [0m |
| [0m4        [0m | [0m0.835    [0m | [0m6.348    [0m | [0m1.866    [0m |
| [0m5        [0m | [0m0.8279   [0m | [0m10.0     [0m | [0m8.841    [0m |
| [95m6        [0m | [95m0.8494   [0m | [95m10.0     [0m | [95m3.585    [0m |
| [0m7        [0m | [0m0.7956   [0m | [0m10.0     [0m | [0m0.7087   [0m |


In [29]:
# 최적화된 하이퍼파라미터로 모델 재학습
best_params = optimizer.max['params']

best_svc = SVC(**best_params, kernel='rbf', class_weight='balanced', random_state=42)
best_svc.fit(X_train_scaled, y_train)

In [30]:
# Train data 성능 평가
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

y_train_pred = best_svc.predict(X_train_scaled)
print("<<Train Set Performance>>")
print("Accuracy: ", accuracy_score(y_train, y_train_pred))
print("F1 Score: ", f1_score(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))

# Test data 성능 평가
y_test_pred = best_svc.predict(X_test_scaled)
print("<<Test Set Performance>>")
print("Accuracy: ", accuracy_score(y_test, y_test_pred))
print("F1 Score: ", f1_score(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

<<Train Set Performance>>
Accuracy:  1.0
F1 Score:  1.0
[[5479    0]
 [   0 1210]]
<<Test Set Performance>>
Accuracy:  0.8493723849372385
F1 Score:  0.4473684210526316
[[1319   51]
 [ 201  102]]
