# 검증 코드

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Cross-validation setup: shuffled, stratified folds with a fixed seed so the
# fold assignment is reproducible from run to run.
splits = 5
kf = StratifiedKFold(
    n_splits=splits,
    shuffle=True,
    random_state=42,
)

# Load the preprocessed data sets.
train = pd.read_csv('./data/train_prep.csv')
# NOTE(review): `test` is not referenced anywhere in the visible validation
# code; it is kept in case a later cell relies on it.
test = pd.read_csv('./data/test_prep.csv')

# Separate the label column from the feature columns.
y_train = train['target']
X_train = train.drop(columns='target')

# Distance-based learner: features are standardised before they reach the
# 1-nearest-neighbour classifier.
# NOTE(review): with a single neighbour, predict_proba can only return 0 or 1
# for this model, so its meta-feature column is effectively a hard label.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=1)),
])

# Level-0 (base) models of the stack. Each one contributes one meta-feature
# column: its predicted probability for the positive class.
# NOTE(review): the prediction code further down builds `base_models` without
# the LogisticRegression entry, so the score printed by this validation code
# describes a four-model stack rather than the three-model stack used for the
# final predictions.
base_models = [
    RandomForestClassifier(
        n_estimators=150,
        max_depth=12,
        min_samples_split=6,
        min_samples_leaf=2,
        class_weight='balanced_subsample',
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=1,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        scale_pos_weight=10,
        eval_metric='logloss',
        random_state=42,
    ),
    pipeline,
    LogisticRegression(max_iter=1000, class_weight={1: 9}),
]

# Meta model (level 1): trained on the base models' out-of-fold probabilities.
meta_model = LogisticRegression(max_iter=1000, class_weight={1: 9})

# Hold out 10% of the rows, stratified on the label, as a local test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Renumber every piece from 0 so that row labels coincide with row positions,
# matching the integer indices that kf.split yields.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Containers for the meta-features (positive-class probabilities).
#   X_train_meta: one row per training sample, one column per base model,
#                 filled fold by fold with out-of-fold predictions.
#   X_test_meta:  one slab per fold holding that fold's predictions on X_test;
#                 the slabs are averaged once the loop has finished.
X_train_meta = np.zeros((len(X_train), len(base_models)))
X_test_meta = np.zeros((splits, len(X_test), len(base_models)))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, so rows are selected with .iloc.
    # Label-based lookup (.loc / Series[...]) is only equivalent while the
    # index happens to be a fresh 0..n-1 range.
    train_x, valid_x = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    train_y = y_train.iloc[train_idx]

    for i, model in enumerate(base_models):
        # Refit the base model on this fold's training part.
        model.fit(train_x, train_y)

        # Out-of-fold predictions become the meta-features of the validation
        # rows; predictions on X_test are stored per fold.
        X_train_meta[valid_idx, i] = model.predict_proba(valid_x)[:, 1]
        X_test_meta[fold, :, i] = model.predict_proba(X_test)[:, 1]

# Average the per-fold test predictions into one meta-feature matrix.
X_test_meta = X_test_meta.mean(axis=0)

# Train the meta model on the out-of-fold meta-features, then predict labels
# for the held-out rows from the fold-averaged meta-features.
final_predictions = meta_model.fit(X_train_meta, y_train).predict(X_test_meta)


from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import accuracy_score

# Score the stacked predictions against the held-out labels.
accuracy = accuracy_score(y_true=y_test, y_pred=final_predictions)
f1 = f1_score(y_true=y_test, y_pred=final_predictions)
print(f"Accuracy: {accuracy:.4f}, F1-score: {f1:.4f}")

Accuracy: 0.8921, F1-score: 0.2453


# 예측 코드

In [None]:
# Cross-validation setup: shuffled, stratified folds with a fixed seed so the
# fold assignment is reproducible from run to run.
splits = 5
kf = StratifiedKFold(
    n_splits=splits,
    shuffle=True,
    random_state=42,
)

# Load the preprocessed data; the test file is used as the feature matrix to
# predict on.
train = pd.read_csv('./data/train_prep.csv')
X_test = pd.read_csv('./data/test_prep.csv')

# Separate the label column from the feature columns.
y_train = train['target']
X_train = train.drop(columns='target')

# Distance-based learner: features are standardised before they reach the
# 1-nearest-neighbour classifier.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier(n_neighbors=1)),
])

# Level-0 (base) models of the stack. Each one contributes one meta-feature
# column: its predicted probability for the positive class.
# NOTE(review): the validation code above also stacks a LogisticRegression
# base model. It is absent here, so the validation score was measured on a
# different ensemble than the one producing the final predictions.
base_models = [
    RandomForestClassifier(
        n_estimators=150,
        max_depth=12,
        min_samples_split=6,
        min_samples_leaf=2,
        class_weight='balanced_subsample',
        random_state=42,
    ),
    XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=1,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        scale_pos_weight=10,
        eval_metric='logloss',
        random_state=42,
    ),
    pipeline,
]

# Meta model (level 1): trained on the base models' out-of-fold probabilities.
meta_model = LogisticRegression(max_iter=1000, class_weight={1: 9})


# Renumber rows from 0 so that row labels coincide with row positions,
# matching the integer indices that kf.split yields.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

# Containers for the meta-features (positive-class probabilities).
#   X_train_meta: one row per training sample, one column per base model,
#                 filled fold by fold with out-of-fold predictions.
#   X_test_meta:  one slab per fold holding that fold's predictions on X_test;
#                 the slabs are averaged once the loop has finished.
X_train_meta = np.zeros((len(X_train), len(base_models)))
X_test_meta = np.zeros((splits, len(X_test), len(base_models)))

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
    # kf.split yields positional indices, so rows are selected with .iloc.
    # Label-based lookup (.loc / Series[...]) is only equivalent while the
    # index happens to be a fresh 0..n-1 range.
    train_x, valid_x = X_train.iloc[train_idx], X_train.iloc[valid_idx]
    train_y = y_train.iloc[train_idx]

    for i, model in enumerate(base_models):
        # Refit the base model on this fold's training part.
        model.fit(train_x, train_y)

        # Out-of-fold predictions become the meta-features of the validation
        # rows; predictions on X_test are stored per fold.
        X_train_meta[valid_idx, i] = model.predict_proba(valid_x)[:, 1]
        X_test_meta[fold, :, i] = model.predict_proba(X_test)[:, 1]

# Average the per-fold test predictions into one meta-feature matrix.
X_test_meta = X_test_meta.mean(axis=0)

# Train the meta model on the out-of-fold meta-features, then predict labels
# for the test rows from the fold-averaged meta-features.
final_predictions = meta_model.fit(X_train_meta, y_train).predict(X_test_meta)