In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
from xgboost import XGBClassifier
from lightgbm.sklearn import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [4]:
# Read the feature matrix (X) and the label vector (y) from their .npy files.
X, y = (
    np.load(npy_path)
    for npy_path in ("./tatanic_X_train.npy", "./tatanic_y_train.npy")
)

In [5]:
# Base (level-0) learners for the stacking experiment.
# Every model gets the same fixed seed so that re-running the notebook
# reproduces the same fitted models and the same downstream scores.
estimator1 = XGBClassifier(max_depth=3, learning_rate=0.5, n_estimators=50, n_jobs=-1, random_state=42)
estimator2 = LGBMClassifier(max_depth=2, learning_rate=0.5, n_estimators=50, n_jobs=-1, random_state=42)
estimator3 = RandomForestClassifier(n_estimators=500, max_depth=3, n_jobs=-1, random_state=42)
# probability=True is needed because predict_proba is called on every base learner later on.
estimator4 = SVC(probability=True, random_state=42)
# Disabled candidate: a multi-layer perceptron as a fifth base learner.
# estimator5 = MLPClassifier(hidden_layer_sizes=(512,256, 32))

In [6]:
# Collect the base learners in one list so they can be fitted and queried in a loop.
base_estimators = [
    estimator1,
    estimator2,
    estimator3,
    estimator4,
]

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 40% of the rows: the base learners are fitted on the training part,
# and their predictions on the held-out part become the meta-level data set.
# A fixed random_state makes the split (and everything derived from it) repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((533, 27), (356, 27), (533,), (356,))

In [9]:
# Fit each base learner in place on the training split.
for estimator in base_estimators:
    estimator.fit(X_train, y_train)

In [10]:
# Convert the test data into predicted classes or probability values
# (i.e. turn the X data into probability data).
base_estimators[0].predict_proba(X_test)

array([[0.88502073, 0.11497927],
       [0.9078408 , 0.09215921],
       [0.3092692 , 0.6907308 ],
       [0.9308935 , 0.0691065 ],
       [0.00272995, 0.99727005],
       [0.60250175, 0.39749828],
       [0.930884  , 0.06911601],
       [0.04314405, 0.95685595],
       [0.8591654 , 0.14083464],
       [0.09440607, 0.90559393],
       [0.2622971 , 0.7377029 ],
       [0.91105944, 0.08894058],
       [0.929312  , 0.07068798],
       [0.6191321 , 0.3808679 ],
       [0.94228745, 0.05771256],
       [0.9678691 , 0.03213092],
       [0.91105944, 0.08894058],
       [0.37348413, 0.62651587],
       [0.0027954 , 0.9972046 ],
       [0.94906527, 0.05093472],
       [0.04280436, 0.95719564],
       [0.17066634, 0.82933366],
       [0.024216  , 0.975784  ],
       [0.9305407 , 0.06945929],
       [0.5432937 , 0.45670632],
       [0.9936541 , 0.00634591],
       [0.8636065 , 0.13639347],
       [0.978336  , 0.021664  ],
       [0.00518304, 0.99481696],
       [0.9467569 , 0.0532431 ],
       [0.

# 예측한 class 자체를 다시 또 학습한다

In [11]:
# Meta-level features from hard predictions: after the transpose each column holds
# one base estimator's predicted class for every row of X_test.
meta_train_set = np.transpose(
    np.array([model.predict(X_test) for model in base_estimators])
)

In [12]:
meta_train_set # 356 x 4: one row per X_test sample, one column per base estimator
# Phase 1: train, on subset1, the models that will turn subset2 into a different set of features.
# The subset itself thus becomes a different feature set.
# In other words, instead of a plain majority vote over the models as in a vanilla ensemble,
# learn the voting itself.
# Alternative: besides the outputs that the subset1-trained models produce for subset2,
# also concatenate subset2's original X features and train on the combined matrix.

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 1., 1.],
       ...,
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [0., 0., 0., 0.]])

In [13]:
from sklearn.model_selection import cross_val_score

In [28]:
# Try each base model as the meta-learner: mean 5-fold CV accuracy when trained on
# the base models' predicted classes for the held-out split.
for estimator in base_estimators:
    result = np.mean(
        cross_val_score(estimator, meta_train_set, y_test, scoring="accuracy", cv=5)
    )
    print(result)

0.8261904761904763
0.8261904761904763
0.8290476190476191
0.8261904761904763


In [15]:
# Meta-level features from soft predictions: after the transpose each column holds
# column 1 of one base estimator's predict_proba output for every row of X_test.
meta_train_set2 = np.transpose(
    np.array([model.predict_proba(X_test)[:, 1] for model in base_estimators])
)

In [16]:
# Show the probability-based meta features: one column per base estimator, taken from
# column 1 of predict_proba (presumably the positive class — confirm via classes_).
meta_train_set2

array([[0.11497927, 0.12179135, 0.14172594, 0.15354737],
       [0.09215921, 0.09122816, 0.13298038, 0.15323276],
       [0.69073081, 0.66027134, 0.65642621, 0.76662845],
       ...,
       [0.7743324 , 0.67166478, 0.71000297, 0.80446644],
       [0.8473016 , 0.71126666, 0.56628192, 0.7423236 ],
       [0.0199352 , 0.03801907, 0.29348125, 0.23871014]])

In [17]:
# Same meta-learner comparison as before, but trained on the base models'
# predicted probabilities instead of their hard class predictions.
for estimator in base_estimators:
    result = np.mean(
        cross_val_score(estimator, meta_train_set2, y_test, scoring="accuracy", cv=5)
    )
    print(result)

0.8148412698412699
0.8094444444444445
0.8289682539682539
0.8318253968253968


In [18]:
# Append the probability meta features as extra columns to the original held-out features.
new_X_test = np.hstack([X_test, meta_train_set2])
new_X_test.shape

(356, 31)

In [19]:
# Meta-learner comparison on the augmented matrix (original features + base-model probabilities).
for estimator in base_estimators:
    result = np.mean(
        cross_val_score(estimator, new_X_test, y_test, scoring="accuracy", cv=5)
    )
    print(result)

0.7980158730158731
0.7923809523809523
0.8234126984126984
0.8123809523809523
