### 앙상블 학습
1. Voting 방식
  - Hard Voting : 로지스틱 회귀, SVM, K-nearest

In [1]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
from sklearn.preprocessing import MinMaxScaler
cancer_scaled=MinMaxScaler().fit_transform(cancer.data)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify=cancer.target,
    test_size=0.2, random_state=2022
)

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [6]:
lrc = LogisticRegression(random_state=2022)
svc = SVC(random_state=2022)
knn = KNeighborsClassifier()

In [8]:
# 하드 보팅을 위한 앙상블 분류기
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC',svc), ('KNN',knn)], voting='hard'
)

In [9]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

1.0

In [11]:
# 개별 분류기의 성능
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
lrc.score(X_test, y_test), svc.score(X_test, y_test), knn.score(X_test, y_test)

(0.9912280701754386, 1.0, 0.9824561403508771)

1. Voting 방식
  - Soft Voting

In [None]:
dir(lrc)

In [13]:
# LRC

lrc.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [14]:
lrc.predict_proba(X_test[:5])

array([[0.99792166, 0.00207834],
       [0.07775117, 0.92224883],
       [0.9774613 , 0.0225387 ],
       [0.05952966, 0.94047034],
       [0.99554778, 0.00445222]])

In [17]:
# SVM

svc.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [16]:
svc.predict_proba(X_test[:5]) #probability=False 이므로 불가

AttributeError: ignored

In [20]:
svc2=SVC(probability=True, random_state=2022)
svc2.fit(X_train, y_train)
svc2.predict_proba(X_test[:5])

array([[9.99896299e-01, 1.03701492e-04],
       [3.84470713e-03, 9.96155293e-01],
       [9.99896384e-01, 1.03616009e-04],
       [5.97356113e-03, 9.94026439e-01],
       [9.99311796e-01, 6.88204061e-04]])

In [21]:
# KNN

knn.predict_proba(X_test[:5])

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [22]:
knn.predict_proba(X_test[-5:])

array([[0.8, 0.2],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0. , 1. ]])

In [23]:
# Soft Voting
voc2 = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC',svc2), ('KNN',knn)], voting='soft'
)

In [24]:
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

1.0

In [25]:
voc2.predict_proba(X_test[:5])

array([[9.99272654e-01, 7.27346212e-04],
       [2.71986265e-02, 9.72801374e-01],
       [9.92452563e-01, 7.54743719e-03],
       [2.18344062e-02, 9.78165594e-01],
       [9.98286525e-01, 1.71347474e-03]])

In [26]:
# GridSearchCV

lrc.C, svc2.C

(1.0, 1.0)

In [27]:
params = {
    'LRC__C' : [0.1, 1, 10], #lrc.C 를 의미
    'SVC__C' : [0.1, 1, 10]  #svc2.C 를 의미
}

In [28]:
from sklearn.model_selection import GridSearchCV
grid_voc2= GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.score(X_test, y_test)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 0.1}

In [30]:
params = {
    'LRC__C' : [5, 10, 30], #lrc.C 를 의미
    'SVC__C' : [0.05, 0.1, 0.3]  #svc2.C 를 의미
}

In [31]:
grid_voc2= GridSearchCV(voc2, params, scoring='accuracy', cv=5)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 10, 'SVC__C': 0.05}

In [34]:
grid_voc2.best_estimator_.score(X_test, y_test)

1.0

#### 2. Bagging 방식 - RF

In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [36]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

1.0