### 앙상블(Ensemble) 학습

In [2]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
from sklearn.preprocessing import MinMaxScaler
cancer_scaled = MinMaxScaler().fit_transform(cancer.data)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify = cancer.target, random_state = 1018, test_size = 0.2
)

### 1. Voting 방식
#### 1.1 Hard voting
  - 로지스틱 회귀
  - 서포트 벡터 머신
  - K최근접 이웃

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [17]:
lrc = LogisticRegression(random_state = 1018)
svc = SVC(random_state= 1018)
knn = KNeighborsClassifier() # K최근접 이웃은 random_state 파라미터가 없다

In [18]:
# 하드보팅을 위한 Ensemble 분류기
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
  estimators = [("LRC",lrc),("SVC", svc), ("KNN", knn)], # 학습기 이름과 객체를 튜플 안에 넣어서 전달한다
  voting = "hard"
)

In [19]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

0.9824561403508771

In [24]:
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
lrc.score(X_train,y_train), svc.score(X_train,y_train), knn.score(X_train,y_train)


(0.9736263736263736, 0.9846153846153847, 0.978021978021978)

1.2 Soft Voting

In [None]:
# 객체의 속성과 메소드 
dir(svc)

In [31]:
lrc.predict_log_proba(X_test[:5])

array([[-2.15438656e+00, -1.23269158e-01],
       [-6.22731608e-03, -5.08192189e+00],
       [-2.74778988e-02, -3.60808077e+00],
       [-4.97439406e-04, -7.60628551e+00],
       [-1.51547416e+00, -2.48081926e-01]])

In [33]:
lrc.predict(X_test[:5])

array([1, 0, 0, 0, 1])

- SVC

In [43]:
# SVC는 probability = True 값을 넣어줘야만 predict_proba 함수를 사용할수 있다
# fit을 이미 한 뒤에 set_params로 파라미터 값을 조정할 경우 fit을 다시 시켜야 한다
svc.set_params(probability = True)
svc.fit(X_train,y_train)
svc.predict_proba(X_test[:5])

array([[1.29683357e-02, 9.87031664e-01],
       [9.99942502e-01, 5.74980427e-05],
       [9.94128156e-01, 5.87184429e-03],
       [9.98645784e-01, 1.35421569e-03],
       [2.42988173e-02, 9.75701183e-01]])

In [42]:
svc2 = SVC(random_state = 1018, probability = True)
svc2.fit(X_train, y_train)
svc2.predict_proba(X_test[:5])

array([[1.29683357e-02, 9.87031664e-01],
       [9.99942502e-01, 5.74980427e-05],
       [9.94128156e-01, 5.87184429e-03],
       [9.98645784e-01, 1.35421569e-03],
       [2.42988173e-02, 9.75701183e-01]])

- knn

In [45]:
knn.predict_proba(X_test[-5:])

array([[0. , 1. ],
       [1. , 0. ],
       [1. , 0. ],
       [0.4, 0.6],
       [0. , 1. ]])

- soft Voting

In [46]:
voc2 = VotingClassifier(
  estimators = [("LRC",lrc),("SVC", svc2), ("KNN",knn)],
  voting = "soft"
)
voc2.fit(X_train, y_train)
voc2.score(X_test,y_test)

0.9824561403508771

In [47]:
voc2.predict_proba(X_test[-5:])

array([[1.71989551e-03, 9.98280104e-01],
       [9.99223609e-01, 7.76391494e-04],
       [9.98571039e-01, 1.42896059e-03],
       [4.09076798e-01, 5.90923202e-01],
       [3.28217241e-03, 9.96717828e-01]])

- GridSearchCV

In [48]:
lrc.C, svc2.C

(1.0, 1.0)

In [57]:
params = {
    'LRC__C':[5, 10, 12],
    "SVC__C":[0.5, 1,2],
}

In [58]:
from sklearn.model_selection import GridSearchCV
grid_voc2 = GridSearchCV(
    voc2, params, scoring = "accuracy", cv = 5
)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 5, 'SVC__C': 2}

In [59]:
params = {
    'LRC__C':[3,4,5,6,7],
    "SVC__C":[1,2,3,4,5],
}
from sklearn.model_selection import GridSearchCV
grid_voc2 = GridSearchCV(
    voc2, params, scoring = "accuracy", cv = 5
)
grid_voc2.fit(X_train, y_train)
grid_voc2.best_params_

{'LRC__C': 4, 'SVC__C': 2}

In [60]:
grid_voc2.best_estimator_.score(X_test, y_test)

0.9824561403508771

### 2. Bagging 방식 - Random Forest

In [61]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state = 2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [62]:
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9649122807017544