# 앙상블(Ensemble) 학습

In [1]:
# Load scikit-learn's built-in breast cancer dataset; the feature matrix is
# read from cancer.data and the class labels from cancer.target below.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [2]:
# Standardize every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so statistics of the future test rows influence the preprocessing
# (data leakage). For an unbiased test score, split first and fit the scaler
# on the training split only. Left unchanged here because the recorded
# outputs in this notebook were produced with this preprocessing.
from sklearn.preprocessing import StandardScaler
cancer_std = StandardScaler().fit_transform(cancer.data)

In [3]:
# Split into train/test sets: 20% of the samples are held out for testing,
# the split is stratified on the labels, and the seed is fixed so the same
# split is reproduced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std,
    cancer.target,
    test_size=0.2,
    stratify=cancer.target,
    random_state=2022,
)

### 1. Voting
#### 1.1 Hard Voting
- 로지스틱 회귀
- SVM
- K 최근접 이웃

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [5]:
# The three base classifiers combined by the voting ensembles below.
# The seed is fixed on the two estimators that accept random_state.
lrc, svc, knn = (
    LogisticRegression(random_state=2022),
    SVC(random_state=2022),
    KNeighborsClassifier(),
)

In [6]:
# Ensemble classifier for hard voting: the three base estimators are
# registered under the names 'LRC', 'SVC' and 'KNN'.
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    voting='hard',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc),
        ('KNN', knn),
    ],
)

In [7]:
# Train the hard-voting ensemble and report its accuracy on the test set.
# fit() returns the fitted estimator, so the two calls can be chained.
voc.fit(X_train, y_train).score(X_test, y_test)

1.0

In [8]:
# Performance of each individual classifier: fit every base estimator on the
# training set and print its test accuracy, one value per line, in the order
# logistic regression, SVM, k-NN.
print(
    *(est.fit(X_train, y_train).score(X_test, y_test) for est in (lrc, svc, knn)),
    sep='\n',
)

0.9912280701754386
0.9912280701754386
0.9912280701754386


#### 1.2 Soft Voting

- Logistic Regression

In [9]:
# List the attributes and methods of the fitted logistic regression object
# (used here to confirm that predict_proba is available).
dir(lrc)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify',
 'tol',
 'verbose',
 'warm_start']

In [10]:
# Predicted class labels for the first five test samples
lrc.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [11]:
# Each row is [probability of class 0, probability of class 1]
lrc.predict_proba(X_test[:5])

array([[9.99999999e-01, 1.45395340e-09],
       [1.29048390e-02, 9.87095161e-01],
       [9.99988662e-01, 1.13377133e-05],
       [5.23798806e-03, 9.94762012e-01],
       [9.99999953e-01, 4.74465654e-08]])

- Support Vector Classifier

In [12]:
# List the attributes and methods of the fitted SVC object
dir(svc)

['C',
 '__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_proba',
 '_compute_kernel',
 '_decision_function',
 '_dense_decision_function',
 '_dense_fit',
 '_dense_predict',
 '_dense_predict_proba',
 '_dual_coef_',
 '_estimator_type',
 '_gamma',
 '_get_coef',
 '_get_param_names',
 '_get_tags',
 '_impl',
 '_intercept_',
 '_more_tags',
 '_n_support',
 '_pairwise',
 '_probA',
 '_probB',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sparse',
 '_sparse_decision_function',
 '_sparse_fit',
 '_sparse_kernels',
 '_sparse_predict',
 '_sparse_predict_proba',
 ...]

In [13]:
# svc was constructed without probability=True, so predict_proba is not
# available on it and the call raises AttributeError. The error is caught and
# printed so that this demonstration does not abort a top-to-bottom run of
# the notebook.
try:
    svc.predict_proba(X_test[:5])
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: predict_proba is not available when probability=False

In [14]:
# Setting the hyperparameter probability=True makes predict_proba() usable.
# fit() returns the estimator itself, so svc2 is the fitted model.
svc2 = SVC(random_state=2022, probability=True).fit(X_train, y_train)
svc2.predict_proba(X_test[:5])

array([[9.99896554e-01, 1.03445598e-04],
       [7.53631647e-06, 9.99992464e-01],
       [9.99957780e-01, 4.22200830e-05],
       [1.11084633e-05, 9.99988892e-01],
       [9.99216287e-01, 7.83713010e-04]])

- K Nearest Neighbors

In [15]:
# Class probabilities from the k-NN model for the first five test samples
knn.predict_proba(X_test[:5])

array([[1., 0.],
       [0., 1.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [17]:
# Class probabilities from the k-NN model for the last five test samples
knn.predict_proba(X_test[-5:])

array([[0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0. , 1. ]])

- Soft Voting

In [18]:
# Soft-voting ensemble. It uses svc2 (built with probability=True) instead of
# svc, because soft voting needs predict_proba from every base estimator.
voc2 = VotingClassifier(
    voting='soft',
    estimators=[
        ('LRC', lrc),
        ('SVC', svc2),
        ('KNN', knn),
    ],
)

In [19]:
# Train the soft-voting ensemble and report its accuracy on the test set.
# fit() returns the fitted estimator, so the two calls can be chained.
voc2.fit(X_train, y_train).score(X_test, y_test)

1.0

In [20]:
# Class labels predicted by the soft-voting ensemble for the first five test samples
voc2.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [21]:
# Class probabilities from the soft-voting ensemble for the first five test samples
voc2.predict_proba(X_test[:5])

array([[9.99965518e-01, 3.44823508e-05],
       [4.30412510e-03, 9.95695875e-01],
       [9.99982147e-01, 1.78525988e-05],
       [1.74969884e-03, 9.98250301e-01],
       [9.99738747e-01, 2.61253486e-04]])

- GridSearchCV

In [22]:
# Current value of the C hyperparameter on the logistic regression and SVC
# estimators (displayed as a tuple) before it is tuned below
lrc.C, svc2.C

(1.0, 1.0)

In [23]:
# Search grid for the voting ensemble: the same candidate C values for the
# logistic regression ('LRC') and the SVC ('SVC') members. Keys have the form
# '<estimator name>__<parameter name>'.
params = {key: [0.1, 1, 10] for key in ('LRC__C', 'SVC__C')}

In [24]:
# 5-fold cross-validated grid search over `params` on the soft-voting
# ensemble, scored by accuracy. fit() returns the search object itself, so
# grid_voc is the fitted search; the cell displays the best parameter set.
from sklearn.model_selection import GridSearchCV
grid_voc = GridSearchCV(
    estimator=voc2,
    param_grid=params,
    cv=5,
    scoring='accuracy',
).fit(X_train, y_train)
grid_voc.best_params_

{'LRC__C': 10, 'SVC__C': 1}

In [25]:
# After repeating the two steps above (adjusting the grid and re-running the
# search) until the best parameters are found, score the best estimator
# selected by the search on the test set.
grid_voc.best_estimator_.score(X_test, y_test)

1.0

### 2. Bagging 방식 - Random Forest

In [26]:
# Hyperparameters: create a random forest with a fixed seed and display all
# of its hyperparameters with their current values.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [27]:
# Train the random forest and report its accuracy on the test set.
# fit() returns the fitted estimator, so the two calls can be chained.
rfc.fit(X_train, y_train).score(X_test, y_test)

1.0

In [28]:
# Class probabilities from the random forest for the first five test samples
rfc.predict_proba(X_test[:5])

array([[1.  , 0.  ],
       [0.  , 1.  ],
       [1.  , 0.  ],
       [0.  , 1.  ],
       [0.99, 0.01]])