# 앙상블(Ensemble)

In [1]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [2]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
cancer_scaled = scaler.fit_transform(cancer.data)

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify=cancer.target,
    test_size=0.2, random_state=2021
)

### 앙상블 학습을 위한 분류기 - 하드 보팅
- 로지스틱 회귀
- 서포트 벡터 머신
- K-최근접 이웃

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [5]:
lrc = LogisticRegression(random_state=2021)
svc = SVC(random_state=2021)
knn = KNeighborsClassifier()

In [6]:
from sklearn.ensemble import VotingClassifier

voc = VotingClassifier(
    estimators=[('LR',lrc), ('SVC',svc), ('KNN',knn)],
    voting='hard'
)

In [7]:
voc.fit(X_train, y_train)
pred_vo = voc.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_vo)

0.9824561403508771

### 소프트 보팅
- 로지스틱 회귀
- K-최근접 이웃

In [9]:
voc = VotingClassifier(
    estimators=[('LR',lrc), ('KNN',knn)],
    voting='soft'
)
voc.fit(X_train, y_train)
pred_vo = voc.predict(X_test)
accuracy_score(y_test, pred_vo)

0.9824561403508771

In [10]:
prob = voc.predict_proba(X_test)
prob[:5]

array([[0.38174111, 0.61825889],
       [0.98581471, 0.01418529],
       [0.19347825, 0.80652175],
       [0.01066731, 0.98933269],
       [0.02741955, 0.97258045]])

### Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

In [12]:
rfc = RandomForestClassifier(random_state=2021)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9736842105263158

In [13]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

### K-Nearest Neighbor

In [14]:
from sklearn.neighbors import KNeighborsClassifier

In [15]:
knn = KNeighborsClassifier()        # n_neighbors=5 가 디폴트
knn.fit(X_train, y_train)
knn.score(X_test, y_test)

0.9824561403508771

In [16]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

### 로지스틱 회귀

In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
lrc = LogisticRegression(random_state=2021)
lrc.fit(X_train, y_train)
lrc.score(X_test, y_test)

0.9824561403508771

In [19]:
pred_lr = lrc.predict(X_test)
accuracy_score(y_test, pred_lr)

0.9824561403508771