# 앙상블(Ensemble)

* 일반화와 강건성(Robustness)을 향상시키기 위해 여러 모델의 예측 값을 결합하는 방법
* 앙상블에는 크게 두 가지 종류가 존재
  * 평균 방법
    * 여러 개의 추정값을 독립적으로 구한 뒤 평균을 취함
    * 결합 추정값은 분산이 줄어들기 때문에 단일 추정값보다 좋은 성능을 보임
  * 부스팅 방법
    * 순차적으로 모델 생성
    * 결합된 모델의 편향을 감소시키기 위해 노력
    * 부스팅 방법의 목표는 여러 개의 약한 모델들을 결합해 하나의 강력한 앙상블 모델을 구축하는 것

## Bagging meta-estimator

* bagging은 bootstrap aggregating의 줄임말
* 원래 훈련 데이터셋의 일부를 사용해 여러 모델을 훈련
* 각각의 결과를 결합해 최종 결과를 생성
* 분산을 줄이고 과적합을 막음
* 강력하고 복잡한 모델에서 잘 동작

In [1]:
from sklearn.datasets import load_boston, load_wine, load_iris, load_breast_cancer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import make_pipeline

import xgboost as xg

In [44]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier

In [3]:
from sklearn.ensemble import BaggingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

### Bagging을 사용한 분류

#### 데이터셋 불러오기

In [6]:
# Load every bundled dataset used below in one go.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs on older releases.
iris, wine, boston, cancer = (
    load_iris(),
    load_wine(),
    load_boston(),
    load_breast_cancer(),
)

#### KNN

##### 붓꽃 데이터

In [9]:
# Baseline: standardise the features, then classify with k-nearest neighbours.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagged variant: 10 copies of the baseline, each configured to draw
# 50% of the samples and 50% of the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [19]:
# 5-fold cross-validation of the stand-alone KNN pipeline on iris.
cross_val = cross_validate(
    base_model, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
)

# Report the mean of each timing/score array returned by cross_validate.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.0010824203491210938
avg score time : 0.0015468597412109375
avg test score : 0.96


In [20]:
# 5-fold cross-validation of the bagged KNN ensemble on iris.
cross_val = cross_validate(
    bagging_model, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
)

# Report the mean of each timing/score array returned by cross_validate.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.020367956161499022
avg score time : 0.007782983779907227
avg test score : 0.9199999999999999


##### 와인 데이터

In [None]:
# Fresh baseline for the wine data: scaling followed by k-nearest neighbours.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging wrapper: 10 estimators, each on half the samples and half the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [21]:
# 5-fold cross-validation of the stand-alone KNN pipeline on wine.
cross_val = cross_validate(
    base_model, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.0014110565185546874
avg score time : 0.002644634246826172
avg test score : 0.9493650793650794


In [22]:
# 5-fold cross-validation of the bagged KNN ensemble on wine.
cross_val = cross_validate(
    bagging_model, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.020185327529907225
avg score time : 0.0078066825866699215
avg test score : 0.9607936507936508


##### 유방암 데이터

In [25]:
# Baseline for the breast-cancer data: scaling followed by k-nearest neighbours.
base_model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Bagging wrapper: 10 estimators, each on half the samples and half the features.
bagging_model = BaggingClassifier(
    base_model,
    n_estimators=10,
    max_samples=0.5,
    max_features=0.5,
)

In [26]:
# 5-fold cross-validation of the stand-alone KNN pipeline on breast cancer.
cross_val = cross_validate(
    base_model, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.0024413108825683595
avg score time : 0.0073527336120605465
avg test score : 0.9648501785437045


In [27]:
# 5-fold cross-validation of the bagged KNN ensemble on breast cancer.
cross_val = cross_validate(
    bagging_model, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.024027585983276367
avg score time : 0.015720033645629884
avg test score : 0.9595870206489675


#### SVC

##### 붓꽃 데이터

In [28]:
# Baseline SVC pipeline and its bagged counterpart (10 estimators, each on
# half the samples and half the features).
base_model = make_pipeline(StandardScaler(), SVC())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on iris and
# print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0014772891998291015
avg score time : 0.00044188499450683596
avg test score : 0.9666666666666666
avg fit time : 0.027752065658569337
avg score time : 0.0033451080322265624
avg test score : 0.9533333333333334


##### 와인 데이터

In [29]:
# Baseline SVC pipeline and its bagged counterpart (10 estimators, each on
# half the samples and half the features).
base_model = make_pipeline(StandardScaler(), SVC())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on wine and
# print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0020192146301269533
avg score time : 0.0005350589752197265
avg test score : 0.9833333333333334
avg fit time : 0.029182243347167968
avg score time : 0.003511953353881836
avg test score : 0.9496825396825397


##### 유방암 데이터

In [30]:
# Baseline SVC pipeline and its bagged counterpart (10 estimators, each on
# half the samples and half the features).
base_model = make_pipeline(StandardScaler(), SVC())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# breast-cancer data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.005692815780639649
avg score time : 0.0012214183807373047
avg test score : 0.9736376339077782
avg fit time : 0.03967585563659668
avg score time : 0.007697153091430664
avg test score : 0.9631113181183046


#### Decision Tree

##### 붓꽃 데이터

In [31]:
# Baseline decision-tree pipeline and its bagged counterpart (10 estimators,
# each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on iris and
# print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0014908790588378906
avg score time : 0.00036869049072265627
avg test score : 0.9533333333333334
avg fit time : 0.025547313690185546
avg score time : 0.0025892257690429688
avg test score : 0.9333333333333332


##### 와인 데이터

In [32]:
# Baseline decision-tree pipeline and its bagged counterpart (10 estimators,
# each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on wine and
# print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0016223430633544923
avg score time : 0.0003436088562011719
avg test score : 0.8874603174603175
avg fit time : 0.03054633140563965
avg score time : 0.002828502655029297
avg test score : 0.9330158730158731


##### 유방암 데이터

In [33]:
# Baseline decision-tree pipeline and its bagged counterpart (10 estimators,
# each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), DecisionTreeClassifier())
bagging_model = BaggingClassifier(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# breast-cancer data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.008166646957397461
avg score time : 0.0005358219146728516
avg test score : 0.927899394503959
avg fit time : 0.03580632209777832
avg score time : 0.0024790287017822264
avg test score : 0.9613569321533924


### Bagging을 사용한 회귀

#### 데이터셋 불러오기

In [35]:
from sklearn.datasets import load_diabetes

# Datasets for the regression experiments. `boston` is reloaded here so the
# regression section does not depend on the classification cells having run.
diabetes = load_diabetes()
boston = load_boston()

#### KNN

##### 보스턴 주택 가격 데이터

In [41]:
# Baseline KNN regression pipeline and its bagged counterpart (10 estimators,
# each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# Boston housing data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, boston.data, boston.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.001440715789794922
avg score time : 0.0016593456268310547
avg test score : 0.47357748833823543
avg fit time : 0.023872900009155273
avg score time : 0.008948707580566406
avg test score : 0.5113321554479036


##### 당뇨병 데이터

In [42]:
# Baseline KNN regression pipeline and its bagged counterpart (10 estimators,
# each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), KNeighborsRegressor())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# diabetes data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, diabetes.data, diabetes.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0013749122619628907
avg score time : 0.001666688919067383
avg test score : 0.3689720650295623
avg fit time : 0.019614362716674806
avg score time : 0.008825349807739257
avg test score : 0.407014191563816


#### SVR

##### 보스턴 주택 가격 데이터

In [45]:
# Baseline SVR pipeline and its bagged counterpart (10 estimators, each on
# half the samples and half the features).
base_model = make_pipeline(StandardScaler(), SVR())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# Boston housing data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, boston.data, boston.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.013436698913574218
avg score time : 0.0023053646087646484
avg test score : 0.17631266230186618
avg fit time : 0.0470644474029541
avg score time : 0.010325813293457031
avg test score : 0.1434902494365399


##### 당뇨병 데이터

In [46]:
# Baseline SVR pipeline and its bagged counterpart (10 estimators, each on
# half the samples and half the features).
base_model = make_pipeline(StandardScaler(), SVR())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# diabetes data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, diabetes.data, diabetes.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.010359477996826173
avg score time : 0.0021965503692626953
avg test score : 0.14659936199629434
avg fit time : 0.03951563835144043
avg score time : 0.008149003982543946
avg test score : 0.06678396443589277


#### Decision Tree

##### 보스턴 주택 가격 데이터

In [48]:
# Baseline decision-tree regression pipeline and its bagged counterpart
# (10 estimators, each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), DecisionTreeRegressor())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# Boston housing data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, boston.data, boston.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.004863166809082031
avg score time : 0.0006844043731689453
avg test score : 0.11870815150991651
avg fit time : 0.03270645141601562
avg score time : 0.0026612281799316406
avg test score : 0.3434864838018063


##### 당뇨병 데이터

In [49]:
# Baseline decision-tree regression pipeline and its bagged counterpart
# (10 estimators, each on half the samples and half the features).
base_model = make_pipeline(StandardScaler(), DecisionTreeRegressor())
bagging_model = BaggingRegressor(
    base_model, n_estimators=10, max_samples=0.5, max_features=0.5
)

# Evaluate the baseline first, then the ensemble, with 5-fold CV on the
# diabetes data and print the averaged timings and score for each.
for estimator in (base_model, bagging_model):
    cross_val = cross_validate(
        estimator, diabetes.data, diabetes.target, cv=5, n_jobs=-1, verbose=False
    )
    for label, key in (
        ("avg fit time :", "fit_time"),
        ("avg score time :", "score_time"),
        ("avg test score :", "test_score"),
    ):
        print(label, cross_val[key].mean())

avg fit time : 0.0031306266784667967
avg score time : 0.0006098747253417969
avg test score : -0.13381034860851782
avg fit time : 0.02947096824645996
avg score time : 0.002658176422119141
avg test score : 0.364114298278775


## Forests of randomized trees

* `sklearn.ensemble` 모듈에는 무작위 결정 트리를 기반으로 하는 두 개의 평균화 알고리즘이 존재
  * Random Forest
  * Extra-Trees
* 모델 구성에 임의성을 추가해 다양한 모델 집합이 생성
* 앙상블 모델의 예측은 개별 모델 예측의 평균

In [53]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

### Random Forests 분류

In [57]:
# Feature scaling followed by a random-forest classifier with default settings.
model = make_pipeline(StandardScaler(), RandomForestClassifier())

In [68]:
# 5-fold cross-validation of the random-forest classifier on iris.
cross_val = cross_validate(
    model, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.11230559349060058
avg score time : 0.010661935806274414
avg test score : 0.9533333333333334


In [60]:
# 5-fold cross-validation of the random-forest classifier on wine.
cross_val = cross_validate(
    model, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.1590709686279297
avg score time : 0.010297012329101563
avg test score : 0.9722222222222221


In [61]:
# 5-fold cross-validation of the random-forest classifier on breast cancer.
cross_val = cross_validate(
    model, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.22337846755981444
avg score time : 0.010760831832885741
avg test score : 0.9631113181183046


### Random Forests 회귀

In [63]:
# Feature scaling followed by a random-forest regressor with default settings.
model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [64]:
# 5-fold cross-validation of the random-forest regressor on Boston housing.
cross_val = cross_validate(
    model, boston.data, boston.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.33797192573547363
avg score time : 0.009491157531738282
avg test score : 0.612025923707346


In [65]:
# 5-fold cross-validation of the random-forest regressor on diabetes.
cross_val = cross_validate(
    model, diabetes.data, diabetes.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.2823489189147949
avg score time : 0.009240341186523438
avg test score : 0.4200046773863654


### Extremely Randomized Trees 분류

In [69]:
# Feature scaling followed by an extra-trees classifier with default settings.
model = make_pipeline(StandardScaler(), ExtraTreesClassifier())

In [70]:
# 5-fold cross-validation of the extra-trees classifier on iris.
cross_val = cross_validate(
    model, iris.data, iris.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.10996417999267578
avg score time : 0.011180877685546875
avg test score : 0.9533333333333334


In [71]:
# 5-fold cross-validation of the extra-trees classifier on wine.
cross_val = cross_validate(
    model, wine.data, wine.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.11526150703430176
avg score time : 0.010832834243774413
avg test score : 0.9888888888888889


In [72]:
# 5-fold cross-validation of the extra-trees classifier on breast cancer.
cross_val = cross_validate(
    model, cancer.data, cancer.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.13104147911071778
avg score time : 0.012049722671508788
avg test score : 0.9631268436578171


### Extremely Randomized Trees 회귀

In [73]:
# Feature scaling followed by an extra-trees regressor with default settings.
model = make_pipeline(StandardScaler(), ExtraTreesRegressor())

In [74]:
# 5-fold cross-validation of the extra-trees regressor on Boston housing.
cross_val = cross_validate(
    model, boston.data, boston.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.19987716674804687
avg score time : 0.010106134414672851
avg test score : 0.6373928437500431


In [75]:
# 5-fold cross-validation of the extra-trees regressor on diabetes.
cross_val = cross_validate(
    model, diabetes.data, diabetes.target, cv=5, n_jobs=-1, verbose=False
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.17924270629882813
avg score time : 0.009954595565795898
avg test score : 0.43480735829367473


### Random Forest, Extra Tree 시각화

* 결정 트리, Random Forest, Extra Tree의 결정 경계와 회귀식 시각화

## AdaBoost

* 대표적인 부스팅 알고리즘
* 일련의 약한 모델들을 학습
* 수정된 버전의 데이터를 반복 학습 (가중치가 적용된)
* 가중 투표(또는 가중 합)를 통해 각 모델의 예측 값을 결합
* 첫 단계에서는 원본 데이터를 학습하고 연속적인 반복마다 개별 샘플에 대한 가중치가 수정되고 다시 모델이 학습
  * 잘못 예측된 샘플은 가중치 증가, 올바르게 예측된 샘플은 가중치 감소
  * 각각의 약한 모델들은 예측하기 어려운 샘플에 집중하게 됨

![AdaBoost](https://scikit-learn.org/stable/_images/sphx_glr_plot_adaboost_hastie_10_2_0011.png)

In [76]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import AdaBoostRegressor

### AdaBoost 분류

In [77]:
# Feature scaling followed by an AdaBoost classifier with default settings.
model = make_pipeline(StandardScaler(), AdaBoostClassifier())

In [78]:
# 5-fold cross-validation of the AdaBoost classifier on iris.
cross_val = cross_validate(
    model, iris.data, iris.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

In [80]:
# 5-fold cross-validation of the AdaBoost classifier on wine.
cross_val = cross_validate(
    model, wine.data, wine.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.08787250518798828
avg score time : 0.008778047561645509
avg test score : 0.8085714285714285


In [79]:
# 5-fold cross-validation of the AdaBoost classifier on breast cancer.
cross_val = cross_validate(
    model, cancer.data, cancer.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.154426908493042
avg score time : 0.009656143188476563
avg test score : 0.9718677224033534


### AdaBoost 회귀

In [81]:
# Feature scaling followed by an AdaBoost regressor with default settings.
model = make_pipeline(StandardScaler(), AdaBoostRegressor())

In [82]:
# 5-fold cross-validation of the AdaBoost regressor on Boston housing.
cross_val = cross_validate(
    model, boston.data, boston.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.09816575050354004
avg score time : 0.00463571548461914
avg test score : 0.5828989442274427


In [83]:
# 5-fold cross-validation of the AdaBoost regressor on diabetes.
cross_val = cross_validate(
    model, diabetes.data, diabetes.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.0701606273651123
avg score time : 0.00460968017578125
avg test score : 0.4352393937426764


## Gradient Tree Boosting

* 임의의 미분 가능한 손실 함수로 일반화한 부스팅 알고리즘
* 웹 검색, 분류 및 회귀 등 다양한 분야에서 모두 사용 가능

In [86]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor

### Gradient Tree Boosting 분류

In [87]:
# Feature scaling followed by a gradient-boosting classifier with default settings.
model = make_pipeline(StandardScaler(), GradientBoostingClassifier())

In [88]:
# 5-fold cross-validation of the gradient-boosting classifier on wine.
cross_val = cross_validate(
    model, wine.data, wine.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.2529268264770508
avg score time : 0.0009803295135498047
avg test score : 0.9385714285714286


In [89]:
# 5-fold cross-validation of the gradient-boosting classifier on iris.
cross_val = cross_validate(
    model, iris.data, iris.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.21070528030395508
avg score time : 0.00093994140625
avg test score : 0.9666666666666668


In [90]:
# 5-fold cross-validation of the gradient-boosting classifier on breast cancer.
cross_val = cross_validate(
    model, cancer.data, cancer.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.45519351959228516
avg score time : 0.0008756637573242188
avg test score : 0.9648812296227295


### Gradient Tree Boosting 회귀

In [92]:
# Feature scaling followed by a gradient-boosting regressor with default settings.
model = make_pipeline(StandardScaler(), GradientBoostingRegressor())

In [93]:
# 5-fold cross-validation of the gradient-boosting regressor on Boston housing.
cross_val = cross_validate(
    model, boston.data, boston.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.12517137527465821
avg score time : 0.0009468555450439453
avg test score : 0.6796360919394389


In [94]:
# 5-fold cross-validation of the gradient-boosting regressor on diabetes.
cross_val = cross_validate(
    model, diabetes.data, diabetes.target, cv=5, verbose=False, n_jobs=-1
)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.09932255744934082
avg score time : 0.001022005081176758
avg test score : 0.4088786415316532


## 투표 기반 분류 (Voting Classifier)

* 서로 다른 모델들의 결과를 투표를 통해 결합
* 두 가지 방법으로 투표 가능
  * 가장 많이 예측된 클래스를 정답으로 채택 (hard voting)
  * 예측된 확률의 가중 평균 (soft voting)

In [96]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

In [99]:
# Three heterogeneous classifiers, kept as separate names so each can also be
# scored on its own.
model1, model2, model3 = SVC(), GaussianNB(), RandomForestClassifier()

# Hard voting: the ensemble predicts the class chosen by most members.
vot_model = VotingClassifier(
    estimators=[
        ("svc", model1),
        ("naive", model2),
        ("forest", model3),
    ],
    voting="hard",
)

In [114]:
# Score every individual classifier and the hard-voting ensemble with 5-fold CV
# on iris and print the rounded mean score next to the estimator's class name.
for model in [model1, model2, model3, vot_model]:
    # Read the class name directly instead of slicing it out of
    # str(type(model)), which depends on the exact "<class '...'>" repr format.
    model_name = type(model).__name__
    score = cross_val_score(model, iris.data, iris.target, cv=5, n_jobs=-1)
    print("Accuracy : ", score.mean().round(2), f"[{model_name}]")

Accuracy :  0.97 [SVC]
Accuracy :  0.95 [GaussianNB]
Accuracy :  0.97 [RandomForestClassifier]
Accuracy :  0.96 [VotingClassifier]


In [115]:
# Soft voting averages predicted class probabilities, so SVC is built with
# probability=True to make it expose them.
model1, model2, model3 = SVC(probability=True), GaussianNB(), RandomForestClassifier()

# The weights follow the order of `estimators`: SVC and the forest count twice
# as much as naive Bayes.
vot_model = VotingClassifier(
    estimators=[
        ("svc", model1),
        ("naive", model2),
        ("forest", model3),
    ],
    voting="soft",
    weights=[2, 1, 2],
)

In [116]:
# Score every individual classifier and the soft-voting ensemble with 5-fold CV
# on iris and print the rounded mean score next to the estimator's class name.
for model in (model1, model2, model3, vot_model):
    # Read the class name directly instead of slicing it out of
    # str(type(model)), which depends on the exact "<class '...'>" repr format.
    model_name = type(model).__name__
    score = cross_val_score(model, iris.data, iris.target, cv=5, n_jobs=-1)
    print("Accuracy : ", score.mean().round(2), f"[{model_name}]")

Accuracy :  0.97 [SVC]
Accuracy :  0.95 [GaussianNB]
Accuracy :  0.96 [RandomForestClassifier]
Accuracy :  0.96 [VotingClassifier]


### 결정 경계 시각화

## 투표 기반 회귀 (Voting Regressor)

* 서로 다른 모델의 예측 값의 평균을 사용

In [118]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, VotingRegressor

In [123]:
# Three heterogeneous regressors, kept as separate names so each can also be
# scored on its own.
model1, model2, model3 = (
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    LinearRegression(),
)

# Equal weights: every member contributes equally to the averaged prediction.
vot_model = VotingRegressor(
    estimators=[
        ("gbr", model1),
        ("rfr", model2),
        ("lr", model3),
    ],
    weights=[1, 1, 1],
)

In [124]:
# Score every individual regressor and the voting ensemble with 5-fold CV on
# the Boston housing data and print the rounded mean score per estimator.
for model in [model1, model2, model3, vot_model]:
    # Read the class name directly instead of slicing it out of
    # str(type(model)), which depends on the exact "<class '...'>" repr format.
    model_name = type(model).__name__
    # NOTE(review): no `scoring` is passed, so cross_val_score uses each
    # estimator's default scorer, which for regressors is R^2. The "Accuracy"
    # label below is a misnomer; it is left unchanged so the printed text still
    # matches the recorded output.
    score = cross_val_score(model, boston.data, boston.target, cv=5, n_jobs=-1)
    print("Accuracy : ", score.mean().round(2), f"[{model_name}]")

Accuracy :  0.68 [GradientBoostingRegressor]
Accuracy :  0.63 [RandomForestRegressor]
Accuracy :  0.35 [LinearRegression]
Accuracy :  0.66 [VotingRegressor]


### 회귀식 시각화

## 스택 일반화 (Stacked Generalization)

* 각 모델의 예측 값을 최종 모델의 입력으로 사용
* 모델의 편향을 줄이는 데 효과적

### 스택 회귀

In [125]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import StackingRegressor

In [127]:
# Named first-level regressors whose predictions feed the stacking meta-model.
estimators = [
    ("ridge", Ridge()),
    ("lasso", Lasso()),
    ("svr", SVR()),
]

In [128]:
# Scale the features once, then stack the first-level regressors under a
# gradient-boosting meta-regressor.
model = make_pipeline(
    StandardScaler(),
    StackingRegressor(estimators=estimators, final_estimator=GradientBoostingRegressor()),
)

In [134]:
# 5-fold cross-validation of the stacked regressor on Boston housing
# (no n_jobs given, so the default is used).
cross_val = cross_validate(model, boston.data, boston.target, cv=5)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.11285190582275391
avg score time : 0.0022696971893310545
avg test score : 0.3289690307838619


#### 회귀식 시각화

### 스택 분류

In [135]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB

In [137]:
# Named first-level classifiers whose predictions feed the stacking meta-model.
# LogisticRegression gets a raised iteration cap (max_iter=10000).
estimators = [
    ("logistic", LogisticRegression(max_iter=10000)),
    ("svc", SVC()),
    ("naive", GaussianNB()),
]

In [139]:
# Stack the first-level classifiers under a random-forest meta-classifier.
# Unlike the regression example, no scaler is put in front of the stack.
clf = StackingClassifier(estimators=estimators, final_estimator=RandomForestClassifier())

In [140]:
# 5-fold cross-validation of the stacked classifier on iris
# (no n_jobs given, so the default is used).
cross_val = cross_validate(clf, iris.data, iris.target, cv=5)

# Print the averaged fit time, score time and test score.
for label, key in (
    ("avg fit time :", "fit_time"),
    ("avg score time :", "score_time"),
    ("avg test score :", "test_score"),
):
    print(label, cross_val[key].mean())

avg fit time : 0.2572622776031494
avg score time : 0.008333349227905273
avg test score : 0.9733333333333334


#### 결정 경계 시각화