<a href="https://colab.research.google.com/github/gear-patt/Machine-Learning-Templates/blob/main/EnsembleMethod_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
df = pd.read_csv('/content/drive/MyDrive/Datasets/KU ML datasets/01-census-income-all.csv')
df.rename(columns={'captial-gain': 'capital-gain'},inplace=True)
X = df[ ['age', 'edu num','capital-gain', 'marital status', 'occupation', 'sex',

       'capital-loss', 'hours-per-week' ] ]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
transformer = make_column_transformer(
    ( OneHotEncoder(), ['marital status', 'occupation', 'sex']),
    remainder='passthrough')
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

## Voting Classifier

There are 2 kinds of voting, which are hard(base on number of votes) and soft(based on probability)

In [14]:
from sklearn.ensemble import VotingClassifier

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

- decision_tree = {'max_depth': 10, 'max_leaf_nodes': 1000}
- knn = {'n_neighbors': 45}
- logistic_regression = {max_iter=5000, penalty='none'}

In [16]:
voting = VotingClassifier(
      estimators=[
                  ('knn', KNeighborsClassifier(n_neighbors=45)),
                  ('tree', DecisionTreeClassifier(max_depth= 10, max_leaf_nodes= 1000)),
                  ('log_reg', LogisticRegression(max_iter=5000, penalty='none'))
      ]
)

In [17]:
voting.fit(X_train_transformed, y_train)

VotingClassifier(estimators=[('knn', KNeighborsClassifier(n_neighbors=45)),
                             ('tree',
                              DecisionTreeClassifier(max_depth=10,
                                                     max_leaf_nodes=1000)),
                             ('log_reg',
                              LogisticRegression(max_iter=5000,
                                                 penalty='none'))])

In [18]:
voting.score(X_train_transformed, y_train)

0.8694494919765567

In [19]:
X_test_transformed = transformer.transform(X_test)

In [20]:
voting.score(X_test_transformed, y_test)

0.8610912068789026

In [21]:
voting.transform(X_test_transformed[:10])

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [1, 0, 0],
       [0, 0, 0]])

## Bagging

In [22]:
from sklearn.ensemble import BaggingClassifier

In [33]:
bag = BaggingClassifier(DecisionTreeClassifier(max_depth=10),
                        n_estimators=100, oob_score=True)

In [34]:
bag.fit(X_train_transformed, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(max_depth=10),
                  n_estimators=100, oob_score=True)

In [35]:
bag.oob_score_ # average score of all estimators

0.8612085071532772

In [36]:
bag.score(X_train_transformed, y_train)

0.8723159214803061

In [37]:
bag.score(X_test_transformed, y_test)

0.8627290408434845

In [38]:
bag.estimators_

[DecisionTreeClassifier(max_depth=10, random_state=1803170913),
 DecisionTreeClassifier(max_depth=10, random_state=729787743),
 DecisionTreeClassifier(max_depth=10, random_state=686141773),
 DecisionTreeClassifier(max_depth=10, random_state=1448149898),
 DecisionTreeClassifier(max_depth=10, random_state=356419332),
 DecisionTreeClassifier(max_depth=10, random_state=65127307),
 DecisionTreeClassifier(max_depth=10, random_state=951749019),
 DecisionTreeClassifier(max_depth=10, random_state=242989296),
 DecisionTreeClassifier(max_depth=10, random_state=1566035936),
 DecisionTreeClassifier(max_depth=10, random_state=1790548356),
 DecisionTreeClassifier(max_depth=10, random_state=1366865827),
 DecisionTreeClassifier(max_depth=10, random_state=1005243904),
 DecisionTreeClassifier(max_depth=10, random_state=1840406553),
 DecisionTreeClassifier(max_depth=10, random_state=1448413207),
 DecisionTreeClassifier(max_depth=10, random_state=417120720),
 DecisionTreeClassifier(max_depth=10, random_sta

## Random Forest

In [39]:
from sklearn.ensemble import RandomForestClassifier

You should tune the hyperparamters.

In [47]:
rf = RandomForestClassifier(n_estimators=1000, max_depth=5, oob_score=True)

In [48]:
rf.fit(X_train_transformed, y_train)

RandomForestClassifier(max_depth=5, n_estimators=1000, oob_score=True)

In [49]:
rf.oob_score_

0.8506641414787705

In [50]:
rf.score(X_train_transformed, y_train)

0.8512271901312927

In [51]:
rf.score(X_test_transformed, y_test)

0.8479885351622479

## AdaBoost

In [52]:
from sklearn.ensemble import AdaBoostClassifier

In [93]:
adaboost = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=50,
    learning_rate=0.7
)

In [94]:
adaboost.fit(X_train_transformed, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3),
                   learning_rate=0.7)

In [95]:
adaboost.score(X_train_transformed, y_train)

0.875643027154301

In [96]:
adaboost.estimator_errors_

array([0.1551199 , 0.35177064, 0.40138927, 0.42757052, 0.4723369 ,
       0.47371178, 0.44667695, 0.48379725, 0.47583135, 0.47977931,
       0.4776482 , 0.47748546, 0.4781689 , 0.47606415, 0.48727218,
       0.49265825, 0.49633769, 0.49563011, 0.46558695, 0.48290334,
       0.49698762, 0.49593189, 0.49756113, 0.48760543, 0.49061179,
       0.49812064, 0.47990152, 0.49752233, 0.48899426, 0.4844982 ,
       0.49562558, 0.49112848, 0.49321606, 0.49272025, 0.49556209,
       0.49514808, 0.49600353, 0.49355967, 0.49383291, 0.49643937,
       0.49928493, 0.48778828, 0.49178926, 0.48443403, 0.49295973,
       0.49854637, 0.49899638, 0.49442573, 0.49881198, 0.4864673 ])

In [97]:
adaboost.score(X_test_transformed, y_test)

0.8696898351929573

Even if the hyperparameters are not tuned, the model performance is still better than others.

## Gradient Boosting

In [99]:
from sklearn.ensemble import GradientBoostingClassifier

In [100]:
gbst = GradientBoostingClassifier()

In [101]:
gbst.fit(X_train_transformed, y_train)

GradientBoostingClassifier()

In [102]:
gbst.score(X_train_transformed, y_train)

0.8685025465154966

In [103]:
gbst.score(X_test_transformed, y_test)

0.8679496366055891

## Stacking

In [104]:
from sklearn.ensemble import StackingClassifier

In [105]:
stacking = StackingClassifier(
      estimators=[
                  ('knn', KNeighborsClassifier(n_neighbors=45)),
                  ('tree', DecisionTreeClassifier(max_depth= 10, max_leaf_nodes= 1000)),
                  ('log_reg', LogisticRegression(max_iter=5000, penalty='none'))
      ],
      final_estimator=RandomForestClassifier(),
      cv=5
)

In [106]:
stacking.fit(X_train_transformed, y_train)

StackingClassifier(cv=5,
                   estimators=[('knn', KNeighborsClassifier(n_neighbors=45)),
                               ('tree',
                                DecisionTreeClassifier(max_depth=10,
                                                       max_leaf_nodes=1000)),
                               ('log_reg',
                                LogisticRegression(max_iter=5000,
                                                   penalty='none'))],
                   final_estimator=RandomForestClassifier())

In [107]:
stacking.score(X_train_transformed, y_train)

0.8591354643871727

In [108]:
stacking.score(X_test_transformed, y_test)

0.8461459719520934