# ABOUT:
- this notebook:
    - compared voting classifier and stacked generalisation against base estimators
        1. Soft voting with voting classifier will combine estimators by taking the **weighted average probabilities**
        2. Stacked generalization trains a final estimator using the **predictions of base estimators as input**
- findings:
    - **both voting and stacking improved performance**


In [1]:
import pandas as pd
train = pd.read_csv(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\data\cleaned_train_set.csv",index_col = "respondent_id")
target_labels = ['h1n1_vaccine', 'seasonal_vaccine']
X = train[[col for col in train.columns if col not in target_labels]]
y = train[target_labels[0]]
y.name

'h1n1_vaccine'

In [18]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, stratify =y, random_state = 0)

<IPython.core.display.Javascript object>

### instantiate base estimators

In [2]:
from catboost import CatBoostClassifier
cv_h1n1_best_params = {
    "learning_rate" : 0.01,
    "iterations" :  3000,
    'depth': 4, 
    'rsm': 0.2680717988907101, 
    'l2_leaf_reg': 5.087316180296697, 
    'auto_class_weights': 'Balanced',
    "loss_function" : "Logloss",
    "verbose":False,
    "task_type" : "CPU",
    "eval_metric":"AUC"
}
cb = CatBoostClassifier()
cb.set_params(**cv_h1n1_best_params)

<catboost.core.CatBoostClassifier at 0x213c95f7c70>

In [14]:
from xgboost import XGBClassifier
xgb_h1n1_best_params = {
    "n_estimators" : 1560,
    'learning_rate': 0.012906144911477856,
    'max_depth': 5,
    'reg_alpha': 5,
    'reg_lambda': 0,
    'min_child_weight': 2,
    'gamma': 0,
    'colsample_bytree': 0.23,
    'subsample': 0.98,
    'eval_metric': 'auc'}
xgb = XGBClassifier()
xgb.set_params(**xgb_h1n1_best_params)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.23, eval_metric='auc',
              gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.012906144911477856,
              max_delta_step=None, max_depth=5, min_child_weight=2, missing=nan,
              monotone_constraints=None, n_estimators=1560, n_jobs=None,
              num_parallel_tree=None, random_state=None, reg_alpha=5,
              reg_lambda=0, scale_pos_weight=None, subsample=0.98,
              tree_method=None, validate_parameters=None, verbosity=None)

# Voting Classifier

In [16]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators = [("cb",cb),("xgb",xgb)],
                      voting = "soft",            
                      weights = None ,             # take equal weights
                      n_jobs = -1)

In [19]:
vc.fit(train_x, train_y)

VotingClassifier(estimators=[('cb',
                              <catboost.core.CatBoostClassifier object at 0x00000213C95F7C70>),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=0.23,
                                            eval_metric='auc', gamma=0,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.012906144911477856,
                                            max_delta_step=None, max_depth=5,
                                            min_child_weight=2, missing=nan,
                                            monotone_constraints=None,
                     

In [29]:
from sklearn.metrics import roc_auc_score
scores = {}
for name,model in zip(["catboost","xgboost","voting classifier"],[cb,xgb,vc]):
    model.fit(train_x, train_y)
    y_pred = model.predict_proba(test_x)
    scores[name] = roc_auc_score(test_y,y_pred[:,1])



### results
- voting classifier wins

In [32]:
pd.Series(scores).sort_values(ascending = False)

voting classifier    0.866627
catboost             0.866046
xgboost              0.865989
dtype: float64

# StackingClassifier
- we use two different final estimators to see if it matters

In [33]:
from sklearn.linear_model import LogisticRegression
final_estimator_1 = CatBoostClassifier()
final_estimator_2 = LogisticRegression()

In [39]:
from sklearn.ensemble import StackingClassifier
sc1 = StackingClassifier(estimators = [("cb",cb),("xgb",xgb)],
                        final_estimator = final_estimator_1,
                        stack_method = "predict_proba" ,          # predict_proba is called on the base estimators as input for final estimator
                        n_jobs = -1)

In [40]:
sc1.fit(train_x, train_y)
y_pred = sc1.predict_proba(test_x)
scores["Stacking_final_catboost"] = roc_auc_score(test_y,y_pred[:,1])

Learning rate set to 0.037047
0:	learn: 0.6614184	total: 11.5ms	remaining: 11.5s
1:	learn: 0.6312739	total: 21.7ms	remaining: 10.8s
2:	learn: 0.6043413	total: 33ms	remaining: 11s
3:	learn: 0.5798449	total: 43.7ms	remaining: 10.9s
4:	learn: 0.5590726	total: 52ms	remaining: 10.4s
5:	learn: 0.5389573	total: 61ms	remaining: 10.1s
6:	learn: 0.5209302	total: 69.2ms	remaining: 9.81s
7:	learn: 0.5050373	total: 76.7ms	remaining: 9.51s
8:	learn: 0.4899189	total: 86.1ms	remaining: 9.48s
9:	learn: 0.4768073	total: 95.2ms	remaining: 9.43s
10:	learn: 0.4650668	total: 102ms	remaining: 9.21s
11:	learn: 0.4543060	total: 110ms	remaining: 9.07s
12:	learn: 0.4451108	total: 119ms	remaining: 9.01s
13:	learn: 0.4367794	total: 127ms	remaining: 8.94s
14:	learn: 0.4289944	total: 134ms	remaining: 8.82s
15:	learn: 0.4216717	total: 143ms	remaining: 8.78s
16:	learn: 0.4151146	total: 151ms	remaining: 8.74s
17:	learn: 0.4090408	total: 159ms	remaining: 8.69s
18:	learn: 0.4035384	total: 167ms	remaining: 8.63s
19:	learn

170:	learn: 0.3414698	total: 1.48s	remaining: 7.17s
171:	learn: 0.3414559	total: 1.49s	remaining: 7.17s
172:	learn: 0.3414495	total: 1.5s	remaining: 7.16s
173:	learn: 0.3414358	total: 1.51s	remaining: 7.16s
174:	learn: 0.3414120	total: 1.52s	remaining: 7.15s
175:	learn: 0.3414003	total: 1.52s	remaining: 7.14s
176:	learn: 0.3413901	total: 1.53s	remaining: 7.13s
177:	learn: 0.3413691	total: 1.54s	remaining: 7.12s
178:	learn: 0.3413509	total: 1.55s	remaining: 7.11s
179:	learn: 0.3413307	total: 1.56s	remaining: 7.1s
180:	learn: 0.3413172	total: 1.57s	remaining: 7.09s
181:	learn: 0.3412947	total: 1.58s	remaining: 7.09s
182:	learn: 0.3412723	total: 1.58s	remaining: 7.08s
183:	learn: 0.3412403	total: 1.59s	remaining: 7.07s
184:	learn: 0.3412217	total: 1.6s	remaining: 7.05s
185:	learn: 0.3412044	total: 1.61s	remaining: 7.04s
186:	learn: 0.3411805	total: 1.62s	remaining: 7.03s
187:	learn: 0.3411584	total: 1.63s	remaining: 7.02s
188:	learn: 0.3411478	total: 1.63s	remaining: 7.01s
189:	learn: 0.3

347:	learn: 0.3367002	total: 3.02s	remaining: 5.65s
348:	learn: 0.3366822	total: 3.02s	remaining: 5.64s
349:	learn: 0.3366625	total: 3.03s	remaining: 5.63s
350:	learn: 0.3366440	total: 3.04s	remaining: 5.63s
351:	learn: 0.3366146	total: 3.05s	remaining: 5.62s
352:	learn: 0.3365797	total: 3.06s	remaining: 5.61s
353:	learn: 0.3365468	total: 3.07s	remaining: 5.6s
354:	learn: 0.3365056	total: 3.08s	remaining: 5.59s
355:	learn: 0.3364907	total: 3.09s	remaining: 5.58s
356:	learn: 0.3364752	total: 3.09s	remaining: 5.57s
357:	learn: 0.3364359	total: 3.1s	remaining: 5.56s
358:	learn: 0.3364086	total: 3.11s	remaining: 5.55s
359:	learn: 0.3363818	total: 3.12s	remaining: 5.54s
360:	learn: 0.3363391	total: 3.13s	remaining: 5.54s
361:	learn: 0.3363148	total: 3.14s	remaining: 5.53s
362:	learn: 0.3362894	total: 3.15s	remaining: 5.52s
363:	learn: 0.3362752	total: 3.15s	remaining: 5.51s
364:	learn: 0.3362578	total: 3.16s	remaining: 5.5s
365:	learn: 0.3362428	total: 3.17s	remaining: 5.49s
366:	learn: 0.3

514:	learn: 0.3323181	total: 4.5s	remaining: 4.24s
515:	learn: 0.3322960	total: 4.51s	remaining: 4.23s
516:	learn: 0.3322734	total: 4.52s	remaining: 4.22s
517:	learn: 0.3322467	total: 4.53s	remaining: 4.21s
518:	learn: 0.3322102	total: 4.53s	remaining: 4.2s
519:	learn: 0.3321730	total: 4.54s	remaining: 4.19s
520:	learn: 0.3321458	total: 4.55s	remaining: 4.18s
521:	learn: 0.3321342	total: 4.56s	remaining: 4.18s
522:	learn: 0.3320857	total: 4.57s	remaining: 4.17s
523:	learn: 0.3320521	total: 4.58s	remaining: 4.16s
524:	learn: 0.3320376	total: 4.59s	remaining: 4.15s
525:	learn: 0.3320074	total: 4.59s	remaining: 4.14s
526:	learn: 0.3319747	total: 4.6s	remaining: 4.13s
527:	learn: 0.3319593	total: 4.61s	remaining: 4.12s
528:	learn: 0.3319452	total: 4.62s	remaining: 4.11s
529:	learn: 0.3319041	total: 4.63s	remaining: 4.1s
530:	learn: 0.3318671	total: 4.64s	remaining: 4.1s
531:	learn: 0.3318500	total: 4.65s	remaining: 4.09s
532:	learn: 0.3318099	total: 4.66s	remaining: 4.08s
533:	learn: 0.331

685:	learn: 0.3275648	total: 6.24s	remaining: 2.86s
686:	learn: 0.3275396	total: 6.25s	remaining: 2.85s
687:	learn: 0.3275083	total: 6.26s	remaining: 2.84s
688:	learn: 0.3274842	total: 6.27s	remaining: 2.83s
689:	learn: 0.3274464	total: 6.28s	remaining: 2.82s
690:	learn: 0.3274284	total: 6.29s	remaining: 2.81s
691:	learn: 0.3273997	total: 6.29s	remaining: 2.8s
692:	learn: 0.3273942	total: 6.3s	remaining: 2.79s
693:	learn: 0.3273895	total: 6.31s	remaining: 2.78s
694:	learn: 0.3273616	total: 6.32s	remaining: 2.77s
695:	learn: 0.3273574	total: 6.33s	remaining: 2.76s
696:	learn: 0.3273406	total: 6.34s	remaining: 2.75s
697:	learn: 0.3273028	total: 6.35s	remaining: 2.75s
698:	learn: 0.3272813	total: 6.35s	remaining: 2.74s
699:	learn: 0.3272539	total: 6.36s	remaining: 2.73s
700:	learn: 0.3272131	total: 6.37s	remaining: 2.72s
701:	learn: 0.3271880	total: 6.38s	remaining: 2.71s
702:	learn: 0.3271630	total: 6.39s	remaining: 2.7s
703:	learn: 0.3271368	total: 6.4s	remaining: 2.69s
704:	learn: 0.32

853:	learn: 0.3236424	total: 7.75s	remaining: 1.32s
854:	learn: 0.3236200	total: 7.76s	remaining: 1.31s
855:	learn: 0.3236135	total: 7.77s	remaining: 1.31s
856:	learn: 0.3235916	total: 7.78s	remaining: 1.3s
857:	learn: 0.3235528	total: 7.79s	remaining: 1.29s
858:	learn: 0.3235317	total: 7.79s	remaining: 1.28s
859:	learn: 0.3234980	total: 7.8s	remaining: 1.27s
860:	learn: 0.3234858	total: 7.81s	remaining: 1.26s
861:	learn: 0.3234645	total: 7.82s	remaining: 1.25s
862:	learn: 0.3234476	total: 7.83s	remaining: 1.24s
863:	learn: 0.3234236	total: 7.84s	remaining: 1.23s
864:	learn: 0.3233993	total: 7.84s	remaining: 1.22s
865:	learn: 0.3233786	total: 7.85s	remaining: 1.22s
866:	learn: 0.3233572	total: 7.86s	remaining: 1.21s
867:	learn: 0.3233315	total: 7.87s	remaining: 1.2s
868:	learn: 0.3233201	total: 7.88s	remaining: 1.19s
869:	learn: 0.3233013	total: 7.89s	remaining: 1.18s
870:	learn: 0.3232726	total: 7.9s	remaining: 1.17s
871:	learn: 0.3232481	total: 7.91s	remaining: 1.16s
872:	learn: 0.32

In [42]:
from sklearn.ensemble import StackingClassifier
sc2 = StackingClassifier(estimators = [("cb",cb),("xgb",xgb)],
                        final_estimator = final_estimator_2,
                        stack_method = "predict_proba" ,          # predict_proba is called on the base estimators as input for final estimator
                        n_jobs = -1)

In [43]:
sc2.fit(train_x, train_y)
y_pred = sc2.predict_proba(test_x)
scores["Stacking_final_logistic"] = roc_auc_score(test_y,y_pred[:,1])

# Results
- **voting classifier wins. although Stacking with logistic regression as final estimator performed almost well**
- using catboost as final layer worsened performance 

In [45]:
pd.Series(scores).sort_values(ascending = False)

voting classifier          0.866627
Stacking_final_logistic    0.866574
catboost                   0.866046
xgboost                    0.865989
Stacking_final_catboost    0.861464
dtype: float64

### export 

In [46]:
import pickle
pickle.dump(sc2, open(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\h1n1_vaccine_stacking.pkl", 'wb'))
pickle.dump(vc, open(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\h1n1_vaccine_voting.pkl", 'wb'))