# ABOUT:
- this notebook:
    - compared voting classifier and stacked generalisation against base estimators
        1. Soft voting with voting classifier will combine estimators by taking the **weighted average probabilities**
        2. Stacked generalization trains a final estimator using the **predictions of base estimators as input**
- insight:
    - the xgboost base estimator outperformed both stacking and voting: **stacking and voting do not always yield gains**

In [1]:
import pandas as pd
# Read the training CSV, using respondent_id as the row index.
train = pd.read_csv(
    r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\data\cleaned_train_set.csv",
    index_col="respondent_id",
)
target_labels = ['h1n1_vaccine', 'seasonal_vaccine']
# Features are every column that is not one of the target labels.
feature_mask = ~train.columns.isin(target_labels)
X = train.loc[:, feature_mask]
# Only the second label is modelled in this notebook.
y = train[target_labels[1]]
y.name

'seasonal_vaccine'

In [2]:
# train_test_split was never imported explicitly in this notebook, so this cell
# raised NameError on a fresh kernel (it presumably relied on a lazy
# auto-import helper being active -- TODO confirm). Import it here so that
# Restart & Run All works.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; stratify on y so both splits keep
# the same class balance, and fix the seed so the split is reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)

<IPython.core.display.Javascript object>

### declare base estimators
- they are using the best parameters found in optuna

In [3]:
from catboost import CatBoostClassifier
# CatBoost hyper-parameters for the seasonal_vaccine model.
cv_seasonal_best_params = dict(
    learning_rate=0.01,
    iterations=3000,
    depth=4,
    rsm=0.93789016484649,
    l2_leaf_reg=7.847914167208884,
    auto_class_weights='SqrtBalanced',
    loss_function="Logloss",
    verbose=False,
    task_type="CPU",
    eval_metric="AUC",
)
# Build an unconfigured classifier, then apply the parameters above.
cb = CatBoostClassifier()
cb.set_params(**cv_seasonal_best_params)

<catboost.core.CatBoostClassifier at 0x25195fb6d30>

In [4]:
from xgboost import XGBClassifier
# XGBoost hyper-parameters for the seasonal_vaccine model.
xgb_seasonal_best_params = dict(
    learning_rate=0.014239918514545242,
    max_depth=5,
    reg_alpha=1,
    reg_lambda=5,
    min_child_weight=2,
    gamma=0,
    colsample_bytree=0.43000000000000005,
    subsample=0.71,
    n_estimators=962,
    eval_metric='auc',
)
# Build an unconfigured classifier, then apply the parameters above.
xgb = XGBClassifier()
xgb.set_params(**xgb_seasonal_best_params)

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=0.43000000000000005,
              eval_metric='auc', gamma=0, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=0.014239918514545242,
              max_delta_step=None, max_depth=5, min_child_weight=2, missing=nan,
              monotone_constraints=None, n_estimators=962, n_jobs=None,
              num_parallel_tree=None, random_state=None, reg_alpha=1,
              reg_lambda=5, scale_pos_weight=None, subsample=0.71,
              tree_method=None, validate_parameters=None, verbosity=None)

# Voting Classifier

In [5]:
from sklearn.ensemble import VotingClassifier

In [6]:
# Soft-voting ensemble over the two boosted-tree models.
voting_members = [("cb", cb), ("xgb", xgb)]
vc = VotingClassifier(
    estimators=voting_members,
    voting="soft",
    weights=None,  # every estimator gets the same weight
    n_jobs=-1,
)

In [7]:
from sklearn.metrics import roc_auc_score
# Hold-out ROC AUC per model, keyed by display name.
scores = {}
candidates = {
    "catboost": cb,
    "xgboost": xgb,
    "voting classifier": vc,
}
for name, model in candidates.items():
    # Fit on the training split, then score the second predict_proba column.
    model.fit(train_x, train_y)
    y_pred = model.predict_proba(test_x)
    scores[name] = roc_auc_score(test_y, y_pred[:, 1])



### results
- xgboost wins; the voting classifier is a close second, ahead of catboost

In [8]:
# Rank the models scored so far, highest AUC first.
auc_by_model = pd.Series(scores)
auc_by_model.sort_values(ascending=False)

xgboost              0.871051
voting classifier    0.870871
catboost             0.869629
dtype: float64

# StackingClassifier
- the plan was to try two different final estimators to see if it matters; only the logistic regression final estimator (`sc2`) is shown below

In [9]:
from sklearn.linear_model import LogisticRegression
# Meta-learner passed to the stacking ensemble below; constructed with no
# arguments, i.e. the library's default settings.
final_estimator = LogisticRegression()

In [11]:
from sklearn.ensemble import StackingClassifier
# Stacked ensemble: the base estimators' predict_proba outputs become the
# input features of the final estimator.
stack_members = [("cb", cb), ("xgb", xgb)]
sc2 = StackingClassifier(
    estimators=stack_members,
    final_estimator=final_estimator,
    stack_method="predict_proba",
    n_jobs=-1,
)

In [12]:
# Fit the stacked ensemble and score it on the same hold-out split.
y_pred = sc2.fit(train_x, train_y).predict_proba(test_x)
stacking_auc = roc_auc_score(test_y, y_pred[:, 1])
scores["Stacking_final_logistic"] = stacking_auc

# Results
- **stacking and voting did not improve performance** here, xgboost was best performing

In [13]:
# Final ranking across base models, voting and stacking, highest AUC first.
auc_by_model_all = pd.Series(scores)
auc_by_model_all.sort_values(ascending=False)

xgboost                    0.871051
Stacking_final_logistic    0.870937
voting classifier          0.870871
catboost                   0.869629
dtype: float64

### export

In [14]:
import pickle
# Persist both fitted ensembles. The original passed bare open(...) handles to
# pickle.dump and never closed them; the `with` blocks guarantee each file is
# flushed and closed, even if pickling raises.
with open(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\seasonal_vaccine_stacking.pkl", 'wb') as stacking_file:
    pickle.dump(sc2, stacking_file)
with open(r"C:\Users\tanch\Documents\Coding Competitions\DataDriven\Flu Shot Learning\local\model\seasonal_vaccine_voting.pkl", 'wb') as voting_file:
    pickle.dump(vc, voting_file)

