# Gradient Boosting

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


# Load the breast cancer dataset as plain feature / label arrays.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Hold out 30% of the samples as the test set, stratified on the labels ...
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=123,
    stratify=y,
)

# ... then split the remainder 80/20 into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp,
    test_size=0.2,
    random_state=123,
    stratify=y_temp,
)

print('Train/Valid/Test sizes:', len(y_train), len(y_valid), len(y_test))

Train/Valid/Test sizes: 318 80 171


## Original gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Classic scikit-learn gradient boosting: 100 trees of depth 8, shrinkage 0.1.
boost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=8,
    random_state=1,
)
boost.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % boost.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.90
Test Accuracy: 0.92


## HistGradientBoostingClassifier (inspired by LightGBM)

In [None]:
#from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier

# Histogram-based gradient boosting; only the learning rate and the seed are
# set explicitly, every other hyperparameter keeps its default.
boost = HistGradientBoostingClassifier(learning_rate=0.1, random_state=1)
boost.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % boost.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.96
Test Accuracy: 0.97


```
import numpy as np
import xgboost as xgb


# Wrap the arrays in XGBoost's native data container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)


param = {
    'max_depth': 8,
    'eta': 0.1,  # learning rate
    'objective': 'multi:softprob',  # softmax loss; predict() yields one probability per class
    # Derive the class count from the labels instead of hard-coding it
    # (the breast cancer data used here has 2 classes, not 3).
    'num_class': int(np.unique(y_train).size)}

boost = xgb.train(param, dtrain, num_boost_round=100)

# One row of class probabilities per sample -> pick the most probable class.
y_pred = boost.predict(dtest)
y_labels = np.argmax(y_pred, axis=1)


print("Test Accuracy: %0.2f" % (y_labels == y_test).mean())
```

## XGBoost

In [None]:
# https://xgboost.readthedocs.io/en/latest/build.html

In [None]:
#!pip install xgboost

In [None]:
import numpy as np
import xgboost as xgb


# XGBoost through its scikit-learn compatible wrapper, all defaults.
boost = xgb.XGBClassifier()
boost.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % boost.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.95
Test Accuracy: 0.98


## LightGBM

In [None]:
# https://lightgbm.readthedocs.io/en/latest/Installation-Guide.html
# conda install -c conda-forge lightgbm

In [None]:
import lightgbm as lgb


# LightGBM through its scikit-learn compatible wrapper, all defaults.
boost = lgb.LGBMClassifier()
boost.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % boost.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.97


## CatBoost

In [None]:
# https://catboost.ai
# conda install -c conda-forge catboost

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
from catboost import CatBoostClassifier


# CatBoost with default hyperparameters; verbose=0 silences the training log.
boost = CatBoostClassifier(verbose=0)
boost.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % boost.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.97
Test Accuracy: 0.98


## Ensemble methods

In [None]:
import pandas as pd
import numpy as np

# Demo Notebook Illustrating How To Use Common Gradient Boosting Implementations With Categorical Data

# Dataset Loading

- To keep things simple, we will be using the Titanic dataset. Consequently, please don't overinterpret the predictive performance values. This is intended more as a technical demo/reference for how to use categorical support than as a guide to achieving good predictive performance.
- Titanic dataset reference: https://www.openml.org/d/40945

In [None]:
import os

# Location of the Titanic CSV. The default is the Colab upload path; set the
# TITANIC_CSV environment variable to read the file from somewhere else.
titanic_csv_path = os.environ.get('TITANIC_CSV', '/content/titanic.csv')

data = pd.read_csv(titanic_csv_path, sep=',')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Remove columns and rows with missing values

In [None]:
# Count the missing values in each column of the raw table.
data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [None]:
# Discard the 'Age' and 'Cabin' columns, which hold most of the missing values.
df = data.drop(columns=['Age', 'Cabin'])
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [None]:
# Remove every remaining row that still has at least one missing value.
df = df.dropna(axis='index')

In [None]:
# Re-check the per-column missing-value counts on the cleaned table.
df.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [None]:
# Explicitly store the fare column as floating point.
df['Fare'] = df['Fare'].astype(float)

### Convert to array format

- To keep things simple, we will only use a few columns in this dataset:

In [None]:
# Show the column names available for feature selection.
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [None]:
# The subset of columns used as model inputs.
feature_names = ['Pclass', 'Sex', 'Fare', 'Embarked']

# Target labels and selected features as NumPy arrays.
y = df['Survived'].values
X = df[feature_names].values

# Peek at the first ten feature rows.
X[:10]

array([[3, 'male', 7.25, 'S'],
       [1, 'female', 71.2833, 'C'],
       [3, 'female', 7.925, 'S'],
       [1, 'female', 53.1, 'S'],
       [3, 'male', 8.05, 'S'],
       [3, 'male', 8.4583, 'Q'],
       [1, 'male', 51.8625, 'S'],
       [3, 'male', 21.075, 'S'],
       [3, 'female', 11.1333, 'S'],
       [2, 'female', 30.0708, 'C']], dtype=object)

In [None]:
# Number of samples per label value (position in the result = label).
np.bincount(y)

array([549, 340])

In [None]:
# Distinct values of the passenger-class column.
df['Pclass'].unique()

array([3, 1, 2])

In [None]:
# Distinct values of the 'Sex' column.
df['Sex'].unique()

array(['male', 'female'], dtype=object)

In [None]:
# Distinct values of the 'Embarked' column.
df['Embarked'].unique()

array(['S', 'C', 'Q'], dtype=object)

- In this dataset, `'sex'` is a binary variable with only two values, so using categorical or onehot encoding is not necessary. However, we will do it anyway for demo purposes.
- Here, `'embarked'` is a categorical variable with 3 possible values.

### Onehot encoder pipeline

- The OneHot encoder pipeline encodes `'sex'` and `'embarked'` into a onehot-encoded form. The remaining features remain unchanged.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


# Columns to one-hot encode; the first level of each is dropped.
ohe_features = ['Sex', 'Embarked']
ohe_transformer = make_pipeline(OneHotEncoder(drop='first'))

# Encode the categorical columns and pass all remaining columns through as-is.
ohe_preprocessor = ColumnTransformer(
    [('ohe', ohe_transformer, ohe_features)],
    remainder='passthrough',
)

# Fit on the selected feature columns and display the encoded matrix.
ohe_preprocessor.fit_transform(df[feature_names])

array([[ 1.    ,  0.    ,  1.    ,  3.    ,  7.25  ],
       [ 0.    ,  0.    ,  0.    ,  1.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  3.    ,  7.925 ],
       ...,
       [ 0.    ,  0.    ,  1.    ,  3.    , 23.45  ],
       [ 1.    ,  0.    ,  0.    ,  1.    , 30.    ],
       [ 1.    ,  1.    ,  0.    ,  3.    ,  7.75  ]])

In [None]:
# Names of the output columns produced by the fitted one-hot preprocessor.
ohe_preprocessor.get_feature_names_out()

array(['ohe__Sex_male', 'ohe__Embarked_Q', 'ohe__Embarked_S',
       'remainder__Pclass', 'remainder__Fare'], dtype=object)

### Ordinal/Categorical encoder pipeline

- This pipeline will convert the string encoding of `'sex'` and `'embarked'` into an integer format.

In [None]:
from sklearn.preprocessing import OrdinalEncoder


# Columns whose string values are mapped to integer codes.
cat_features = ['Sex', 'Embarked']
cat_transformer = make_pipeline(OrdinalEncoder())

# Integer-encode the categorical columns and pass the other columns through as-is.
cat_preprocessor = ColumnTransformer(
    [('cat', cat_transformer, cat_features)],
    remainder='passthrough',
)

# Fit on the selected feature columns and display the encoded matrix.
cat_preprocessor.fit_transform(df[feature_names])

array([[ 1.    ,  2.    ,  3.    ,  7.25  ],
       [ 0.    ,  0.    ,  1.    , 71.2833],
       [ 0.    ,  2.    ,  3.    ,  7.925 ],
       ...,
       [ 0.    ,  2.    ,  3.    , 23.45  ],
       [ 1.    ,  0.    ,  1.    , 30.    ],
       [ 1.    ,  1.    ,  3.    ,  7.75  ]])

- Note that feature index 0 corresponds to `'sex'`, and feature index 1 to `'embarked'`

### Train/Valid/Test splits

- Next, we are splitting the dataset into the usual subsets.

In [None]:
from sklearn.model_selection import train_test_split


# Hold out 20% of the rows for testing, stratified on the survival label ...
df_X_temp, df_X_test, df_y_temp, df_y_test = train_test_split(
    df[feature_names], df['Survived'],
    test_size=0.20,
    random_state=123,
    stratify=df['Survived'],
)

# ... then take 25% of the remainder (20% of all rows) for validation.
df_X_train, df_X_valid, df_y_train, df_y_valid = train_test_split(
    df_X_temp, df_y_temp,
    test_size=0.25,
    random_state=123,
    stratify=df_y_temp,
)

print('Train/Valid/Test sizes:', len(df_y_train), len(df_y_valid), len(df_y_test))

Train/Valid/Test sizes: 533 178 178


## Performance Baselines

### Majority class prediction

In [None]:
# Majority-class baseline: always predict the most frequent label.
# The majority label is taken from the TRAINING labels (the test labels must
# not influence the "model"); accuracy is then measured on the test labels.
majority_class = np.bincount(df_y_train).argmax()
baseline_accuracy = np.mean(df_y_test == majority_class)
print(f'Test accuracy: {100 * baseline_accuracy:.2f}%')

Test accuracy: 61.80%


### Decision Tree (Onehot)

In [None]:
from sklearn.tree import DecisionTreeClassifier


# A single decision tree fed with one-hot encoded categories.
tree = DecisionTreeClassifier(random_state=123)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, tree).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 93.25%
Validation Accuracy: 79.78%
Test Accuracy: 78.65%


### Decision Tree (Ordinal)

- "Ordinal" means that the `'embarked'` variable is treated as an ordinal variable due to the integer encoding.

In [None]:
from sklearn.tree import DecisionTreeClassifier


# A single decision tree fed with integer-encoded categories.
tree = DecisionTreeClassifier(random_state=123)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, tree).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 93.25%
Validation Accuracy: 78.65%
Test Accuracy: 76.97%


## Original gradient boosting (Onehot)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Classic gradient boosting (100 trees, depth 4) on one-hot encoded categories.
boost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=1,
)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 91.74%
Validation Accuracy: 77.53%
Test Accuracy: 79.21%


## Original gradient boosting (Ordinal)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier


# Classic gradient boosting (100 trees, depth 4) on integer-encoded categories.
boost = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=1,
)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 91.56%
Validation Accuracy: 77.53%
Test Accuracy: 76.97%


## HistGradientBoostingClassifier (Onehot)

In [None]:
#from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier


# Histogram-based gradient boosting on one-hot encoded categories.
boost = HistGradientBoostingClassifier(learning_rate=0.1, random_state=1)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 90.62%
Validation Accuracy: 78.65%
Test Accuracy: 80.34%


## HistGradientBoostingClassifier (Ordinal)

In [None]:
# Histogram-based gradient boosting on integer-encoded categories; the model
# is not told that the first two columns are categorical.
boost = HistGradientBoostingClassifier(learning_rate=0.1, random_state=1)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 90.06%
Validation Accuracy: 78.09%
Test Accuracy: 79.78%


## HistGradientBoostingClassifier (Categorical)

- In contrast to the "ordinal" control above, the "categorical" section shows how to use the built-in support for categorical variables.

In [None]:
# Columns 0 and 1 of the preprocessed matrix hold the integer codes for
# 'sex' and 'embarked'; declare them as categorical for the booster.
boost = HistGradientBoostingClassifier(
    categorical_features=[0, 1],
    learning_rate=0.1,
    random_state=1,
)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 90.06%
Validation Accuracy: 78.65%
Test Accuracy: 80.34%


## XGBoost (Onehot)

In [None]:
import numpy as np
import xgboost as xgb


# Default XGBoost classifier on one-hot encoded categories.
boost = xgb.XGBClassifier()

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.43%
Validation Accuracy: 76.40%
Test Accuracy: 76.97%


## XGBoost (Ordinal)

In [None]:
# Default XGBoost classifier on integer-encoded categories.
boost = xgb.XGBClassifier()

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.43%
Validation Accuracy: 76.97%
Test Accuracy: 77.53%


## LightGBM (Onehot)

In [None]:
import lightgbm as lgb


# Default LightGBM classifier on one-hot encoded categories.
boost = lgb.LGBMClassifier()

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 88.56%
Validation Accuracy: 77.53%
Test Accuracy: 78.65%


## LightGBM (Ordinal)

In [None]:
# Default LightGBM classifier on integer-encoded categories.
boost = lgb.LGBMClassifier()

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 88.93%
Validation Accuracy: 76.97%
Test Accuracy: 78.65%


## LightGBM (Categorical)

- In contrast to the "ordinal" control above, the "categorical" section shows how to use the built-in support for categorical variables.

In [None]:
# Native categorical handling: tell LightGBM which integer-encoded columns are
# categories. After `cat_preprocessor`, column 0 is 'Sex' and column 1 is
# 'Embarked'. The indices are handed to `fit` rather than to the constructor,
# because the constructor form only triggers a LightGBM warning asking for the
# parameter to be supplied together with the data.
boost = lgb.LGBMClassifier()

clf_pipe = make_pipeline(cat_preprocessor, boost)
# `<step name>__<argument>` routes a fit argument to one pipeline step;
# make_pipeline names the final step after its lower-cased class name.
clf_pipe.fit(df_X_train, df_y_train,
             lgbmclassifier__categorical_feature=[0, 1])

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 89.12%
Validation Accuracy: 76.97%
Test Accuracy: 79.21%


Please use categorical_feature argument of the Dataset constructor to pass this parameter.


## CatBoost (Onehot)

In [None]:
from catboost import CatBoostClassifier


# Default CatBoost classifier (training log silenced) on one-hot encoded categories.
boost = CatBoostClassifier(verbose=0)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(ohe_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 87.24%
Validation Accuracy: 77.53%
Test Accuracy: 78.65%


## CatBoost (Ordinal)

In [None]:
from catboost import CatBoostClassifier


# Default CatBoost classifier (training log silenced) on integer-encoded categories.
boost = CatBoostClassifier(verbose=0)

# Pipeline.fit returns the fitted pipeline, so build and fit in one expression.
clf_pipe = make_pipeline(cat_preprocessor, boost).fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*clf_pipe.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*clf_pipe.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*clf_pipe.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 88.37%
Validation Accuracy: 78.09%
Test Accuracy: 78.09%


## CatBoost (Categorical)

- In contrast to the "ordinal" control above, the "categorical" section shows how to use the built-in support for categorical variables.

In [None]:
# No preprocessing pipeline in this cell: CatBoost is fitted directly on the
# DataFrame splits and is given the names of the categorical columns.
boost = CatBoostClassifier(
    cat_features=['Sex', 'Embarked'],
    verbose=0,
)
boost.fit(df_X_train, df_y_train)

print(f"Training Accuracy: {100*boost.score(df_X_train, df_y_train):0.2f}%")
print(f"Validation Accuracy: {100*boost.score(df_X_valid, df_y_valid):0.2f}%")
print(f"Test Accuracy: {100*boost.score(df_X_test, df_y_test):0.2f}%")

Training Accuracy: 85.37%
Validation Accuracy: 78.09%
Test Accuracy: 76.40%


# Random Forests

In [None]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import datasets


# Reload the breast cancer data; this rebinds `data`, `X` and `y`.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# 70/30 split into a temporary pool and the test set, stratified on the labels.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123, stratify=y)

# 80/20 split of the pool into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, test_size=0.2, random_state=123, stratify=y_temp)

print('Train/Valid/Test sizes:', len(y_train), len(y_valid), len(y_test))

Train/Valid/Test sizes: 318 80 171


In [None]:
from sklearn.ensemble import RandomForestClassifier


# Random forest with 100 trees and a fixed seed.
forest = RandomForestClassifier(n_estimators=100, random_state=1)
forest.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % forest.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.95
Test Accuracy: 0.98


In [None]:
from sklearn.ensemble import ExtraTreesClassifier


# Extremely randomized trees with 100 estimators and a fixed seed.
forest = ExtraTreesClassifier(n_estimators=100, random_state=1)
forest.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % forest.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 1.00
Test Accuracy: 0.98


# Stacking

## Dataset

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Breast cancer data as feature matrix and label vector.
data = datasets.load_breast_cancer()
X = data.data
y = data.target

# Stratified 70/30 split: temporary pool vs. held-out test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=123)

# Stratified 80/20 split of the pool: training vs. validation set.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_temp, y_temp, stratify=y_temp, test_size=0.2, random_state=123)

print('Train/Valid/Test sizes:', len(y_train), len(y_valid), len(y_test))

Train/Valid/Test sizes: 318 80 171


## Stacking Classifier from scikit-learn (also includes CV)

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import StackingClassifier


# Base (level-0) learners.
clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(random_state=123)
clf4 = AdaBoostClassifier(random_state=123)
clf5 = DecisionTreeClassifier(max_depth=None, random_state=123)

# Meta (level-1) learner that combines the base learners' predictions.
lr = LogisticRegression(random_state=123)

# (name, estimator) pairs in the form StackingClassifier expects.
estimators = list(zip(
    ('clf1', 'clf2', 'clf3', 'clf4', 'clf5'),
    (clf1, clf2, clf3, clf4, clf5),
))

# cv=10: 10-fold cross-validation is used when fitting the stack.
sclf = StackingClassifier(
    estimators=estimators,
    final_estimator=lr,
    cv=10,
)
sclf.fit(X_train, y_train)

# Report mean accuracy on each of the three splits.
for fmt, X_part, y_part in (
    ("Training Accuracy: %0.2f", X_train, y_train),
    ("Validation Accuracy: %0.2f", X_valid, y_valid),
    ("Test Accuracy: %0.2f", X_test, y_test),
):
    print(fmt % sclf.score(X_part, y_part))

Training Accuracy: 1.00
Validation Accuracy: 0.99
Test Accuracy: 0.98
