# End To End

This notebook features examples of advanced end-to-end pipelines that leverage NumerBlox's power. Consider it a testing ground for how well NumerBlox integrates with scikit-learn and associated libraries.

In [1]:
import numpy as np
from xgboost import XGBRegressor
from sklego.preprocessing import ColumnSelector
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import TransformedTargetRegressor
from sklearn.tree import DecisionTreeClassifier

from numerblox.preprocessing import GroupStatsPreProcessor
from numerblox.meta import CrossValEstimator, make_meta_pipeline
from numerblox.ensemble import NumeraiEnsemble, PredictionReducer
from numerblox.neutralizers import FeatureNeutralizer

## 0. Get data

In [2]:
from numerblox.numerframe import create_numerframe

# Load the small training sample from the test assets (five eras, per its file name).
train_path = "../tests/test_assets/train_int8_5_eras.parquet"
df = create_numerframe(train_path)

In [3]:
# Feature data and era data; both are handed to the feature neutralizer
# when fitting and predicting further down.
features = df.get_feature_data
eras = df.get_era_data

# Column names of the FNCv3 feature subset, used below for column selection.
fncv3_cols = df.get_fncv3_features.columns.tolist()

# Model inputs and a single target (multi_target=False).
X, y = df.get_feature_target_pair(multi_target=False)

## 1. Neutralized XGBoost pipeline.

In [4]:
# Extra dependencies for this notebook (uncomment to install into the kernel's environment):
# %pip install xgboost scikit-lego

In [5]:
# Preprocessing: make_union concatenates the output of the group-stats
# preprocessor (for the 'sunshine' and 'rain' groups) with the selected
# FNCv3 feature columns.
gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain'])
fncv3_selector = ColumnSelector(fncv3_cols)
preproc_pipe = make_union(gpp, fncv3_selector)

# Model: one XGBoost regressor followed by feature neutralization.
# The same `xgb` instance is placed in the pipeline, so no unused estimator
# objects are left lying around in the namespace.
xgb = XGBRegressor()
fn = FeatureNeutralizer()

# make_meta_pipeline is used so that the neutralizer can sit *after* the
# estimator; slicing with full_pipe[:-1] yields the pipeline without it.
full_pipe = make_meta_pipeline(preproc_pipe, xgb, fn)
full_pipe

In [6]:
# Train the full model. Parameters prefixed with `featureneutralizer__` are
# routed to the FeatureNeutralizer step of the pipeline.
fit_params = {
    "featureneutralizer__eras": eras,
    "featureneutralizer__features": features,
}
full_pipe.fit(X, y, **fit_params);

In [7]:
# Unneutralized predictions: drop the last step (the neutralizer) and
# predict with the remaining preprocessing + model steps.
pipe_without_neutralizer = full_pipe[:-1]
raw_preds = pipe_without_neutralizer.predict(X)
raw_preds[:5]

array([0.35509267, 0.65761864, 0.3440274 , 0.70437306, 0.56794727],
      dtype=float32)

In [8]:
# End-to-end predictions: the full pipeline, including the neutralizer,
# which receives the feature and era data at predict time.
neutralizer_inputs = {"features": features, "eras": eras}
preds = full_pipe.predict(X=X, **neutralizer_inputs)
preds[:5]

array([[0.3909212 ],
       [0.61628961],
       [0.38016289],
       [0.60651495],
       [0.62008093]])

## 2. Multiclass Classification Ensemble

In [9]:
def to_class_labels(target):
    """Scale the target by 4 and truncate to int to obtain integer class labels."""
    return (target * 4).astype(int)


def keep_predictions(predictions):
    """Return the pipeline's predictions unchanged (no mapping back to label space)."""
    return predictions


model = DecisionTreeClassifier()
# One classifier is fitted per CV split; each contributes `predict_proba` output.
crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')
# n_models matches n_splits above; n_classes=5 assumes the scaled target
# takes the five integer values 0..4 -- TODO confirm against the data.
pred_rud = PredictionReducer(n_models=3, n_classes=5)
ens2 = NumeraiEnsemble(donate_weighted=True)
pipe2 = make_pipeline(preproc_pipe, crossval1, pred_rud, ens2)

# NOTE: this rebinds `full_pipe`, replacing the pipeline from section 1.
# The target transform is intentionally one-way (labels in, reduced
# probabilities out), so `func` and `inverse_func` are not inverses of each
# other. check_inverse=False disables scikit-learn's inverse-consistency check,
# which would otherwise emit a "not strictly inverse" warning on every fit.
full_pipe = TransformedTargetRegressor(
    pipe2,
    func=to_class_labels,
    inverse_func=keep_predictions,
    check_inverse=False,
)

In [10]:
# Display the estimator built in the previous cell (rendered via its repr).
full_pipe

In [11]:
# Fit the classification ensemble, then predict on the same data.
# Era data is only supplied at predict time.
full_pipe.fit(X, y)
preds = full_pipe.predict(X, eras=eras)
preds[:5]

array([[0.05043436],
       [0.75120656],
       [0.05091699],
       [0.75168919],
       [0.25024131]])