In [None]:
# Reload imported modules automatically before each cell execution (mode 2 = all modules).
%load_ext autoreload
%autoreload 2
# Auto-format code cells with black. Both extensions are loaded here;
# presumably nb_black targets the classic Notebook and lab_black targets JupyterLab — TODO confirm both are needed.
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp model_pipeline

<IPython.core.display.Javascript object>

# Model Pipeline

The functionality below uses the `Dataset`, `PreProcessor`, `Model` and `PostProcessor` objects to easily propagate
data and generate predictions that are ready to submit.

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
from typing import List, Optional

from rich import print as rich_print
from tqdm.auto import tqdm
from typeguard import typechecked

from numerai_blocks.dataset import Dataset, create_dataset
from numerai_blocks.model import BaseModel, ConstantModel, RandomModel
from numerai_blocks.postprocessing import FeatureNeutralizer, MeanEnsembler
from numerai_blocks.preprocessing import (
    BaseProcessor,
    CopyPreProcessor,
    FeatureSelectionPreProcessor,
    GroupStatsPreProcessor,
)

<IPython.core.display.Javascript object>

## 1. ModelPipeline

`ModelPipeline` handles all preprocessing, model prediction and postprocessing.
After the `ModelPipeline` completes it returns a `Dataset` with the original data, metadata and added prediction columns.

In [None]:
#export
@typechecked
class ModelPipeline:
    """
    Execute all preprocessing, prediction and postprocessing for a given setup.

    Steps run in this order: (optional copy) -> preprocessors -> models -> postprocessors.
    Every step is called with the Dataset returned by the previous step.

    :param models: Initialized (!) numerai-blocks Models that add prediction columns to a given input Dataset.
    :param preprocessors: List of initialized (!) PreProcessors. No preprocessing if omitted.
    :param postprocessors: List of initialized (!) PostProcessors. No postprocessing if omitted.
    :param copy_first: Whether to copy the Dataset as a first preprocessing step.
    Highly recommended in order to avoid accidentally manipulating the original Dataset and/or DataFrame.
    :param pipeline_name: Unique name for pipeline. A random hex UUID is used if omitted or empty.
    """
    def __init__(self,
                 models: List[BaseModel],
                 preprocessors: Optional[List[BaseProcessor]] = None,
                 postprocessors: Optional[List[BaseProcessor]] = None,
                 copy_first: bool = True,
                 pipeline_name: Optional[str] = None):
        self.pipeline_name = pipeline_name if pipeline_name else uuid.uuid4().hex
        self.models = models
        self.copy_first = copy_first
        # None sentinel instead of a `[]` default: a mutable default list would be
        # shared by every ModelPipeline created without these arguments.
        self.preprocessors = [] if preprocessors is None else preprocessors
        self.postprocessors = [] if postprocessors is None else postprocessors

    def _apply_steps(self, dataset: Dataset, steps: list, stage: str, message: str) -> Dataset:
        """
        Call each step in `steps` on the Dataset, feeding the output of one step into the next.

        :param dataset: Dataset handed to the first step.
        :param steps: Callables that take a Dataset and return a Dataset.
        :param stage: Stage label shown in the progress bar (tqdm appends the ': ' separator itself).
        :param message: Rich-markup template printed before each step; '{name}' is replaced by the step's class name.
        :return: Dataset returned by the last step (the input Dataset if `steps` is empty).
        """
        for step in tqdm(steps,
                         desc=f"{self.pipeline_name} {stage}",
                         position=0):
            rich_print(message.format(name=step.__class__.__name__))
            dataset = step(dataset)
        return dataset

    def preprocess(self, dataset: Dataset) -> Dataset:
        """ Optionally copy the Dataset, then apply all preprocessors in order. """
        if self.copy_first:
            dataset = CopyPreProcessor()(dataset)
        return self._apply_steps(
            dataset, self.preprocessors, "Preprocessing",
            ":construction: Applying preprocessing: '[bold]{name}[/bold]' :construction:")

    def postprocess(self, dataset: Dataset) -> Dataset:
        """ Apply all postprocessors in order. """
        return self._apply_steps(
            dataset, self.postprocessors, "Postprocessing",
            ":construction: Applying postprocessing: '[bold]{name}[/bold]' :construction:")

    def process_models(self, dataset: Dataset) -> Dataset:
        """ Run all models in order on the Dataset. """
        return self._apply_steps(
            dataset, self.models, "Model prediction",
            ":robot: Generating model predictions with '[bold]{name}[/bold]'. :robot:")

    def pipeline(self, dataset: Dataset) -> Dataset:
        """ Process full pipeline and return resulting Dataset. """
        preprocessed_dataset = self.preprocess(dataset)
        prediction_dataset = self.process_models(preprocessed_dataset)
        processed_prediction_dataset = self.postprocess(prediction_dataset)
        # Style tags must be closed with [/...]; repeating the opening tag leaves the style active.
        rich_print(f":checkered_flag: [green]Finished pipeline:[/green] [bold blue]'{self.pipeline_name}'[/bold blue]! :checkered_flag:")
        return processed_prediction_dataset

    def __call__(self, dataset: Dataset) -> Dataset:
        """ Shorthand for `pipeline(dataset)`. """
        return self.pipeline(dataset)

<IPython.core.display.Javascript object>

In [None]:
# Example using several preprocessors, dummy models and postprocessors
model_names = ["test_0.5", "test_0.8"]

dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv", version=1)

# Preprocessing: compute group statistics, then keep only two of the resulting columns.
preprocessors = [
    GroupStatsPreProcessor(),
    FeatureSelectionPreProcessor(
        feature_cols=['feature_intelligence_mean', 'feature_intelligence_std']
    ),
]

# One constant-output dummy model per name.
models = [
    ConstantModel(constant=constant, model_name=name)
    for constant, name in zip((0.5, 0.8), model_names)
]

# Postprocessing: ensemble the model prediction columns, then neutralize the ensembled column.
postprocessors = [
    MeanEnsembler(
        cols=[f"prediction_{name}" for name in model_names],
        final_col_name='prediction_ensembled',
    ),
    FeatureNeutralizer(
        feature_names=['feature_intelligence_mean', 'feature_intelligence_std'],
        pred_name='prediction_ensembled',
        proportion=0.8,
    ),
]

<IPython.core.display.Javascript object>

In [None]:
# Assemble the pipeline from the components defined above and run it on the example Dataset.
test_pipeline = ModelPipeline(
    models=models,
    preprocessors=preprocessors,
    postprocessors=postprocessors,
    pipeline_name="test_pipeline",
)
processed_dataset = test_pipeline(dataset)

test_pipeline Preprocessing::   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Model prediction:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Postprocessing:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [None]:
# Inspect the resulting DataFrame: the selected feature columns plus the prediction columns added by the pipeline.
processed_dataset.dataf

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,target,id,era,data_type,prediction_test_0.5,prediction_test_0.8,prediction_ensembled,prediction_ensembled_neutralized_0.8
0,0.333333,0.246183,0.5,n000315175b67977,era1,train,0.5,0.8,0.65,0.0
1,0.208333,0.234359,0.25,n0014af834a96cdd,era1,train,0.5,0.8,0.65,0.36088
2,0.479167,0.327843,0.25,n001c93979ac41d4,era1,train,0.5,0.8,0.65,0.350519
3,0.416667,0.288675,0.25,n0034e4143f22a13,era1,train,0.5,0.8,0.65,0.45091
4,0.270833,0.128732,0.75,n00679d1a636062f,era1,train,0.5,0.8,0.65,0.378539
5,0.5,0.213201,0.5,n009aa2d32389eca,era1,train,0.5,0.8,0.65,0.36154
6,0.604167,0.39107,0.25,n009ef1a5fe009b6,era1,train,0.5,0.8,0.65,0.719895
7,0.770833,0.270906,0.25,n00ae5d51f55fb0f,era1,train,0.5,0.8,0.65,0.339932
8,0.625,0.291937,0.5,n00b0ac86d77aed7,era1,train,0.5,0.8,0.65,0.71071
9,0.5625,0.284545,0.75,n00c63366aeaf76a,era1,train,0.5,0.8,0.65,1.0


<IPython.core.display.Javascript object>

## 2. ModelPipelineCollection

`ModelPipelineCollection` can be used to easily manage and run multiple `ModelPipeline` objects.

TODO: Add multiprocessing support.

In [None]:
#export
@typechecked
class ModelPipelineCollection:
    """
    Execute multiple initialized ModelPipelines in a sequence.

    Pipelines are stored in a dict keyed by `pipeline_name`, in the order given.
    Every pipeline receives the same input Dataset object.

    :param pipelines: List of initialized ModelPipelines. Pipeline names must be unique.
    :raises ValueError: If two or more pipelines share the same name.
    """
    def __init__(self, pipelines: List[ModelPipeline]):
        names = [pipe.pipeline_name for pipe in pipelines]
        # A dict keyed by name would silently drop all but the last pipeline with a given name.
        duplicates = sorted({name for name in names if names.count(name) > 1})
        if duplicates:
            raise ValueError(f"Pipeline names must be unique, but found duplicates: '{duplicates}'.")
        self.pipelines = dict(zip(names, pipelines))
        self.pipeline_names = list(self.pipelines.keys())

    def process_all_pipelines(self, dataset: Dataset) -> List[Dataset]:
        """ Process all pipelines and return list of resulting Datasets (one per pipeline, in insertion order). """
        return [self.process_single_pipeline(dataset, name)
                for name in tqdm(list(self.pipelines),
                                 desc="Processing Pipeline Collection")]

    def process_single_pipeline(self, dataset: Dataset, pipeline_name: str) -> Dataset:
        """
        Run the pipeline registered under `pipeline_name` on the given Dataset.

        :param dataset: Input Dataset.
        :param pipeline_name: Name of a pipeline in this collection.
        :return: Dataset returned by the pipeline.
        """
        rich_print(f":construction_worker: [bold green]Processing model pipeline:[/bold green] '{pipeline_name}' :construction_worker:")
        pipeline = self.get_pipeline(pipeline_name)
        return pipeline(dataset)

    def get_pipeline(self, pipeline_name: str) -> ModelPipeline:
        """
        Look up a pipeline by name.

        :param pipeline_name: Name of the requested pipeline.
        :return: The ModelPipeline registered under that name.
        :raises AssertionError: If no pipeline with that name is in the collection.
        """
        # Check against the dict itself so the lookup below can never disagree with the check.
        available_pipelines = list(self.pipelines)
        assert pipeline_name in available_pipelines, f"Requested pipeline '{pipeline_name}', but only the following pipelines are in the collection: '{available_pipelines}'."
        return self.pipelines[pipeline_name]

    def __call__(self, dataset: Dataset) -> List[Dataset]:
        """ Shorthand for `process_all_pipelines(dataset)`. """
        return self.process_all_pipelines(dataset=dataset)

<IPython.core.display.Javascript object>

In [None]:
# A second, minimal pipeline with no preprocessors or postprocessors: only a RandomModel.
# copy_first is left at its default, so ModelPipeline still applies CopyPreProcessor to the input first.
test_pipeline2 = ModelPipeline(models=[RandomModel()], pipeline_name="test_pipeline2")

<IPython.core.display.Javascript object>

In [None]:
# Group both pipelines; the collection stores them keyed by pipeline_name.
collection = ModelPipelineCollection([test_pipeline, test_pipeline2])
# Sanity check: a pipeline can be retrieved by its name.
assert collection.get_pipeline("test_pipeline2").pipeline_name == 'test_pipeline2'

<IPython.core.display.Javascript object>

In [None]:
# Calling the collection runs every pipeline on the same input Dataset and returns one result Dataset per pipeline.
result_datasets = collection(dataset=dataset)

Processing Pipeline Collection:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Preprocessing::   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Model prediction:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline Postprocessing:   0%|          | 0/2 [00:00<?, ?it/s]

test_pipeline2 Preprocessing:: 0it [00:00, ?it/s]

test_pipeline2 Model prediction:   0%|          | 0/1 [00:00<?, ?it/s]

test_pipeline2 Postprocessing: : 0it [00:00, ?it/s]

<IPython.core.display.Javascript object>

The `ModelPipelineCollection` returns a list of `Dataset` objects, retaining all metadata and adding prediction columns for each. Note that the first `Dataset` went through a feature selection step, so it did not retain all columns. The second `Dataset` retained all feature columns, because its pipeline does not have a feature selection step.

In [None]:
# Result of the first pipeline in the collection ("test_pipeline").
result_datasets[0].dataf

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,target,id,era,data_type,prediction_test_0.5,prediction_test_0.8,prediction_ensembled,prediction_ensembled_neutralized_0.8
0,0.333333,0.246183,0.5,n000315175b67977,era1,train,0.5,0.8,0.65,0.0
1,0.208333,0.234359,0.25,n0014af834a96cdd,era1,train,0.5,0.8,0.65,0.36088
2,0.479167,0.327843,0.25,n001c93979ac41d4,era1,train,0.5,0.8,0.65,0.350519
3,0.416667,0.288675,0.25,n0034e4143f22a13,era1,train,0.5,0.8,0.65,0.45091
4,0.270833,0.128732,0.75,n00679d1a636062f,era1,train,0.5,0.8,0.65,0.378539
5,0.5,0.213201,0.5,n009aa2d32389eca,era1,train,0.5,0.8,0.65,0.36154
6,0.604167,0.39107,0.25,n009ef1a5fe009b6,era1,train,0.5,0.8,0.65,0.719895
7,0.770833,0.270906,0.25,n00ae5d51f55fb0f,era1,train,0.5,0.8,0.65,0.339932
8,0.625,0.291937,0.5,n00b0ac86d77aed7,era1,train,0.5,0.8,0.65,0.71071
9,0.5625,0.284545,0.75,n00c63366aeaf76a,era1,train,0.5,0.8,0.65,1.0


<IPython.core.display.Javascript object>

In [None]:
# Result of the second pipeline in the collection ("test_pipeline2").
result_datasets[1].dataf

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target,prediction_random
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5,0.117123
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25,0.940155
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25,0.843971
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25,0.48647
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75,0.888108
5,n009aa2d32389eca,era1,train,0.5,0.5,0.25,0.25,0.75,0.75,0.75,...,0.75,0.0,0.0,0.75,0.5,0.0,0.25,0.0,0.5,0.959341
6,n009ef1a5fe009b6,era1,train,0.5,0.25,0.25,0.75,1.0,1.0,1.0,...,1.0,0.5,0.5,0.75,0.5,0.5,0.5,1.0,0.25,0.34206
7,n00ae5d51f55fb0f,era1,train,0.25,1.0,1.0,0.75,1.0,0.75,0.75,...,0.25,0.75,0.75,0.0,0.25,0.75,0.5,0.25,0.25,0.128034
8,n00b0ac86d77aed7,era1,train,0.5,0.5,0.5,1.0,1.0,0.25,0.5,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,0.541538
9,n00c63366aeaf76a,era1,train,0.5,1.0,1.0,0.25,0.75,0.25,0.25,...,0.0,1.0,1.0,0.75,0.5,1.0,1.0,0.75,0.75,0.684536


<IPython.core.display.Javascript object>

Note that multiple resulting `Dataset` objects can be merged, which is convenient for ensembling applications based on multiple pipelines.

In [None]:
# Merge the two result Datasets on the 'id' column with how='outer'
# (presumably pandas-style merge semantics — verify against Dataset.merge_datasets).
combined_dataset = result_datasets[0].merge_datasets(result_datasets[1], on='id', how='outer')
# Show only the prediction columns of the merged Dataset (accessed as an attribute, not called).
combined_dataset.get_prediction_data

Unnamed: 0,prediction_test_0.5,prediction_test_0.8,prediction_ensembled,prediction_ensembled_neutralized_0.8,prediction_random
0,0.5,0.8,0.65,0.0,0.117123
1,0.5,0.8,0.65,0.36088,0.940155
2,0.5,0.8,0.65,0.350519,0.843971
3,0.5,0.8,0.65,0.45091,0.48647
4,0.5,0.8,0.65,0.378539,0.888108
5,0.5,0.8,0.65,0.36154,0.959341
6,0.5,0.8,0.65,0.719895,0.34206
7,0.5,0.8,0.65,0.339932,0.128034
8,0.5,0.8,0.65,0.71071,0.541538
9,0.5,0.8,0.65,1.0,0.684536


<IPython.core.display.Javascript object>

-----------------------------------------------------------------------------

In [None]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

# Called without arguments; the output below lists every notebook in the project as converted.
notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staker.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>