In [None]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black
The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


<IPython.core.display.Javascript object>

In [None]:
# default_exp model_pipeline

<IPython.core.display.Javascript object>

# Model Pipeline

The functionality below uses the `Dataset`, `PreProcessor`, `Model` and `PostProcessor` objects to easily propagate
data through preprocessing, model prediction and postprocessing.

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
from typing import List, Optional

from tqdm.auto import tqdm
from typeguard import typechecked
from rich import print as rich_print

from numerai_blocks.dataset import Dataset
from numerai_blocks.preprocessing import BaseProcessor, CopyPreProcessor, display_processor_info
from numerai_blocks.model import BaseModel

<IPython.core.display.Javascript object>

## 1. ModelPipeline

`ModelPipeline` handles all preprocessing, model prediction and postprocessing.

In [None]:
#export
@typechecked
class ModelPipeline:
    """
    Execute all preprocessing, prediction and postprocessing for a given setup.

    :param model: A numerai-blocks Model that adds prediction columns to a given input Dataset.
    :param preprocessors: List of initialized (!) PreProcessors. Defaults to no preprocessors.
    :param postprocessors: List of initialized (!) PostProcessors. Defaults to no postprocessors.
    :param copy_first: Whether to copy the Dataset as a first preprocessing step.
    Highly recommended in order to avoid accidentally manipulating the original Dataset and/or DataFrame.
    :param pipeline_name: Name for display purposes. A random hex UUID is used when omitted.
    """
    def __init__(self,
                 model: BaseModel,
                 preprocessors: Optional[List[BaseProcessor]] = None,
                 postprocessors: Optional[List[BaseProcessor]] = None,
                 copy_first = True,
                 pipeline_name: Optional[str] = None):
        self.pipeline_name = pipeline_name if pipeline_name else uuid.uuid4().hex
        self.model = model
        # Store fresh lists: `None` becomes an empty list, and the caller's list
        # is never mutated by the CopyPreProcessor insertion below.
        self.preprocessors = list(preprocessors) if preprocessors else []
        if copy_first:
            self.preprocessors.insert(0, CopyPreProcessor())
        self.postprocessors = list(postprocessors) if postprocessors else []

    def preprocess(self, dataset: Dataset) -> Dataset:
        """
        Apply every preprocessor in order, feeding each one the output of the previous.

        :param dataset: Dataset to preprocess.
        :return: The Dataset returned by the last preprocessor
        (or the input itself when there are no preprocessors).
        """
        for preprocessor in tqdm(self.preprocessors,
                                 desc=f"{self.pipeline_name} Preprocessing:",
                                 position=0):
            rich_print(f":car: Applying preprocessing {preprocessor.__class__.__name__} :car:")
            dataset = preprocessor(dataset)
        return dataset

    def postprocess(self, dataset: Dataset) -> Dataset:
        """
        Apply every postprocessor in order, feeding each one the output of the previous.

        :param dataset: Dataset (with predictions) to postprocess.
        :return: The Dataset returned by the last postprocessor
        (or the input itself when there are no postprocessors).
        """
        for postprocessor in tqdm(self.postprocessors,
                                  desc=f"{self.pipeline_name} Postprocessing: ",
                                  position=0):
            # NOTE: must be `__name__`; a bare `__name` inside a class body is
            # name-mangled to `_ModelPipeline__name` and raises AttributeError.
            rich_print(f":car: Applying postprocessing {postprocessor.__class__.__name__} :car:")
            dataset = postprocessor(dataset)
        return dataset

    @display_processor_info
    def pipeline(self, dataset: Dataset) -> Dataset:
        """
        Run the full chain: preprocessing, then the model, then postprocessing.

        :param dataset: Input Dataset.
        :return: Dataset produced by the final postprocessing step.
        """
        preprocessed_dataset = self.preprocess(dataset)
        prediction_dataset = self.model(preprocessed_dataset)
        processed_prediction_dataset = self.postprocess(prediction_dataset)
        return processed_prediction_dataset

    def __call__(self, dataset: Dataset) -> Dataset:
        """ Shorthand for `self.pipeline(dataset)`. """
        return self.pipeline(dataset)

<IPython.core.display.Javascript object>

## 2. ModelPipelineCollection

`ModelPipelineCollection` wraps multiple `ModelPipelines` to easily run them in sequence.

TODO: Add multiprocessing support?

In [None]:
#export
@typechecked
class ModelPipelineCollection:
    """
    Execute multiple initialized ModelPipelines in a sequence.

    Pipelines are stored by their `pipeline_name` and run in the order given.
    The Dataset returned by one pipeline is passed as input to the next one.

    :param pipelines: List of initialized ModelPipelines. Names must be unique.
    :raises ValueError: If two or more pipelines share the same `pipeline_name`.
    """
    def __init__(self, pipelines: List[ModelPipeline]):
        names = [pipe.pipeline_name for pipe in pipelines]
        # A name-keyed dict would silently drop all but the last pipeline with
        # a given name, so reject duplicates up front.
        seen, duplicates = set(), set()
        for name in names:
            if name in seen:
                duplicates.add(name)
            seen.add(name)
        if duplicates:
            raise ValueError(
                f"Pipeline names must be unique, but found duplicates: '{sorted(duplicates)}'."
            )
        self.pipelines = dict(zip(names, pipelines))
        self.pipeline_names = names

    def process_all_pipelines(self, dataset: Dataset) -> Dataset:
        """
        Run every pipeline in the collection, in insertion order.

        :param dataset: Input Dataset for the first pipeline.
        :return: Dataset returned by the last pipeline
        (or the input itself when the collection is empty).
        """
        for name in tqdm(self.pipelines.keys(),
                         desc="Processing Pipeline Collection"):
            dataset = self.process_single_pipeline(dataset, name)
        return dataset

    def process_single_pipeline(self, dataset: Dataset, pipeline_name: str) -> Dataset:
        """
        Run one pipeline from the collection on the given Dataset.

        :param dataset: Input Dataset.
        :param pipeline_name: Name of the pipeline to run.
        :return: Dataset returned by the selected pipeline.
        """
        rich_print(f":construction_worker: [bold green]Processing model pipeline:[/bold green] '{pipeline_name}' :construction_worker:")
        pipeline = self.get_pipeline(pipeline_name)
        dataset = pipeline(dataset)
        return dataset

    def get_pipeline(self, pipeline_name: str) -> ModelPipeline:
        """
        Look up a pipeline by name.

        :param pipeline_name: Name of the requested pipeline.
        :return: The ModelPipeline registered under that name.
        :raises AssertionError: If the name is not in `self.pipeline_names`.
        """
        available_pipelines = self.pipeline_names
        assert pipeline_name in available_pipelines, f"Requested pipeline '{pipeline_name}', but only the following models are in the collection: '{available_pipelines}'."
        return self.pipelines[pipeline_name]

    def __call__(self, dataset: Dataset) -> Dataset:
        """ Shorthand for `self.process_all_pipelines(dataset)`. """
        return self.process_all_pipelines(dataset=dataset)

<IPython.core.display.Javascript object>