In [1]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp preprocessing

<IPython.core.display.Javascript object>

# Preprocessing

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
#export
import uuid
import time
# import talib
import numpy as np
import pandas as pd
import datetime as dt
from typing import Union
from functools import wraps
# from scipy.special import gamma
from typeguard import typechecked
from abc import ABC, abstractmethod
from rich import print as rich_print

from numerai_blocks.dataset import Dataset, create_dataset

<IPython.core.display.Javascript object>

## 0. Base

### 0.1. BaseProcessor

In [5]:
#export
class BaseProcessor(ABC):
    """
    Common interface for all pre- and postprocessing steps.

    New Preprocessors and Postprocessors should inherit from this object
    and implement the transform method. Instances are callable: calling an
    instance is the same as calling its transform method.
    """
    def __init__(self):
        ...

    @abstractmethod
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Apply the processing step.
        :param dataset: Dataset to process.
        :return: The processed Dataset.
        """
        ...

    def __call__(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Shorthand for transform.
        :param dataset: Dataset to process. Any extra positional and keyword
        arguments are forwarded to transform unchanged.
        :return: Whatever transform returns.
        """
        # Forward dataset positionally. With `transform(dataset=dataset, *args)`
        # the first extra positional argument is bound to `dataset` as well,
        # which raises "got multiple values for argument 'dataset'".
        return self.transform(dataset, *args, **kwargs)

<IPython.core.display.Javascript object>

### 0.2. Decorators

You might have preprocessing functions that work with Pandas DataFrames and want to make them compatible with `numerai-blocks` `Dataset`s. This can be done by adding a single decorator (`support_dataset_processing`) to a method where you call a DataFrame function.

In [6]:
#export
def support_dataset_processing(func):
    """
    Make existing DataFrame transformer compatible with Dataset input.

    The wrapped callable receives `dataset.dataf` in place of the Dataset.
    Its result is put into a new Dataset together with every other attribute
    of the input Dataset.
    :param func: Some function/method that takes Pandas DataFrame as input
    and return Pandas DataFrame.
    :return: Wrapper that takes a Dataset (plus the extra arguments of func)
    and returns a Dataset.
    """
    @wraps(func)
    def wrapper(dataset: Dataset, *args, **kwargs) -> Dataset:
        dataf_transform = func(dataset.dataf, *args, **kwargs)
        # Build a fresh dict. Popping "dataf" from dataset.__dict__ itself
        # would delete the attribute from the caller's Dataset.
        metadata = {key: value for key, value in dataset.__dict__.items() if key != "dataf"}
        return Dataset(dataf_transform, metadata)
    return wrapper

<IPython.core.display.Javascript object>

#### Decorator example + tests.

In [7]:
# Toy data for the decorator example: 100 rows of uniform noise, one column per feature.
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
# One random hex identifier per row.
id_col = [uuid.uuid4().hex for _ in range(100)]
df = pd.DataFrame(
    data=np.random.uniform(size=(100, len(test_features))),
    columns=test_features,
)

def pandas_quad(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Simple DataFrame function that raises a given column to the power of 2.
    :param df: Input DataFrame. It is left untouched.
    :param col: Name of the column to square.
    :return: Copy of df in which col holds the squared values.
    """
    df_copy = df.copy()
    # Vectorised power on the whole column instead of a Python lambda per element.
    df_copy.loc[:, col] = df_copy.loc[:, col] ** 2
    return df_copy

@support_dataset_processing
def test_dataf_with_dataset_input(dataset: Dataset, col: str) -> Dataset:
    """
    Put Dataset through function that normally only accepts Pandas DataFrame.
    :param dataset: Dataset, as seen by callers of the decorated function.
    :param col: Column name forwarded to pandas_quad.
    :return: Dataset, as produced by the support_dataset_processing wrapper.
    """
    # The decorator calls this body with `dataset.dataf`, so inside the body
    # `dataset` is a DataFrame and can be handed to pandas_quad directly.
    return pandas_quad(dataset, col=col)

<IPython.core.display.Javascript object>

The function `test_dataf_with_dataset_input` will behave as a `Dataset` preprocessor even though the function called within normally only accepts Pandas DataFrames.

`transform_method_1` and `transform_method_2` will therefore lead to the same result.

In [8]:
feature_to_transform = "feature_B"
# Route 1: Dataset in, Dataset out, through the decorated function.
transform_method_1 = (
    test_dataf_with_dataset_input(Dataset(df), col=feature_to_transform)
    .get_column_selection(feature_to_transform)
    .round(8)
)
# Route 2: plain DataFrame through the undecorated pandas function.
transform_method_2 = (
    pandas_quad(df, col=feature_to_transform)
    .loc[:, [feature_to_transform]]
    .round(8)
)

# Both routes must agree.
assert transform_method_1.equals(transform_method_2)

<IPython.core.display.Javascript object>

### 0.3. Logging

We would like to keep an overview of which steps are done in a data pipeline and where processing bottlenecks occur.
The decorator below will display:
1. When a step has finished.
2. What the output shape of the data is.
3. How long the step took to finish.

Note that this wrapper only works for methods that return a `Dataset` object as a result. All implemented preprocessors, models and postprocessors have this property.

To use this functionality, simply add `@display_processor_info` as a decorator to the function/method you want to track.

In [9]:
#export
def display_processor_info(func):
    """
    Fancy console output for data processing.

    After the wrapped callable returns, one line is printed with the step
    name, the shape of `result.dataf` and the time the call took.
    :param func: Function/method whose return value has a `dataf` attribute
    with a `shape`.
    :return: Wrapped callable that returns the result of func unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured duration cannot be
        # distorted (or turn negative) when the system clock is adjusted.
        tic = time.perf_counter()
        result = func(*args, **kwargs)
        time_taken = str(dt.timedelta(seconds=time.perf_counter() - tic))
        # First component of the qualified name: the class name for methods,
        # the function name itself for module-level functions.
        class_name = func.__qualname__.split('.')[0]
        rich_print(f":white_check_mark: Finished step [bold]{class_name}[/bold]. Output shape={result.dataf.shape}. Time taken for step: [blue]{time_taken}[/blue]. :white_check_mark:")
        return result
    return wrapper

<IPython.core.display.Javascript object>

In [10]:
class TestDisplay:
    """ Minimal stand-in for a processor, used to demonstrate display_processor_info. """
    def __init__(self, dataset: Dataset):
        self.dataset = dataset
    @display_processor_info
    def test(self) -> Dataset:
        """ Return the stored Dataset unchanged after a two second pause. """
        # Simulated work, so that the reported step time is not close to zero.
        time.sleep(2)
        return self.dataset

# No version is passed here; a later cell reloads this file with version=1.
dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv")
# The trailing semicolon keeps the notebook from rendering the returned Dataset.
TestDisplay(dataset).test();

<IPython.core.display.Javascript object>

## 1. Common preprocessing steps


### 1.1. Version agnostic

In [11]:
#export
@typechecked
class CopyPreProcessor(BaseProcessor):
    """
    Hand back a copy of the incoming Dataset, so that the original
    DataFrame is not manipulated by whatever is done with the result.
    """
    def __init__(self):
        super().__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        # The copying itself is delegated to the Dataset.
        dataset_copy = dataset.copy_dataset()
        return dataset_copy

<IPython.core.display.Javascript object>

In [12]:
dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv", version=1)
copied_dataset = CopyPreProcessor().transform(dataset)
# The copy must hold the same data ...
assert copied_dataset.dataf.equals(dataset.dataf)
# ... and the same metadata. The left-hand side needs its own parentheses:
# `assert a, b == c` only checks that `a` is truthy and uses `b == c` as the
# failure message, so the comparison was never actually enforced.
assert (dataset.version, dataset.multi_target) == (copied_dataset.version, copied_dataset.multi_target)

<IPython.core.display.Javascript object>

In [13]:
#export
@typechecked
class FeatureSelectionPreProcessor(BaseProcessor):
    """
    Keep only features given + all target, predictions and aux columns.
    :param feature_cols: Name of a single feature column or a list of
    feature column names to keep.
    """
    def __init__(self, feature_cols: Union[str, list]):
        super(FeatureSelectionPreProcessor, self).__init__()
        # The signature accepts a bare string, but `str + list` raises
        # TypeError in transform, so a single name is wrapped in a list.
        self.feature_cols = [feature_cols] if isinstance(feature_cols, str) else feature_cols

    @display_processor_info
    def transform(self, dataset: Dataset) -> Dataset:
        """
        Reduce the Dataset to the selected columns.
        Note that `dataf` of the input Dataset is reassigned in the process.
        :param dataset: Dataset to select columns from.
        :return: New Dataset built from the attributes of the updated input.
        """
        keep_cols = self.feature_cols + dataset.target_cols + dataset.prediction_cols + dataset.aux_cols
        dataset.dataf = dataset.dataf.loc[:, keep_cols]
        return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

In [14]:
selected_dataset = FeatureSelectionPreProcessor(feature_cols=['feature_wisdom1']).transform(dataset)
assert selected_dataset.get_feature_data.shape[1] == 1
# Compare the metadata as two tuples. Without parentheses on the left,
# `assert a, b == c` only checks that `a` is truthy and treats `b == c`
# as the failure message.
assert (dataset.version, dataset.multi_target) == (selected_dataset.version, selected_dataset.multi_target)

<IPython.core.display.Javascript object>

In [15]:
# Preview the reduced frame: feature_wisdom1 is the only feature that was selected.
selected_dataset.dataf.head(2)

Unnamed: 0,feature_wisdom1,target,id,era,data_type
0,0.25,0.5,n000315175b67977,era1,train
1,0.5,0.25,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

In [16]:
#export
@typechecked
class TargetSelectionPreProcessor(BaseProcessor):
    """
    Keep only targets given + all feature, predictions and aux columns.
    :param target_cols: Name of a single target column or a list of
    target column names to keep.
    """
    def __init__(self, target_cols: Union[str, list]):
        super(TargetSelectionPreProcessor, self).__init__()
        # The signature accepts a bare string, but `str + list` raises
        # TypeError in transform, so a single name is wrapped in a list.
        self.target_cols = [target_cols] if isinstance(target_cols, str) else target_cols

    @display_processor_info
    def transform(self, dataset: Dataset) -> Dataset:
        """
        Reduce the Dataset to the selected targets plus all other column groups.
        Note that `dataf` of the input Dataset is reassigned in the process.
        :param dataset: Dataset to select columns from.
        :return: New Dataset built from the attributes of the updated input.
        """
        keep_cols = self.target_cols + dataset.feature_cols + dataset.prediction_cols + dataset.aux_cols
        dataset.dataf = dataset.dataf.loc[:, keep_cols]
        return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

In [17]:
# Switch to the version 2 test file; `dataset` now refers to version 2 data.
dataset = create_dataset("test_assets/mini_numerai_version_2_data.parquet", version=2)
target_cols = ['target', 'target_nomi_20', 'target_nomi_60']
selected_dataset = TargetSelectionPreProcessor(target_cols=target_cols).transform(dataset)
# The number of target columns left must match the number requested.
assert selected_dataset.get_target_data.shape[1] == len(target_cols)
selected_dataset.dataf.head(2)

Unnamed: 0_level_0,target,target_nomi_20,target_nomi_60,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,...,feature_drawable_exhortative_dispersant,feature_metabolic_minded_armorist,feature_investigatory_inerasable_circumvallation,feature_centroclinal_incentive_lancelet,feature_unemotional_quietistic_chirper,feature_behaviorist_microbiological_farina,feature_lofty_acceptable_challenge,feature_coactive_prefatorial_lucy,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n559bd06a8861222,0.25,0.25,0.5,0.25,0.75,0.25,0.75,0.25,0.5,1.0,...,1.0,0.0,0.0,0.25,0.0,0.0,1.0,0.25,297,train
n9d39dea58c9e3cf,0.5,0.5,0.75,0.75,0.5,0.75,1.0,0.5,0.25,0.5,...,0.25,0.5,0.0,0.25,0.75,1.0,0.75,1.0,3,train


<IPython.core.display.Javascript object>

### 1.2. Version 1 specific

In [18]:
#export
@typechecked
class GroupStatsPreProcessor(BaseProcessor):
    """
    WARNING: Only supported for Version 1 (legacy) data.
    Calculate group statistics for all data groups.
    :param groups: Groups to create features for. All groups by default.
    """
    def __init__(self, groups: Union[list, None] = None):
        super(GroupStatsPreProcessor, self).__init__()
        self.all_groups = ["intelligence", "wisdom", "charisma",
                           "dexterity", "strength", "constitution"]
        # None and an empty list both fall back to all groups.
        self.group_names = groups if groups else self.all_groups

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Add mean, std and skew columns for every configured group.
        Note that `dataf` of the input Dataset is extended in the process.
        :param dataset: Dataset with a `version` attribute equal to 1.
        :return: New Dataset built from the attributes of the updated input.
        :raises AssertionError: If the version is missing or not 1.
        """
        self._check_data_validity(dataset=dataset)
        dataset.dataf = dataset.dataf.pipe(self._add_group_features)
        return Dataset(**dataset.__dict__)

    def _add_group_features(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """ Mean, standard deviation and skew for each group. """
        for group in self.group_names:
            # Columns generated by this method contain the group name too.
            # Leave them out, otherwise a second application would feed the
            # previous statistics back into the new ones.
            generated = {f"feature_{group}_{stat}" for stat in ("mean", "std", "skew")}
            cols = [col for col in dataf.columns if group in col and col not in generated]
            dataf[f"feature_{group}_mean"] = dataf[cols].mean(axis=1)
            dataf[f"feature_{group}_std"] = dataf[cols].std(axis=1)
            dataf[f"feature_{group}_skew"] = dataf[cols].skew(axis=1)
        return dataf

    def _check_data_validity(self, dataset: Dataset):
        """
        Make sure the Dataset is explicitly marked as version 1.
        :raises AssertionError: If `version` is missing or different from 1.
        """
        # Raised explicitly instead of via `assert`, so that the check is
        # still performed when Python runs with -O. The exception type is
        # unchanged for callers that catch AssertionError.
        if not hasattr(dataset, 'version'):
            raise AssertionError(f"Version should be specified for '{self.__class__.__name__}' This Preprocessor will only work on version 1 data.")
        if dataset.version != 1:
            raise AssertionError(f"'{self.__class__.__name__}' only works on version 1 data. Got version: '{dataset.version}'.")

<IPython.core.display.Javascript object>

In [19]:
group_features_dataset = GroupStatsPreProcessor().transform(copied_dataset)
assert group_features_dataset.version == 1
# The preview is the last expression of the cell: a bare expression in the
# middle of a cell is evaluated and then discarded without being rendered.
group_features_dataset.dataf.head(2)

<IPython.core.display.Javascript object>

In [20]:
new_cols =  ['feature_intelligence_mean', 'feature_intelligence_std', 'feature_intelligence_skew',
             'feature_wisdom_mean', 'feature_wisdom_std', 'feature_wisdom_skew',
             'feature_charisma_mean', 'feature_charisma_std', 'feature_charisma_skew',
             'feature_dexterity_mean', 'feature_dexterity_std', 'feature_dexterity_skew',
             'feature_strength_mean', 'feature_strength_std', 'feature_strength_skew',
             'feature_constitution_mean', 'feature_constitution_std', 'feature_constitution_skew']
# Every generated column must be present. A non-empty intersection would
# already be satisfied by a single matching column.
assert set(new_cols).issubset(group_features_dataset.dataf.columns)
group_features_dataset.get_feature_data[new_cols].head(2)

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,feature_intelligence_skew,feature_wisdom_mean,feature_wisdom_std,feature_wisdom_skew,feature_charisma_mean,feature_charisma_std,feature_charisma_skew,feature_dexterity_mean,feature_dexterity_std,feature_dexterity_skew,feature_strength_mean,feature_strength_std,feature_strength_skew,feature_constitution_mean,feature_constitution_std,feature_constitution_skew
0,0.333333,0.246183,0.558528,0.668478,0.236022,-0.115082,0.438953,0.25991,-0.004783,0.696429,0.200446,-0.60762,0.480263,0.292829,-0.372064,0.427632,0.27572,0.276155
1,0.208333,0.234359,0.382554,0.559783,0.358177,-0.062362,0.485465,0.252501,-0.021737,0.267857,0.249312,0.382267,0.407895,0.309866,0.220625,0.644737,0.33408,-0.794938


<IPython.core.display.Javascript object>

`GroupStatsPreProcessor` should break if `version != 1`.

In [21]:
def test_invalid_version(dataset: Dataset) -> bool:
    """
    Check that GroupStatsPreProcessor refuses data that is not version 1.
    :param dataset: Any Dataset. A copy of it is relabelled as version 2.
    :return: True if transform raised AssertionError, False otherwise.
    """
    copied_dataset = dataset.copy_dataset()
    copied_dataset.version = 2
    try:
        GroupStatsPreProcessor().transform(copied_dataset)
    except AssertionError:
        return True
    return False

# Assert on the outcome: a discarded return value can never fail the notebook.
assert test_invalid_version(dataset)

<IPython.core.display.Javascript object>

### 1.3. Version 2 specific

### 1.4. Signals specific

#### 1.4.1. Pattern Features

In [22]:
# # export
# @typechecked
# class TalibPatternFeatures(BaseProcessor):
#     """
#     Get all pattern recognition features available in TA-Lib
#     More information: https://mrjbq7.github.io/ta-lib/func_groups/pattern_recognition.html
#     """
#     def __init__(self, ticker_col: str = "ticker"):
#         super(TalibPatternFeatures, self).__init__()
#         self.ticker_col = ticker_col
#         # All pattern recognition features start with "CDL" in TA-Lib
#         self.funcs = [getattr(talib, name) for name in dir(talib) if name.startswith("CDL")]
#
#     @display_processor_info
#     def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
#         all_tickers = list(dataset.dataf[self.ticker_col].unique())
#         for func in self.funcs:
#             for ticker in all_tickers:
#                 index_mask = dataset.dataf[self.ticker_col] == ticker
#                 sub_df = dataset.dataf.loc[index_mask, :]
#                 open, high = sub_df['Open'], sub_df['High']
#                 low, close = sub_df['Low'], sub_df['Close']
#                 dataset.dataf.loc[index_mask, f"feature_{func.__qualname__}"] = func(open, high, low, close)
#         return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

In [23]:
# signals_test_data = create_dataset("test_assets/bitcoin_time_series_test_data.csv")
# ticker_col = "Symbol"
# ta_patterns = TalibPatternFeatures(ticker_col=ticker_col)
# transformed_dataset = ta_patterns.transform(signals_test_data)
# assert len(transformed_dataset.feature_cols) == 61
# assert not (transformed_dataset.get_feature_data == 0).all().all()
# transformed_dataset.get_feature_data.head(2)

<IPython.core.display.Javascript object>

#### 1.4.2. Volume indicators

In [24]:
# # export
# @typechecked
# class TalibVolumeFeatures(BaseProcessor):
#     """
#     Get all volume features using TA-Lib
#     More information: https://mrjbq7.github.io/ta-lib/func_groups/volume_indicators.html
#
#     :param ticker_col: Column name that points to ticker names
#     :param fastperiod, slowperiod: Periodic arguments for ADOSC (Chaikin A/D Oscillator).
#     See http://www.tadoc.org/indicator/ADOSC.htm for more information on how these arguments are used.
#     """
#     def __init__(self, ticker_col: str = "ticker", fastperiod: int = 3, slowperiod: int = 10):
#         super(TalibVolumeFeatures, self).__init__()
#         self.ticker_col = ticker_col
#         self.volume_features = ['AD', 'ADOSC', 'OBV']
#         self.fastperiod = fastperiod
#         self.slowperiod = slowperiod
#
#     @display_processor_info
#     def transform(self, dataset: Dataset) -> Dataset:
#         all_tickers = list(dataset.dataf[self.ticker_col].unique())
#         for ticker in all_tickers:
#             index_mask = dataset.dataf[self.ticker_col] == ticker
#             sub_df = dataset.dataf.loc[index_mask, :]
#
#             high, low = sub_df['High'].to_numpy(), sub_df['Low'].to_numpy()
#             close, volume = sub_df['Close'].to_numpy(), sub_df['Volume'].to_numpy()
#
#             dataset.dataf.loc[index_mask, "feature_AD"] = talib.AD(high, low, close, volume)
#             adosc = talib.ADOSC(high, low, close, volume, fastperiod=self.fastperiod, slowperiod=self.slowperiod)
#             dataset.dataf.loc[index_mask, "feature_ADOSC"] = np.nan_to_num(adosc, nan=0.0)
#             dataset.dataf.loc[index_mask, "feature_OBV"] = talib.OBV(close, volume)
#         return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

In [25]:
# signals_test_data = create_dataset("test_assets/bitcoin_time_series_test_data.csv")
# ticker_col = "Symbol"
# fastperiod, slowperiod = 4, 11
# ta_volume_features = TalibVolumeFeatures(ticker_col=ticker_col, fastperiod=fastperiod, slowperiod=slowperiod)
# dataset_with_volume_features = ta_volume_features.transform(dataset=signals_test_data)
# assert fastperiod == ta_volume_features.fastperiod
# assert slowperiod == ta_volume_features.slowperiod
# assert list(dataset_with_volume_features.get_feature_data.columns) == [f"feature_{name}" for name in ta_volume_features.volume_features]
# dataset_with_volume_features.get_feature_data.head(2)

<IPython.core.display.Javascript object>

#### 1.4.3. Realized volatility

**TODO:** Implement 30-day, 90-day, 180-day and 360-day realized volatility?

In [26]:
# export
# @typechecked
# class RealizedVolFeatures(BaseProcessor):
#     """
#     Features based on realized volatility features.
#     Dataset DataFrame should have a column named "Close".
#     Source and more information: https://dspyt.com/advanced-realized-volatility-and-quarticity/
#     """
#     def __init__(self, ticker_col: str = 'ticker', price_col: str = 'Close'):
#         super(RealizedVolFeatures, self).__init__()
#         self.ticker_col = ticker_col
#         self.price_col = price_col
#
#     @display_processor_info
#     def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
#         """ Get all realized volatility features. """
#         series = dataset.dataf.loc[:, self.price_col]
#         tickers = dataset.dataf.loc[:, self.ticker_col]
#         dataset.dataf.loc[:, "feature_vol2"] = series.groupby(tickers).agg(self.realized_2)
#         dataset.dataf.loc[:, "feature_vol3"] = series.groupby(tickers).agg(self.realized_3)
#         dataset.dataf.loc[:, "feature_vol4"] = series.groupby(tickers).agg(self.realized_4)
#         # Parse all contents of Dataset to the next pipeline step
#         return Dataset(**dataset.__dict__)
#
#     @staticmethod
#     def simple_realized_vol(series: pd.Series) -> np.float64:
#         """ Most simple way to calculate realized volatility. """
#         return np.sqrt(np.sum(series**2))
#
#     @staticmethod
#     def realized_2(series: pd.Series) -> np.float64:
#         return np.sqrt(np.sum(series**4)/(6*np.sum(series**2)))
#
#     @staticmethod
#     def realized_3(series: pd.Series) -> np.float64:
#         return np.sqrt(((np.pi**2)*np.sum(abs(series.rolling(window=4).apply(np.product,
#                                                                              raw=True))))/(8*np.sum(series**2)))
#
#     @staticmethod
#     def realized_4(series: pd.Series) -> np.float64:
#         numerator = (gamma(1/2)**3)*np.sum((abs(series)**(4/3)).rolling(window=3).apply(np.prod))
#         denominator = 8 * (gamma(7/6)**3)*np.sum(series**2)
#         return np.sqrt(numerator/denominator)

<IPython.core.display.Javascript object>

In [27]:
# signals_test_data = create_dataset("test_assets/bitcoin_time_series_test_data.csv")
# ticker_col = "Symbol"
# vol_features = RealizedVolFeatures(ticker_col=ticker_col)
# dataset_with_vol_features = vol_features.transform(dataset=signals_test_data)
# all_vol_cols = ['feature_vol2', 'feature_vol3', 'feature_vol4']
# assert dataset_with_vol_features.feature_cols == all_vol_cols
# for col in all_vol_cols:
#     assert dataset_with_vol_features.dataf[col].value_counts().nunique() != 1
# # assert not dataset_with_vol_features.get_feature_data.isna().all().all()
# dataset_with_vol_features.get_feature_data.head(3)

<IPython.core.display.Javascript object>

#### 1.4.4. Quarticity

In [28]:
# export
# @typechecked
# class QuarticityFeatures(BaseProcessor):
#     """
#     Quarticity (Vol of vol) features.
#     Source and more information: https://dspyt.com/advanced-realized-volatility-and-quarticity/
#     """
#     def __init__(self, ticker_col: str = 'ticker', price_col: str = 'Close'):
#         super(QuarticityFeatures, self).__init__()
#         self.ticker_col = ticker_col
#         self.price_col = price_col
#
#     @display_processor_info
#     def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
#         """ Get most powerful quarticity features for every stock. """
#         series = dataset.dataf.loc[:, self.price_col]
#         tickers = dataset.dataf.loc[:, self.ticker_col]
#         quad_quarticity = series.groupby(tickers).agg(self.realized_quadpower_quarticity)
#         tripower_quarticity = series.groupby(tickers).agg(self.realized_tripower_quarticity)
#         dataset.dataf.loc[:, "feature_quadpower_quarticity"] = quad_quarticity
#         dataset.dataf.loc[:, "feature_tripower_quarticity"] = tripower_quarticity
#         return Dataset(**dataset.__dict__)
#
#     @staticmethod
#     def realized_quarticity(series: pd.Series) -> np.float64:
#         """
#         The realized fourth-power variation or realized quarticity
#         is a consistent estimator of the integrated quarticity.
#         """
#         return np.sum(series**4) * series.shape[0] / 3
#
#     @staticmethod
#     def realized_quadpower_quarticity(series: pd.Series) -> np.float64:
#         """
#         A more robust estimator compared to realized quarticity,
#         particularly in the presence of jumps, is the realized quad-power quarticity.
#         """
#         series = abs(series.rolling(window=4).apply(np.product, raw=True))
#         return (np.sum(series) * series.shape[0] * (np.pi**2)) / 4
#
#     @staticmethod
#     def realized_tripower_quarticity(series: pd.Series) -> np.float64:
#         """ Similarly robust estimator to quad power quarticity. """
#         series = series ** (4/3)
#         series = abs(series).rolling(window=3).apply(np.prod, raw=True)
#         return series.shape[0]*0.25*((gamma(1/2)**3)/(gamma(7/6)**3))*np.sum(series)

<IPython.core.display.Javascript object>

In [29]:
# signals_test_data = create_dataset("test_assets/bitcoin_time_series_test_data.csv")
# ticker_col = "Symbol"
# quart_features = QuarticityFeatures(ticker_col=ticker_col)
# dataset_with_quart_features = quart_features.transform(dataset=signals_test_data)
# assert dataset_with_quart_features.feature_cols == ["feature_quadpower_quarticity", "feature_tripower_quarticity"]
# assert dataset_with_quart_features.dataf["feature_quadpower_quarticity"].value_counts().nunique() != 1
# assert dataset_with_quart_features.dataf["feature_tripower_quarticity"].value_counts().nunique() != 1
# # assert not dataset_with_quart_features.get_feature_data.isna().all().all()
# dataset_with_quart_features.get_feature_data.head(2)

<IPython.core.display.Javascript object>

## 2. Custom preprocessors

There is an almost unlimited number of ways to preprocess data. We have only scratched the surface with the preprocessors currently implemented in `numerai-blocks`. We invite the Numerai community to develop Numerai Classic and Signals preprocessors for `numerai-blocks`.

A new Preprocessor should inherit from `BaseProcessor` and implement a `transform` method. The `transform` method should take a `Dataset` as input and return a `Dataset` object as output. An example is given below.

To enable fancy logging output, add the `@display_processor_info` decorator to the `transform` method.

Note that arbitrary metadata can be added or changed in a preprocessing step.

In [30]:
# export
class AwesomePreProcessor(BaseProcessor):
    """
    - TEMPLATE -
    Starting point for your own awesome preprocessing step.
    """
    def __init__(self, *args, **kwargs):
        super().__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        # Your processing logic goes here
        ...
        # Hand every attribute of the Dataset on to the next pipeline step
        dataset_contents = dataset.__dict__
        return Dataset(**dataset_contents)

<IPython.core.display.Javascript object>

-------------------------------------------

In [31]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 01_download.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>