In [None]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp preprocessing

<IPython.core.display.Javascript object>

# Preprocessing

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
import time
import numpy as np
import pandas as pd
import datetime as dt
from functools import wraps
from typeguard import typechecked
from abc import ABC, abstractmethod
from rich import print as rich_print

from numerai_blocks.dataset import Dataset, create_dataset

<IPython.core.display.Javascript object>

## 0. Base

### 0.1. BaseProcessor

In [None]:
#export
@typechecked
class BaseProcessor(ABC):
    """
    Common base for Preprocessors and Postprocessors.

    New processors should inherit from this object and implement the
    `transform` method. Instances are callable; calling an instance
    forwards to `transform`.
    """
    def __init__(self):
        ...

    @abstractmethod
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Process a Dataset.
        :param dataset: Dataset to process.
        :return: The processed Dataset.
        """
        ...

    def __call__(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Shorthand for `transform`.
        :param dataset: Dataset to process.
        :return: Whatever `transform` returns for the given arguments.
        """
        # Forward `dataset` positionally. Passing it as a keyword next to
        # *args would bind args[0] to `dataset` as well and raise
        # "got multiple values for argument 'dataset'".
        return self.transform(dataset, *args, **kwargs)

<IPython.core.display.Javascript object>

### 0.2. Decorators

You might have preprocessing functions that work with Pandas DataFrames and want to make them compatible with `numerai-blocks` `Dataset`s. This can be done by adding a single decorator (`support_dataset_processing`) to a method where you call a DataFrame function.

In [None]:
def support_dataset_processing(func):
    """
    Make existing DataFrame transformer compatible with Dataset input.

    The wrapped callable is invoked with `dataset.dataf` as its first
    argument. Its result is wrapped in a new Dataset together with all
    other attributes found on the input Dataset.
    :param func: Some function/method that takes Pandas DataFrame as input
    and return Pandas DataFrame.
    :return: Function that takes a Dataset as first argument and returns a Dataset.
    """
    @wraps(func)
    def wrapper(dataset: Dataset, *args, **kwargs) -> Dataset:
        dataf_transform = func(dataset.dataf, *args, **kwargs)
        # Build a separate dict. Popping "dataf" from dataset.__dict__ itself
        # would delete the attribute from the Dataset the caller passed in.
        metadata = {
            key: value
            for key, value in dataset.__dict__.items()
            if key != "dataf"
        }
        return Dataset(dataf_transform, metadata)
    return wrapper

<IPython.core.display.Javascript object>

#### Decorator example + tests.

In [None]:
# Toy data: 100 rows of uniform noise, one column per feature name.
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]
df = pd.DataFrame(
    np.random.uniform(size=(100, 10)),
    columns=test_features,
)

def pandas_quad(df: pd.DataFrame, col: str):
    """
    Simple DataFrame function which raises one column to the power of 2.
    Works on a copy, so the DataFrame passed in is left untouched.
    :param df: Input DataFrame.
    :param col: Name of the column to square.
    :return: Copy of `df` in which `col` holds the squared values.
    """
    squared = df.copy()
    column_values = squared.loc[:, col]
    squared.loc[:, col] = column_values.apply(lambda value: value ** 2)
    return squared

@support_dataset_processing
def test_dataf_with_dataset_input(dataset: Dataset, col: str) -> Dataset:
    """
    Put Dataset through function that normally only accepts Pandas DataFrame.
    The decorator calls this function with `dataset.dataf`, so inside the
    body `dataset` is the underlying DataFrame that `pandas_quad` expects.
    """
    squared = pandas_quad(dataset, col=col)
    return squared

<IPython.core.display.Javascript object>

The function `test_dataf_with_dataset_input` will behave as a `Dataset` preprocessor even though the function called within normally only accepts Pandas DataFrames.

`transform_method_1` and `transform_method_2` will therefore lead to the same result.

In [None]:
feature_to_transform = "feature_B"
# Route 1: wrap the DataFrame in a Dataset and use the decorated function.
transform_method_1 = (
    test_dataf_with_dataset_input(Dataset(df), col=feature_to_transform)
    .get_column_selection(feature_to_transform)
    .round(8)
)
# Route 2: call the plain DataFrame function directly.
transform_method_2 = (
    pandas_quad(df, col=feature_to_transform)
    .loc[:, [feature_to_transform]]
    .round(8)
)

# Both routes must agree on the transformed column.
assert transform_method_1.equals(transform_method_2)

<IPython.core.display.Javascript object>

### 0.3. Logging

We would like to keep an overview of which steps are done in a data pipeline and where processing bottlenecks occur.
The decorator below will display:
1. When a step has finished.
2. What the output shape of the data is.
3. How long the step took to finish.

Note that this wrapper only works for methods that return a `Dataset` object as a result. All implemented preprocessors, models and postprocessors have this property.

To use this functionality, simply add `@display_processor_info` as a decorator to the function/method you want to track.

In [None]:
#export
def display_processor_info(func):
    """
    Fancy console output for data processing.

    After the wrapped callable returns, one line is printed with the step
    name, the shape of `result.dataf` and the time the call took.
    :param func: Function or method whose result has a `dataf` attribute.
    :return: Wrapped callable that returns the result of `func` unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        # Use a monotonic clock so the measured duration is not affected
        # by system clock adjustments while the step is running.
        tic = time.perf_counter()
        result = func(*args, **kwargs)
        time_taken = str(dt.timedelta(seconds=time.perf_counter() - tic))
        # First part of the qualified name: the class for "Class.method",
        # the function name itself for a plain function.
        class_name = func.__qualname__.split('.')[0]
        rich_print(f":white_check_mark: Finished step [bold]{class_name}[/bold]. Output shape={result.dataf.shape}. Time taken for step: [blue]{time_taken}[/blue]. :white_check_mark:")
        return result
    return wrapper

<IPython.core.display.Javascript object>

In [None]:
class TestDisplay:
    """ Minimal class for trying out `display_processor_info` on a method. """

    def __init__(self, dataset: Dataset):
        self.dataset = dataset

    @display_processor_info
    def test(self) -> Dataset:
        """ Wait two seconds, then return the stored Dataset unchanged. """
        time.sleep(2)
        stored = self.dataset
        return stored

# Build a Dataset from the small sample CSV and run it through the decorated
# method so the processing info line is printed.
dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv")
TestDisplay(dataset).test();

<IPython.core.display.Javascript object>

## 1. Common preprocessing steps


In [None]:
#export
@typechecked
class CopyPreProcessor(BaseProcessor):
    """
    Copy the incoming Dataset so that later steps do not manipulate
    the original DataFrame.
    """
    def __init__(self):
        super(CopyPreProcessor, self).__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        :param dataset: Dataset to copy.
        :return: The result of `dataset.copy_dataset()`.
        """
        duplicate = dataset.copy_dataset()
        return duplicate

@typechecked
class GroupStatsPreProcessor(BaseProcessor):
    """
    WARNING: Only supported for Version 1 (legacy) data.
    Calculate group statistics for all data groups.

    For every name in `group_names`, a row-wise mean, standard deviation
    and skew column is added, computed over the columns whose name
    contains that group name.
    """
    def __init__(self):
        super(GroupStatsPreProcessor, self).__init__()
        self.group_names = ["intelligence", "wisdom", "charisma",
                            "dexterity", "strength", "constitution"]

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Add the group statistics columns.
        Note that `dataset.dataf` is reassigned and extended in place, so
        the Dataset passed in is modified as well.
        :param dataset: Dataset with a `version` attribute equal to 1.
        :return: New Dataset built from all attributes of `dataset`.
        :raises AssertionError: If `version` is missing or not 1.
        """
        self._check_data_validity(dataset=dataset)
        dataset.dataf = dataset.dataf.pipe(self._add_group_features)
        return Dataset(**dataset.__dict__)

    def _add_group_features(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """
        Mean, standard deviation and skew for each group.
        The new columns are written into `dataf` itself.
        """
        for group in self.group_names:
            # Leave out statistics columns from an earlier run, so applying
            # the preprocessor twice does not compute statistics of statistics.
            derived = {f"feature_{group}_{stat}" for stat in ("mean", "std", "skew")}
            cols = [col for col in dataf.columns
                    if group in col and col not in derived]
            group_data = dataf[cols]
            dataf[f"feature_{group}_mean"] = group_data.mean(axis=1)
            dataf[f"feature_{group}_std"] = group_data.std(axis=1)
            dataf[f"feature_{group}_skew"] = group_data.skew(axis=1)
        return dataf

    def _check_data_validity(self, dataset: Dataset):
        """
        Make sure the Dataset is marked as version 1 data.
        :raises AssertionError: If `version` is missing or not equal to 1.
        """
        # Raise explicitly instead of using `assert` statements, which are
        # removed when Python runs with -O. The exception type stays
        # AssertionError so existing `except AssertionError` handlers still work.
        if not hasattr(dataset, 'version'):
            raise AssertionError(f"Version should be specified for '{self.__class__.__name__}' This Preprocessor will only work on version 1 data.")
        if dataset.version != 1:
            raise AssertionError(f"'{self.__class__.__name__}' only works on version 1 data. Got version: '{dataset.version}'.")

<IPython.core.display.Javascript object>

#### Tests

In [None]:
# Reload the sample data, this time tagged with version=1 as required by
# GroupStatsPreProcessor.
dataset = create_dataset("test_assets/mini_numerai_version_1_data.csv", version=1)

<IPython.core.display.Javascript object>

In [None]:
copied_dataset = CopyPreProcessor().transform(dataset)
# The copy must hold the same data as the original.
assert copied_dataset.dataf.equals(dataset.dataf)
# Compare the metadata as tuples. Without the parentheses Python reads
# `assert a, b == c` as "assert a" with `b == c` as the failure message,
# so nothing would actually be compared.
assert (dataset.version, dataset.multi_target) == (copied_dataset.version, copied_dataset.multi_target)

<IPython.core.display.Javascript object>

In [None]:
# Add the group statistics to the copy and check the version metadata
# is still 1 on the returned Dataset.
group_features_dataset = GroupStatsPreProcessor().transform(copied_dataset)
group_features_dataset.dataf.head(2)
assert group_features_dataset.version == 1

<IPython.core.display.Javascript object>

In [None]:
# Columns GroupStatsPreProcessor is expected to add: three statistics per group.
new_cols = [
    'feature_intelligence_mean', 'feature_intelligence_std', 'feature_intelligence_skew',
    'feature_wisdom_mean', 'feature_wisdom_std', 'feature_wisdom_skew',
    'feature_charisma_mean', 'feature_charisma_std', 'feature_charisma_skew',
    'feature_dexterity_mean', 'feature_dexterity_std', 'feature_dexterity_skew',
    'feature_strength_mean', 'feature_strength_std', 'feature_strength_skew',
    'feature_constitution_mean', 'feature_constitution_std', 'feature_constitution_skew',
]
# Every expected column must be present. A bare `.intersection(...)` is
# truthy as soon as one column matches and would miss absent statistics.
assert set(new_cols).issubset(group_features_dataset.dataf.columns)
group_features_dataset.get_feature_data[new_cols].head(2)

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,feature_intelligence_skew,feature_wisdom_mean,feature_wisdom_std,feature_wisdom_skew,feature_charisma_mean,feature_charisma_std,feature_charisma_skew,feature_dexterity_mean,feature_dexterity_std,feature_dexterity_skew,feature_strength_mean,feature_strength_std,feature_strength_skew,feature_constitution_mean,feature_constitution_std,feature_constitution_skew
0,0.333333,0.246183,0.558528,0.668478,0.236022,-0.115082,0.438953,0.25991,-0.004783,0.696429,0.200446,-0.60762,0.480263,0.292829,-0.372064,0.427632,0.27572,0.276155
1,0.208333,0.234359,0.382554,0.559783,0.358177,-0.062362,0.485465,0.252501,-0.021737,0.267857,0.249312,0.382267,0.407895,0.309866,0.220625,0.644737,0.33408,-0.794938


<IPython.core.display.Javascript object>

`GroupStatsPreProcessor` should break if `version != 1`.

In [None]:
def test_invalid_version(dataset: Dataset):
    """
    Check that GroupStatsPreProcessor refuses data that is not version 1.
    :param dataset: Dataset to copy; the copy gets its version set to 2.
    :return: True if the transform raised an AssertionError, False otherwise.
    """
    wrong_version = dataset.copy_dataset()
    wrong_version.version = 2
    try:
        GroupStatsPreProcessor().transform(wrong_version)
    except AssertionError:
        rejected = True
    else:
        rejected = False
    return rejected

# Assert on the outcome: the helper only returns True/False, so calling it
# without checking the result could never make this cell fail.
assert test_invalid_version(dataset), "GroupStatsPreProcessor should reject data with version != 1."

<IPython.core.display.Javascript object>

## 2. Custom preprocessors

There is an almost unlimited number of ways to preprocess data. We have only scratched the surface with the preprocessors currently implemented in `numerai-blocks`. We invite the Numerai community to develop Numerai Classic and Signals preprocessors for `numerai-blocks`.

A new Preprocessor should inherit from `BaseProcessor` and implement a `transform` method. The `transform` method should take a `Dataset` as input and return a `Dataset` object as output. An example is given below.

To enable fancy logging output, add the `@display_processor_info` decorator to the `transform` method.

Note that arbitrary metadata can be added or changed in a preprocessing step.

In [None]:
class AwesomePreProcessor(BaseProcessor):
    """
    - TEMPLATE -
    Starting point for your own awesome preprocessing step.
    """
    def __init__(self, *args, **kwargs):
        super().__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        :param dataset: Dataset to process.
        :return: New Dataset built from all attributes of `dataset`.
        """
        # Put your processing here
        ...
        # Hand every attribute of the Dataset on to the next pipeline step
        contents = dataset.__dict__
        return Dataset(**contents)

<IPython.core.display.Javascript object>

-------------------------------------------

In [None]:
# hide
# Run this cell to sync all changes with library
# (calls nbdev's notebook2script; the "Converted ..." lines below are its output).
from nbdev.export import notebook2script

notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staker.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>