In [None]:
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp preprocessing

<IPython.core.display.Javascript object>

# Preprocessing

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
import numpy as np
import pandas as pd
import datetime as dt
from functools import wraps
from typeguard import typechecked
from abc import ABC, abstractmethod
from rich import print as rich_print

from numerai_blocks.dataset import Dataset

<IPython.core.display.Javascript object>

## 0. Base

### 0.1. BaseProcessor

In [None]:
#export
@typechecked
class BaseProcessor(ABC):
    """
    Common interface for preprocessing and postprocessing steps.

    New Preprocessors and Postprocessors should inherit from this object
    and implement the transform method. Instances are callable: calling a
    processor forwards all arguments to its transform method.
    """
    def __init__(self):
        ...

    @abstractmethod
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Apply the processing step. Must be implemented by subclasses.
        :param dataset: Dataset to process.
        :return: Dataset produced by the step.
        """
        ...

    def __call__(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Shorthand for calling transform with the same arguments.
        :param dataset: Dataset to process.
        :return: Whatever transform returns.
        """
        # Forward `dataset` positionally. Combining `dataset=dataset` with `*args`
        # raises "got multiple values for argument 'dataset'" as soon as an
        # extra positional argument is supplied.
        return self.transform(dataset, *args, **kwargs)

<IPython.core.display.Javascript object>

### 0.2. Decorators

In [None]:
#export
def support_dataf_processing(func):
    """
    Decorator that lets a Dataset processor accept a Pandas DataFrame.
    The incoming DataFrame is wrapped in a Dataset, handed to `func`,
    and the `dataf` attribute of the result is returned.
    :param func: Some function/method that takes Dataset as input
    and returns Dataset.
    :return: Function taking a DataFrame (plus any extra arguments for `func`)
    and returning a DataFrame.
    """
    @wraps(func)
    def wrapper(dataf: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
        as_dataset = Dataset(dataf)
        processed = func(as_dataset, *args, **kwargs)
        return processed.dataf
    return wrapper

def support_dataset_processing(func):
    """
    Make existing DataFrame transformer compatible with Dataset input.
    The wrapped function receives `dataset.dataf`; its result is combined with
    the remaining attributes of the input Dataset into a new Dataset.
    The input Dataset itself is left untouched.
    :param func: Some function/method that takes Pandas DataFrame as input
    and returns Pandas DataFrame.
    :return: Function taking a Dataset (plus any extra arguments for `func`)
    and returning a Dataset.
    """
    @wraps(func)
    def wrapper(dataset: Dataset, *args, **kwargs) -> Dataset:
        dataf_transform = func(dataset.dataf, *args, **kwargs)
        # Build a fresh dict instead of popping from `dataset.__dict__`:
        # `__dict__` is the live attribute storage, so popping "dataf" from it
        # would remove the DataFrame from the caller's Dataset.
        metadata = {key: value for key, value in dataset.__dict__.items() if key != "dataf"}
        return Dataset(dataf_transform, metadata)
    return wrapper

<IPython.core.display.Javascript object>

#### Decorator tests

In [None]:
# Random DataFrame: 100 rows of uniform random values, one column per test feature.
# The letter string skips "J", which yields exactly 10 names for the 10 columns.
test_features = [f"feature_{l}" for l in "ABCDEFGHIK"]
# NOTE(review): `id_col` is not referenced by any other cell visible in this notebook.
id_col = [uuid.uuid4().hex for _ in range(100)]
# No seed is set, so the values differ between runs.
df = pd.DataFrame(np.random.uniform(size=(100, 10)), columns=test_features)

def pandas_quad(df, col: str):
    """
    Simple DataFrame function which raises a given column to the power of 2.
    :param df: DataFrame containing the column. It is not modified.
    :param col: Name of the column to square.
    :return: Copy of `df` in which `col` holds the squared values.
    """
    df_copy = df.copy()
    # Vectorised power instead of a per-element `apply(lambda ...)`.
    df_copy.loc[:, col] = df_copy[col] ** 2
    return df_copy

class QuadraticProcessor(BaseProcessor):
    """ Simple Dataset Processor which raises a given column to the power of 2. """
    def __init__(self):
        super().__init__()

    def transform(self, dataset: Dataset, col: str) -> Dataset:
        """
        Square one column on the object returned by `dataset.copy_dataset()`.
        :param dataset: Dataset holding the column to square.
        :param col: Name of the column to square.
        :return: New Dataset built from the attributes of the modified copy.
        """
        working_copy = dataset.copy_dataset()
        squared = working_copy.get_column_selection(col).pipe(self._quadratic)
        working_copy.dataf.loc[:, col] = squared
        return Dataset(**working_copy.__dict__)

    @staticmethod
    def _quadratic(dataf: pd.DataFrame):
        """ Raise every value of the given DataFrame to the power of 2. """
        return dataf ** 2


@support_dataset_processing
def test_dataf_with_dataset_input(dataf: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    DataFrame function made callable with a Dataset through the decorator.
    The decorator passes `dataset.dataf` as first positional argument, so the
    body works on a plain DataFrame and returns a DataFrame.
    :param dataf: DataFrame taken from the Dataset given to the decorated function.
    :param col: Name of the column to square.
    """
    return pandas_quad(dataf, col=col)

@support_dataf_processing
def test_dataset_with_dataf_input(dataset: Dataset, col: str) -> Dataset:
    """
    Dataset processor made callable with a DataFrame through the decorator.
    The decorator wraps the incoming DataFrame in a Dataset and passes it as
    first positional argument, so the body receives and returns a Dataset.
    :param dataset: Dataset created by the decorator from the input DataFrame.
    :param col: Name of the column to square.
    """
    return QuadraticProcessor().transform(dataset=dataset, col=col)

<IPython.core.display.Javascript object>

In [None]:
# Square the same column through both decorator routes.
feature_to_transform = "feature_B"
# Route 1: DataFrame function called with a Dataset.
transformed_dataset = test_dataf_with_dataset_input(Dataset(df), col=feature_to_transform)
# Route 2: Dataset processor called with a DataFrame.
transformed_df = test_dataset_with_dataf_input(df, col=feature_to_transform)

<IPython.core.display.Javascript object>

In [None]:
# Select the transformed column from both results and round to 8 decimals before comparing.
transform_method_1 = transformed_dataset.get_column_selection(feature_to_transform).round(8)
transform_method_2 = transformed_df.loc[:, [feature_to_transform]].round(8)

# Both decorator routes must yield the same values.
assert transform_method_1.equals(transform_method_2)

<IPython.core.display.Javascript object>

### 0.3. Logging

In [None]:
#export
def display_processor_info(func):
    """
    Decorator that reports on a finished processing step.
    After `func` returns, the first part of its qualified name, the shape of
    `result.dataf` and the elapsed time are printed with rich.
    :param func: Callable whose return value has a `dataf` attribute,
    for example the transform method of a processor.
    :return: Wrapped callable that returns the result of `func` unchanged.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started_at = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - started_at)
        # For a method the qualified name looks like "ClassName.method";
        # keep only the part before the first dot.
        class_name, _, _ = func.__qualname__.partition('.')
        rich_print(f":white_check_mark: Finished step [bold]{class_name}[/bold]. Output shape={result.dataf.shape}. Time taken for step: {time_taken}. :white_check_mark:")
        return result
    return wrapper

<IPython.core.display.Javascript object>

## 1. Common preprocessing steps


In [None]:
#export
@typechecked
class CopyPreProcessor(BaseProcessor):
    """
    Pipeline step that passes on a copy of the incoming Dataset,
    to avoid manipulation of the original DataFrame by later steps.
    """
    def __init__(self):
        super(CopyPreProcessor, self).__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        :param dataset: Dataset to copy.
        :return: Result of `dataset.copy_dataset()`.
        """
        dataset_copy = dataset.copy_dataset()
        return dataset_copy

@typechecked
class GroupStatsPreProcessor(BaseProcessor):
    """
    WARNING: Only supported for Version 1 (legacy) data.
    Calculate group statistics for all data groups.
    For every group name in `self.group_names` the columns
    `feature_{group}_mean`, `feature_{group}_std` and `feature_{group}_skew`
    are added to the DataFrame.
    """
    def __init__(self):
        super(GroupStatsPreProcessor, self).__init__()
        self.group_names = ["intelligence", "wisdom", "charisma",
                            "dexterity", "strength", "constitution"]

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        Add the group statistics columns to the Dataset.
        Note that `dataset.dataf` of the input Dataset is reassigned as well.
        :param dataset: Dataset with a `version` attribute equal to 1.
        :return: New Dataset built from the attributes of the input Dataset.
        """
        self._check_data_validity(dataset=dataset)
        dataset.dataf = dataset.dataf.pipe(self._add_group_features)
        return Dataset(**dataset.__dict__)

    def _add_group_features(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """
        Mean, standard deviation and skew for each group.
        The columns are added to `dataf` in place and `dataf` is returned.
        """
        for group in self.group_names:
            stat_cols = {stat: f"feature_{group}_{stat}" for stat in ("mean", "std", "skew")}
            # Select from the DataFrame that was passed in, not from a notebook global.
            # Previously generated statistics columns are excluded so that applying
            # the step twice does not feed them back into the statistics.
            cols = [col for col in dataf.columns
                    if group in col and col not in stat_cols.values()]
            group_dataf = dataf[cols]
            dataf[stat_cols["mean"]] = group_dataf.mean(axis=1)
            dataf[stat_cols["std"]] = group_dataf.std(axis=1)
            dataf[stat_cols["skew"]] = group_dataf.skew(axis=1)
        return dataf


    def _check_data_validity(self, dataset: Dataset):
        """
        Make sure the Dataset declares version 1.
        :raises AssertionError: If `version` is missing or not equal to 1.
        """
        assert hasattr(dataset, 'version'), f"Version should be specified for '{self.__class__.__name__}' This Preprocessor will only work on version 1 data."
        assert getattr(dataset, 'version') == 1, f"'{self.__class__.__name__}' only works on version 1 data. Got version: '{getattr(dataset, 'version')}'."


<IPython.core.display.Javascript object>

#### Tests

In [None]:
# Top 10 rows of numerai version 1 (legacy) training data.
# NOTE: this rebinds `df`, which held the random test DataFrame in the decorator tests above.
df = pd.read_csv("test_assets/mini_numerai_version_1_data.csv")
df.head(2)

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25


<IPython.core.display.Javascript object>

In [None]:
# Metadata is handed to Dataset as second argument; the tests below read it back
# as attributes (`dataset.version`, `dataset.multi_target`).
metadata = {"version": 1, "multi_target": False}
dataset = Dataset(df, metadata)

<IPython.core.display.Javascript object>

In [None]:
copied_dataset = CopyPreProcessor().transform(dataset)
assert copied_dataset.dataf.equals(dataset.dataf)
# The left-hand tuple must be parenthesised. Without the parentheses Python reads
# `assert dataset.version, <message>` and the metadata comparison is never checked.
assert (dataset.version, dataset.multi_target) == (copied_dataset.version, copied_dataset.multi_target)

<IPython.core.display.Javascript object>

In [None]:
group_features_dataset = GroupStatsPreProcessor().transform(copied_dataset)
assert group_features_dataset.version == 1
# Every group must have received its mean, std and skew column.
expected_stat_cols = {
    f"feature_{group}_{stat}"
    for group in GroupStatsPreProcessor().group_names
    for stat in ("mean", "std", "skew")
}
assert expected_stat_cols <= set(group_features_dataset.dataf.columns)
# Keep the preview as last expression so the notebook actually displays it.
group_features_dataset.dataf.head(2)

<IPython.core.display.Javascript object>

In [None]:
# copied_dataset.version = 2
# GroupStatsPreProcessor().transform(copied_dataset)

<IPython.core.display.Javascript object>

## 2. Custom preprocessors

There is an almost unlimited number of ways to preprocess data. We have only scratched the surface with the preprocessors currently implemented in `numerai-blocks`. We invite the Numerai community to develop Numerai Classic and Signals preprocessors for `numerai-blocks`.

A new Preprocessor should inherit from `BaseProcessor` and implement a `transform` method. The `transform` method should take a `Dataset` as input and return a `Dataset` object as output. An example is given below.

To enable fancy logging output, add the `@display_processor_info` decorator to the `transform` method.

Note that arbitrary metadata can be added or changed in a preprocessing step.

In [None]:
class AwesomePreProcessor(BaseProcessor):
    """
    - TEMPLATE -
    Do some awesome preprocessing.
    Copy this class, rename it and fill in the transform method.
    """
    def __init__(self, *args, **kwargs):
        # The template accepts arbitrary arguments but does not store them.
        super(AwesomePreProcessor, self).__init__()

    @display_processor_info
    def transform(self, dataset: Dataset, *args, **kwargs) -> Dataset:
        """
        :param dataset: Dataset to process.
        :return: New Dataset built from all attributes of `dataset`.
        """
        # Do processing
        ...
        # Pass all contents of Dataset to the next pipeline step
        return Dataset(**dataset.__dict__)

<IPython.core.display.Javascript object>

-------------------------------------------

In [None]:
# hide
# Run this cell to sync all changes with library
# (the saved output below lists every notebook that was converted).
from nbdev.export import notebook2script

notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04a_model.ipynb.
Converted 04b_modelpipeline.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_prediction_dataset.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staker.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>