In [1]:
# hide
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [2]:
# default_exp preprocessing

<IPython.core.display.Javascript object>

# Preprocessing
> Feature/target selection, engineering and manipulation.

## Overview
This section provides functionality for all data manipulation steps that are needed before data is passed into a model for prediction. We group all these steps under Preprocessing. This includes feature/target selection, feature/target engineering and feature/target manipulation.

Some preprocessors work with both Pandas DataFrames and NumerFrames. Most preprocessors use specific `NumerFrame` functionality.

In the last section we explain how you can implement your own Preprocessor that integrates well with the rest of this framework.

In [3]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [4]:
# export
import os
import time
import warnings
import numpy as np
import pandas as pd
import datetime as dt
from umap import UMAP
import tensorflow as tf
from tqdm.auto import tqdm
from functools import wraps
from scipy.stats import rankdata
from typeguard import typechecked
from abc import ABC, abstractmethod
from rich import print as rich_print
from typing import Union, List, Tuple
from multiprocessing.pool import Pool
from sklearn.linear_model import Ridge
from sklearn.mixture import BayesianGaussianMixture
from sklearn.preprocessing import QuantileTransformer, MinMaxScaler

from numerblox.download import NumeraiClassicDownloader
from numerblox.numerframe import NumerFrame, create_numerframe

<IPython.core.display.Javascript object>

## 0. Base

These objects will provide a base for all pre- and post-processing functionality and log relevant information.

## 0.1. BaseProcessor

`BaseProcessor` defines common functionality for `preprocessing` and `postprocessing` (Section 5).

Every Preprocessor should inherit from `BaseProcessor` and implement the `.transform` method.

In [5]:
# export
class BaseProcessor(ABC):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
        ...

    @abstractmethod
    def transform(
        self, dataf: Union[pd.DataFrame, NumerFrame], *args, **kwargs
    ) -> NumerFrame:
        ...

    def __call__(
        self, dataf: Union[pd.DataFrame, NumerFrame], *args, **kwargs
    ) -> NumerFrame:
        return self.transform(dataf=dataf, *args, **kwargs)

<IPython.core.display.Javascript object>

## 0.2. Logging

We would like to keep an overview of which steps are done in a data pipeline and where processing bottlenecks occur.
The decorator below will display for a given function/method:
1. When it has finished.
2. What the output shape of the data is.
3. How long it took to finish.

To use this functionality, simply add `@display_processor_info` as a decorator to the function/method you want to track.

We will use this decorator throughout the pipeline (`preprocessing`, `model` and `postprocessing`).

Inspiration for this decorator: [Calmcode Pandas Pipe Logs](https://calmcode.io/pandas-pipe/logs.html)

In [6]:
# export
def display_processor_info(func):
    """Fancy console output for data processing."""

    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        class_name = func.__qualname__.split(".")[0]
        rich_print(
            f":white_check_mark: Finished step [bold]{class_name}[/bold]. Output shape={result.shape}. Time taken for step: [blue]{time_taken}[/blue]. :white_check_mark:"
        )
        return result

    return wrapper

<IPython.core.display.Javascript object>

In [7]:
# hide_input
class TestDisplay:
    """
    Small test for logging.
    Output should mention 'TestDisplay',
    Return output shape of (10, 314) and
    time taken for step should be close to 2 seconds.
    """

    def __init__(self, dataf: NumerFrame):
        self.dataf = dataf

    @display_processor_info
    def test(self) -> NumerFrame:
        time.sleep(2)
        return self.dataf


dataf = create_numerframe("test_assets/mini_numerai_version_1_data.csv")
TestDisplay(dataf).test()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75
5,n009aa2d32389eca,era1,train,0.5,0.5,0.25,0.25,0.75,0.75,0.75,...,0.75,0.75,0.0,0.0,0.75,0.5,0.0,0.25,0.0,0.5
6,n009ef1a5fe009b6,era1,train,0.5,0.25,0.25,0.75,1.0,1.0,1.0,...,1.0,1.0,0.5,0.5,0.75,0.5,0.5,0.5,1.0,0.25
7,n00ae5d51f55fb0f,era1,train,0.25,1.0,1.0,0.75,1.0,0.75,0.75,...,0.5,0.25,0.75,0.75,0.0,0.25,0.75,0.5,0.25,0.25
8,n00b0ac86d77aed7,era1,train,0.5,0.5,0.5,1.0,1.0,0.25,0.5,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
9,n00c63366aeaf76a,era1,train,0.5,1.0,1.0,0.25,0.75,0.25,0.25,...,0.0,0.0,1.0,1.0,0.75,0.5,1.0,1.0,0.75,0.75


<IPython.core.display.Javascript object>

## 1. Common preprocessing steps


This section implements commonly used preprocessing for Numerai. We invite the Numerai community to develop new preprocessors.

## 1.0 Tournament agnostic

Preprocessors that can be applied for both Numerai Classic and Numerai Signals.

### 1.0.1. CopyPreProcessor

The first and obvious preprocessor is copying, which is implemented as a default in `ModelPipeline` (Section 4) to avoid manipulation of the original DataFrame or `NumerFrame` that you load in.

In [8]:
# export
@typechecked
class CopyPreProcessor(BaseProcessor):
    """Copy DataFrame to avoid manipulation of original DataFrame."""

    def __init__(self):
        super().__init__()

    @display_processor_info
    def transform(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        return NumerFrame(dataf.copy())

<IPython.core.display.Javascript object>

In [9]:
dataset = create_numerframe(
    "test_assets/mini_numerai_version_1_data.csv", metadata={"version": 1}
)
copied_dataset = CopyPreProcessor().transform(dataset)
assert np.array_equal(copied_dataset.values, dataset.values)
assert dataset.meta == copied_dataset.meta

<IPython.core.display.Javascript object>

### 1.0.2. FeatureSelectionPreProcessor

`FeatureSelectionPreProcessor` will keep all features that you pass + keeps all other columns that are not features.

In [10]:
# export
@typechecked
class FeatureSelectionPreProcessor(BaseProcessor):
    """
    Keep only features given + all target, predictions and aux columns.
    """

    def __init__(self, feature_cols: Union[str, list]):
        super().__init__()
        self.feature_cols = feature_cols

    @display_processor_info
    def transform(self, dataf: NumerFrame) -> NumerFrame:
        keep_cols = (
            self.feature_cols
            + dataf.target_cols
            + dataf.prediction_cols
            + dataf.aux_cols
        )
        dataf = dataf.loc[:, keep_cols]
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [11]:
selected_dataset = FeatureSelectionPreProcessor(
    feature_cols=["feature_wisdom1"]
).transform(dataset)

assert selected_dataset.get_feature_data.shape[1] == 1
assert dataset.meta == selected_dataset.meta

<IPython.core.display.Javascript object>

In [12]:
selected_dataset.head(2)

Unnamed: 0,feature_wisdom1,target,id,era,data_type
0,0.25,0.5,n000315175b67977,era1,train
1,0.5,0.25,n0014af834a96cdd,era1,train


<IPython.core.display.Javascript object>

### 1.0.3. TargetSelectionPreProcessor

`TargetSelectionPreProcessor` will keep all targets that you pass + all other columns that are not targets.

Not relevant for an inference pipeline, but especially convenient for Numerai Classic training if you train on a subset of the available targets. Can also be applied to Signals if you are using engineered targets in your pipeline.


In [13]:
# export
@typechecked
class TargetSelectionPreProcessor(BaseProcessor):
    """
    Keep only features given + all target, predictions and aux columns.
    """

    def __init__(self, target_cols: Union[str, list]):
        super().__init__()
        self.target_cols = target_cols

    @display_processor_info
    def transform(self, dataf: NumerFrame) -> NumerFrame:
        keep_cols = (
            self.target_cols
            + dataf.feature_cols
            + dataf.prediction_cols
            + dataf.aux_cols
        )
        dataf = dataf.loc[:, keep_cols]
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [14]:
dataset = create_numerframe(
    "test_assets/mini_numerai_version_2_data.parquet", metadata={"version": 2}
)
target_cols = ["target", "target_nomi_20", "target_nomi_60"]
selected_dataset = TargetSelectionPreProcessor(target_cols=target_cols).transform(
    dataset
)
assert selected_dataset.get_target_data.shape[1] == len(target_cols)
selected_dataset.head(2)

Unnamed: 0_level_0,target,target_nomi_20,target_nomi_60,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,...,feature_drawable_exhortative_dispersant,feature_metabolic_minded_armorist,feature_investigatory_inerasable_circumvallation,feature_centroclinal_incentive_lancelet,feature_unemotional_quietistic_chirper,feature_behaviorist_microbiological_farina,feature_lofty_acceptable_challenge,feature_coactive_prefatorial_lucy,era,data_type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n559bd06a8861222,0.25,0.25,0.5,0.25,0.75,0.25,0.75,0.25,0.5,1.0,...,1.0,0.0,0.0,0.25,0.0,0.0,1.0,0.25,297,train
n9d39dea58c9e3cf,0.5,0.5,0.75,0.75,0.5,0.75,1.0,0.5,0.25,0.5,...,0.25,0.5,0.0,0.25,0.75,1.0,0.75,1.0,3,train


<IPython.core.display.Javascript object>

### 1.0.4. ReduceMemoryProcessor

Numerai datasets can take up a lot of RAM and may put a strain on your compute environment.

For Numerai Classic, many of the feature and target columns can be downscaled to `float16`. `int8` if you are using the Numerai int8 datasets. For Signals it depends on the features you are generating.

`ReduceMemoryProcessor` downscales the type of your numeric columns to reduce the memory footprint as much as possible.

In [15]:
# export
class ReduceMemoryProcessor(BaseProcessor):
    """
    Reduce memory usage as much as possible.

    Credits to kainsama and others for writing about memory usage reduction for Numerai data:
    https://forum.numer.ai/t/reducing-memory/313

    :param deep_mem_inspect: Introspect the data deeply by interrogating object dtypes.
    Yields a more accurate representation of memory usage if you have complex object columns.
    """

    def __init__(self, deep_mem_inspect=False):
        super().__init__()
        self.deep_mem_inspect = deep_mem_inspect

    @display_processor_info
    def transform(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        dataf = self._reduce_mem_usage(dataf)
        return NumerFrame(dataf)

    def _reduce_mem_usage(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """
        Iterate through all columns and modify the numeric column types
        to reduce memory usage.
        """
        start_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        rich_print(
            f"Memory usage of DataFrame is [bold]{round(start_memory_usage, 2)} MB[/bold]"
        )

        for col in dataf.columns:
            col_type = dataf[col].dtype.name

            if col_type not in [
                "object",
                "category",
                "datetime64[ns, UTC]",
                "datetime64[ns]",
            ]:
                c_min = dataf[col].min()
                c_max = dataf[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int16).min
                        and c_max < np.iinfo(np.int16).max
                    ):
                        dataf[col] = dataf[col].astype(np.int16)
                    elif (
                        c_min > np.iinfo(np.int32).min
                        and c_max < np.iinfo(np.int32).max
                    ):
                        dataf[col] = dataf[col].astype(np.int32)
                    elif (
                        c_min > np.iinfo(np.int64).min
                        and c_max < np.iinfo(np.int64).max
                    ):
                        dataf[col] = dataf[col].astype(np.int64)
                else:
                    if (
                        c_min > np.finfo(np.float16).min
                        and c_max < np.finfo(np.float16).max
                    ):
                        dataf[col] = dataf[col].astype(np.float16)
                    elif (
                        c_min > np.finfo(np.float32).min
                        and c_max < np.finfo(np.float32).max
                    ):
                        dataf[col] = dataf[col].astype(np.float32)
                    else:
                        dataf[col] = dataf[col].astype(np.float64)

        end_memory_usage = (
            dataf.memory_usage(deep=self.deep_mem_inspect).sum() / 1024**2
        )
        rich_print(
            f"Memory usage after optimization is: [bold]{round(end_memory_usage, 2)} MB[/bold]"
        )
        rich_print(
            f"[green] Usage decreased by [bold]{round(100 * (start_memory_usage - end_memory_usage) / start_memory_usage, 2)}%[/bold][/green]"
        )
        return dataf

<IPython.core.display.Javascript object>

In [16]:
dataf = create_numerframe("test_assets/mini_numerai_version_2_data.parquet")
rmp = ReduceMemoryProcessor()
dataf = rmp.transform(dataf)

<IPython.core.display.Javascript object>

In [17]:
# hide
dataf.head(2)

Unnamed: 0_level_0,era,data_type,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,feature_demisable_expiring_millepede,...,target_paul_20,target_paul_60,target_george_20,target_george_60,target_william_20,target_william_60,target_arthur_20,target_arthur_60,target_thomas_20,target_thomas_60
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n559bd06a8861222,297,train,0.25,0.75,0.25,0.75,0.25,0.5,1.0,0.25,...,0.0,0.5,0.25,0.5,0.0,0.5,0.166626,0.5,0.333252,0.5
n9d39dea58c9e3cf,3,train,0.75,0.5,0.75,1.0,0.5,0.25,0.5,0.0,...,0.5,0.75,0.5,0.5,0.666504,0.666504,0.5,0.666504,0.5,0.666504


<IPython.core.display.Javascript object>

### 1.0.5. DeepDreamDataGenerator

Best known for its computer vision applications, DeepDream excites activations in a trained model to augment original input. It uses sevral steps of gradient ascent to achieve this. Numerai participant [nyuton (nemethpeti on Github)](https://github.com/nemethpeti/numerai/blob/main/DeepDream/deepdream.py) implemented a way to apply this technique on Numerai data. Therefore, it allows us to generate synthetic training data. Check out `nbs/edu_nbs/synthetic_data_generation.ipynb` for experiments that demonstrate the effectiveness of using this additional data for training Numerai models.

![](https://b2h3x3f6.stackpathcdn.com/assets/landing/img/gallery/4.jpg)
Source: Example of image generated with DeepDream (deepdreamgenerator.com)

In [18]:
# export
class DeepDreamGenerator(BaseProcessor):
    """
    Generate synthetic eras using DeepDream technique. \n
    Based on implementation by nemethpeti: \n
    https://github.com/nemethpeti/numerai/blob/main/DeepDream/deepdream.py

    :param model_path: Path to trained DeepDream model. Example can be downloaded from \n
    https://github.com/nemethpeti/numerai/blob/main/DeepDream/model.h5 \n
    :param batch_size: How much synthetic data to process in each batch. \n
    :param steps: Number of gradient ascent steps to perform. More steps will lead to more augmentation. \n
    :param step_size: How much to augment the batch based on computed gradients. \n
    Like with the number of steps, a larger step size will lead to more dramatic changes to the input features. \n
    The default parameters are found to work well in practice, but could be further optimized.
    """

    def __init__(
        self,
        model_path: str,
        batch_size: int = 200_000,
        steps: int = 5,
        step_size: float = 0.01,
        feature_names: list = None,
    ):
        super().__init__()
        tf.config.run_functions_eagerly(True)
        self.model_path = model_path
        self.model = self.__load_model(self.model_path)

        self.batch_size = batch_size
        self.steps = steps
        self.step_size = step_size

        self.feature_names = feature_names

    @display_processor_info
    def transform(self, dataf: NumerFrame) -> NumerFrame:
        dream_dataf = self.get_synthetic_batch(dataf)
        dataf = pd.concat([dataf, dream_dataf])
        return NumerFrame(dataf)

    def get_synthetic_batch(self, dataf: NumerFrame) -> NumerFrame:
        """
        Produce a synthetic version of the full input dataset.
        Target features will stay the same as in the original input data.
        """
        features = self.feature_names if self.feature_names else dataf.feature_cols
        targets = dataf.target_cols

        dream_dataf = pd.DataFrame(columns=features)
        for i in tqdm(
            np.arange(0, len(dataf), self.batch_size),
            desc="Deepdreaming Synthetic Batches",
        ):
            start = i
            end = np.minimum(i + self.batch_size - 1, len(dataf) - 1)
            sub_dataf = dataf.reset_index(drop=False).iloc[start:end]
            batch = tf.convert_to_tensor(
                sub_dataf.loc[:, features].astype(np.float32).values
            )

            dream_arr = self._dream(batch)
            batch_dataf = pd.DataFrame(dream_arr, columns=features)
            batch_dataf[targets] = sub_dataf[targets]

            dream_dataf = pd.concat([dream_dataf, batch_dataf])
        return NumerFrame(dream_dataf)

    def _dream(self, batch: tf.Tensor) -> np.ndarray:
        """
        Perform gradient ascent on batch of data.
        This loop perturbs the original features to create synthetic data.
        """
        for _ in tf.range(self.steps):
            with tf.GradientTape() as tape:
                tape.watch(batch)
                layer_activations = self.model(batch)
                loss = tf.math.reduce_mean(layer_activations, -1)

            gradients = tape.gradient(loss, batch)
            gradients /= tf.expand_dims(tf.math.reduce_std(gradients, -1), 1) + 1e-8

            # In gradient ascent, the "loss" is maximized so that the input row increasingly "excites" the layers.
            batch = batch + gradients * self.step_size
        batch = tf.clip_by_value(batch, 0, 1)
        return batch.numpy()

    @staticmethod
    def __load_model(
        model_path: str, output_layer_name: str = "concat"
    ) -> tf.keras.Model:
        """
        Load in Keras model from given path.
        output_layer_name will be the layer used to augment data.
        """
        base_model = tf.keras.models.load_model(model_path)
        base_model.compile(run_eagerly=True)
        # Maximize the activations of these layers
        layers = base_model.get_layer(output_layer_name).output
        # Create the feature extraction model
        dream_model = tf.keras.Model(inputs=base_model.input, outputs=layers)
        return dream_model

<IPython.core.display.Javascript object>

In [19]:
# hide
directory = "deepdream_test/"
downloader = NumeraiClassicDownloader(directory_path=directory)
downloader.download_single_dataset(
    filename="numerai_validation_data.parquet",
    dest_path=directory + "numerai_validation_data.parquet",
)
val_dataf = create_numerframe(f"{directory}numerai_validation_data.parquet")

2022-05-26 18:31:27,863 INFO numerapi.utils: starting download
deepdream_test/numerai_validation_data.parquet: 228MB [00:46, 4.87MB/s]                            


<IPython.core.display.Javascript object>

For our example we will use the model open sourced by [nemethpeti](https://github.com/nemethpeti) which you can download [here](https://github.com/nemethpeti/numerai/blob/main/DeepDream/model.h5). This model works on the v3 medium feature set. We therefore use v3 data in this example. The v3 medium feature set can be easily retrieved using `NumeraiClassicDownloader`.

In [20]:
# hide_output
feature_set = downloader.get_classic_features(filename="v3/features.json")
feature_names = feature_set["feature_sets"]["medium"]

2022-05-26 18:32:18,129 INFO numerapi.utils: starting download
deepdream_test/features.json: 441kB [00:00, 721kB/s]                            


<IPython.core.display.Javascript object>

[Download link to deepdream_model.h5 used here (Github).](https://github.com/nemethpeti/numerai/blob/main/DeepDream/model.h5)

In [21]:
ddg = DeepDreamGenerator(
    model_path="test_assets/deepdream_model.h5", feature_names=feature_names
)

2022-05-26 18:32:18.872832: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


<IPython.core.display.Javascript object>

In [22]:
sample_dataf = NumerFrame(val_dataf.sample(100))

<IPython.core.display.Javascript object>

In [23]:
dreamed_dataf = ddg.transform(sample_dataf)

Deepdreaming Synthetic Batches:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

The new dreamed `NumerFrame` consists of the original data and 100 new additional rows. Note that targets are the same.

Also, `era`, `data_type` and any other columns besides features and targets will be `NaN`s.

In [24]:
print(dreamed_dataf.shape)
dreamed_dataf.tail()

(199, 1073)


Unnamed: 0,era,data_type,feature_dichasial_hammier_spawner,feature_rheumy_epistemic_prancer,feature_pert_performative_hormuz,feature_hillier_unpitied_theobromine,feature_perigean_bewitching_thruster,feature_renegade_undomestic_milord,feature_koranic_rude_corf,feature_demisable_expiring_millepede,...,target_paul_20,target_paul_60,target_george_20,target_george_60,target_william_20,target_william_60,target_arthur_20,target_arthur_60,target_thomas_20,target_thomas_60
94,,,0.006177,0.253282,0.562395,0.0,0.521255,0.496594,0.742214,0.0,...,0.75,1.0,0.5,1.0,0.666667,0.833333,0.666667,0.833333,0.666667,0.833333
95,,,0.229461,0.275013,0.211137,0.278999,0.040938,0.747025,0.254441,0.740341,...,0.5,0.75,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
96,,,0.525625,0.794636,0.224619,0.529903,0.220084,1.0,0.747399,0.131461,...,1.0,1.0,0.5,0.75,0.666667,0.5,0.666667,0.5,1.0,0.5
97,,,0.783618,0.0,0.992784,0.0,0.0,0.0,0.566844,0.955341,...,0.5,0.5,0.5,0.5,0.5,0.5,0.333333,0.5,0.333333,0.5
98,,,0.0,0.072171,0.748574,0.036015,0.801692,0.481805,0.248885,0.202161,...,0.25,0.25,0.25,0.25,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


<IPython.core.display.Javascript object>

To only keep new synthetic data use `.get_synthetic_batch`.

In [25]:
synth_dataf = ddg.get_synthetic_batch(sample_dataf)

Deepdreaming Synthetic Batches:   0%|          | 0/1 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [26]:
print(synth_dataf.shape)
synth_dataf.head()

(99, 441)


Unnamed: 0,feature_abstersive_emotional_misinterpreter,feature_accessorial_aroused_crochet,feature_acerb_venusian_piety,feature_affricative_bromic_raftsman,feature_agile_unrespited_gaucho,feature_agronomic_cryptal_advisor,feature_alkaline_pistachio_sunstone,feature_altern_unnoticed_impregnation,feature_ambisexual_boiled_blunderer,feature_amoebaean_wolfish_heeler,...,target_paul_20,target_paul_60,target_george_20,target_george_60,target_william_20,target_william_60,target_arthur_20,target_arthur_60,target_thomas_20,target_thomas_60
0,1.0,0.202469,0.215039,1.0,0.805737,0.76092,0.759131,0.506008,0.503724,0.414889,...,0.75,0.75,1.0,0.75,0.666667,0.5,0.666667,0.5,0.833333,0.666667
1,0.953419,0.500796,0.779212,1.0,0.744971,0.508278,0.703535,0.504623,0.717189,0.197521,...,0.25,0.5,0.5,0.5,0.333333,0.5,0.333333,0.333333,0.333333,0.333333
2,0.488666,0.479599,0.954576,0.450511,0.741274,1.0,1.0,0.722007,0.456649,0.67614,...,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5,0.5
3,0.712898,0.732197,0.029215,0.0,0.765652,0.26516,0.273366,0.506347,0.884447,0.181741,...,0.25,0.5,0.5,0.5,0.333333,0.5,0.333333,0.5,0.5,0.666667
4,0.7894,0.286655,0.034882,0.746666,0.537442,0.574934,0.573383,0.223768,0.0,0.0,...,0.5,0.5,0.5,0.75,0.833333,0.666667,0.833333,0.666667,1.0,1.0


<IPython.core.display.Javascript object>

### 1.0.6. UMAPFeatureGenerator

Uniform Manifold Approximation and Projection (UMAP) is a dimensionality reduction technique that we can utilize to generate new Numerai features. This processor uses [umap-learn](https://pypi.org/project/umap-learn) under the hood to model the manifold. The dimension of the input data will be reduced to `n_components` number of features.

In [27]:
# export
class UMAPFeatureGenerator(BaseProcessor):
    """
    Generate new Numerai features using UMAP. Uses umap-learn under the hood: \n
    https://pypi.org/project/umap-learn/
    :param n_components: How many new features to generate.
    :param n_neighbors: Number of neighboring points used in local approximations of manifold structure.
    :param min_dist: How tightly the embedding is allows to compress points together.
    :param metric: Metric to measure distance in input space. Correlation by default.
    :param feature_names: Selection of features used to perform UMAP on. All features by default.
    *args, **kwargs will be passed to initialization of UMAP.
    """

    def __init__(
        self,
        n_components: int = 5,
        n_neighbors: int = 15,
        min_dist: float = 0.0,
        metric: str = "correlation",
        feature_names: list = None,
        *args,
        **kwargs,
    ):
        super().__init__()
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.min_dist = min_dist
        self.feature_names = feature_names
        self.metric = metric
        self.umap = UMAP(
            n_components=self.n_components,
            n_neighbors=self.n_neighbors,
            min_dist=self.min_dist,
            metric=self.metric,
            *args,
            **kwargs,
        )

    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        feature_names = self.feature_names if self.feature_names else dataf.feature_cols
        new_feature_data = self.umap.fit_transform(dataf[feature_names])
        umap_feature_names = [f"feature_umap_{i}" for i in range(self.n_components)]
        norm_new_feature_data = MinMaxScaler().fit_transform(new_feature_data)
        dataf.loc[:, umap_feature_names] = norm_new_feature_data
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [28]:
n_components = 3
umap_gen = UMAPFeatureGenerator(n_components=n_components, n_neighbors=9)
dataf = create_numerframe("test_assets/mini_numerai_version_2_data.parquet")
dataf = umap_gen(dataf)

OMP: Info #271: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


<IPython.core.display.Javascript object>

The new features will be names with the convention `f"feature_umap_{i}"`.

In [29]:
umap_features = [f"feature_umap_{i}" for i in range(n_components)]
dataf[umap_features].head(3)

Unnamed: 0_level_0,feature_umap_0,feature_umap_1,feature_umap_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
n559bd06a8861222,1.0,0.467906,0.160752
n9d39dea58c9e3cf,0.689324,0.283587,0.0
nb64f06d3a9fc9f1,0.400365,1.0,0.388111


<IPython.core.display.Javascript object>

## 1.1. Numerai Classic

The Numerai Classic dataset has a certain structure that you may not encounter in the Numerai Signals tournament.
Therefore, this section has all preprocessors that can only be applied to Numerai Classic.

### 1.1.0 Numerai Classic: Version agnostic

Preprocessors that work for all Numerai Classic versions.

#### 1.1.0.1. BayesianGMMTargetProcessor

In [30]:
# export
class BayesianGMMTargetProcessor(BaseProcessor):
    """
    Generate synthetic (fake) target using a Bayesian Gaussian Mixture model. \n
    Based on Michael Oliver's GitHub Gist implementation: \n
    https://gist.github.com/the-moliver/dcdd2862dc2c78dda600f1b449071c93

    :param target_col: Column from which to create fake target. \n
    :param feature_names: Selection of features used for Bayesian GMM. All features by default.
    :param n_components: Number of components for fitting Bayesian Gaussian Mixture Model.
    """

    def __init__(
        self,
        target_col: str = "target",
        feature_names: list = None,
        n_components: int = 6,
    ):
        super().__init__()
        self.target_col = target_col
        self.feature_names = feature_names
        self.n_components = n_components
        self.ridge = Ridge(fit_intercept=False)
        self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1]

    @display_processor_info
    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        all_eras = dataf[dataf.meta.era_col].unique()
        coefs = self._get_coefs(dataf=dataf, all_eras=all_eras)
        bgmm = self._fit_bgmm(coefs=coefs)
        fake_target = self._generate_target(dataf=dataf, bgmm=bgmm, all_eras=all_eras)
        dataf[f"{self.target_col}_fake"] = fake_target
        return NumerFrame(dataf)

    def _get_coefs(self, dataf: NumerFrame, all_eras: list) -> np.ndarray:
        """
        Generate coefficients for BGMM.
        Data should already be scaled between 0 and 1
        (Already done with Numerai Classic data)
        """
        coefs = []
        for era in all_eras:
            features, target = self.__get_features_target(dataf=dataf, era=era)
            self.ridge.fit(features, target)
            coefs.append(self.ridge.coef_)
        stacked_coefs = np.vstack(coefs)
        return stacked_coefs

    def _fit_bgmm(self, coefs: np.ndarray) -> BayesianGaussianMixture:
        """
        Fit Bayesian Gaussian Mixture model on coefficients and normalize.
        """
        bgmm = BayesianGaussianMixture(n_components=self.n_components)
        bgmm.fit(coefs)
        # make probability of sampling each component equal to better balance rare regimes
        bgmm.weights_[:] = 1 / self.n_components
        return bgmm

    def _generate_target(
        self, dataf: NumerFrame, bgmm: BayesianGaussianMixture, all_eras: list
    ) -> np.ndarray:
        """Generate fake target using Bayesian Gaussian Mixture model."""
        fake_target = []
        for era in tqdm(all_eras, desc="Generating fake target"):
            features, _ = self.__get_features_target(dataf=dataf, era=era)
            # Sample a set of weights from GMM
            beta, _ = bgmm.sample(1)
            # Create fake continuous target
            fake_targ = features @ beta[0]
            # Bin fake target like real target
            fake_targ = (rankdata(fake_targ) - 0.5) / len(fake_targ)
            fake_targ = (np.digitize(fake_targ, self.bins) - 1) / 4
            fake_target.append(fake_targ)
        return np.concatenate(fake_target)

    def __get_features_target(self, dataf: NumerFrame, era) -> tuple:
        """Get features and target for one era and center data."""
        sub_df = dataf[dataf[dataf.meta.era_col] == era]
        features = self.feature_names if self.feature_names else sub_df.get_feature_data
        target = sub_df[self.target_col]
        features = features.values - 0.5
        target = target.values - 0.5
        return features, target

<IPython.core.display.Javascript object>

In [31]:
bgmm = BayesianGMMTargetProcessor()
sample_dataf = bgmm(sample_dataf)
sample_dataf[["target", "target_fake"]].head(3)



Generating fake target:   0%|          | 0/60 [00:00<?, ?it/s]

Unnamed: 0_level_0,target,target_fake
id,Unnamed: 1_level_1,Unnamed: 2_level_1
ned455e047de7e97,0.75,0.5
n6abb0e68edd1571,0.25,0.5
n7e76644dbadbd6b,0.5,0.5


<IPython.core.display.Javascript object>

In [32]:
# hide
downloader.remove_base_directory()

<IPython.core.display.Javascript object>

### 1.1.1. Numerai Classic: Version 1 specific

Preprocessors that only work for version 1 (legacy data).
When using version 1 preprocessor it is recommended that the input `NumerFrame` has `version` in its metadata.
This avoids using version 1 preprocessors on version 2 data and encountering confusing error messages.

As a new user we recommend to start modeling the version 2 data and avoid version 1.
The preprocessors below are only there for legacy and compatibility reasons.

#### 1.1.1.1. GroupStatsPreProcessor

The version 1 legacy data has 6 groups of features which allows us to calculate aggregate features.

In [33]:
# export
class GroupStatsPreProcessor(BaseProcessor):
    """
    WARNING: Only supported for Version 1 (legacy) data. \n
    Calculate group statistics for all data groups. \n
    | :param groups: Groups to create features for. All groups by default.
    """

    def __init__(self, groups: list = None):
        super().__init__()
        self.all_groups = [
            "intelligence",
            "wisdom",
            "charisma",
            "dexterity",
            "strength",
            "constitution",
        ]
        self.group_names = groups if groups else self.all_groups

    @display_processor_info
    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        """Check validity and add group features."""
        self._check_data_validity(dataf=dataf)
        dataf = dataf.pipe(self._add_group_features)
        return NumerFrame(dataf)

    def _add_group_features(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """Mean, standard deviation and skew for each group."""
        for group in self.group_names:
            cols = [col for col in dataf.columns if group in col]
            dataf[f"feature_{group}_mean"] = dataf[cols].mean(axis=1)
            dataf[f"feature_{group}_std"] = dataf[cols].std(axis=1)
            dataf[f"feature_{group}_skew"] = dataf[cols].skew(axis=1)
        return dataf

    def _check_data_validity(self, dataf: NumerFrame):
        """Make sure this is only used for version 1 data."""
        assert hasattr(
            dataf.meta, "version"
        ), f"Version should be specified for '{self.__class__.__name__}' This Preprocessor will only work on version 1 data."
        assert (
            getattr(dataf.meta, "version") == 1
        ), f"'{self.__class__.__name__}' only works on version 1 data. Got version: '{getattr(dataf.meta, 'version')}'."

<IPython.core.display.Javascript object>

In [34]:
dataf = create_numerframe(
    "test_assets/mini_numerai_version_1_data.csv", metadata={"version": 1}
)
group_features_dataf = GroupStatsPreProcessor().transform(dataf)
group_features_dataf.head(2)
assert group_features_dataf.meta.version == 1

<IPython.core.display.Javascript object>

In [35]:
# hide
new_cols = [
    "feature_intelligence_mean",
    "feature_intelligence_std",
    "feature_intelligence_skew",
    "feature_wisdom_mean",
    "feature_wisdom_std",
    "feature_wisdom_skew",
    "feature_charisma_mean",
    "feature_charisma_std",
    "feature_charisma_skew",
    "feature_dexterity_mean",
    "feature_dexterity_std",
    "feature_dexterity_skew",
    "feature_strength_mean",
    "feature_strength_std",
    "feature_strength_skew",
    "feature_constitution_mean",
    "feature_constitution_std",
    "feature_constitution_skew",
]
assert set(group_features_dataf.columns).intersection(new_cols)
group_features_dataf.get_feature_data[new_cols].head(2)

Unnamed: 0,feature_intelligence_mean,feature_intelligence_std,feature_intelligence_skew,feature_wisdom_mean,feature_wisdom_std,feature_wisdom_skew,feature_charisma_mean,feature_charisma_std,feature_charisma_skew,feature_dexterity_mean,feature_dexterity_std,feature_dexterity_skew,feature_strength_mean,feature_strength_std,feature_strength_skew,feature_constitution_mean,feature_constitution_std,feature_constitution_skew
0,0.333333,0.246183,0.558528,0.668478,0.236022,-0.115082,0.438953,0.25991,-0.004783,0.696429,0.200446,-0.60762,0.480263,0.292829,-0.372064,0.427632,0.27572,0.276155
1,0.208333,0.234359,0.382554,0.559783,0.358177,-0.062362,0.485465,0.252501,-0.021737,0.267857,0.249312,0.382267,0.407895,0.309866,0.220625,0.644737,0.33408,-0.794938


<IPython.core.display.Javascript object>

`GroupStatsPreProcessor` should break if `version != 1`.

In [36]:
# hide
def test_invalid_version(dataf: NumerFrame):
    copied_dataf = dataf.copy()
    copied_dataf.version = 2
    try:
        GroupStatsPreProcessor().transform(copied_dataf)
    except AssertionError:
        return True
    return False


test_invalid_version(dataf)

False

<IPython.core.display.Javascript object>

### 1.1.2. Numerai Classic: Version 2 specific

Preprocessors that are only compatible with version 2 data. If the preprocessor is agnostic to Numerai Classic version implement under heading 1.1.0.

In [37]:
# 1.1.2
# No version 2 specific Numerai Classic preprocessors implemented yet.

<IPython.core.display.Javascript object>

## 1.2. Numerai Signals

Preprocessors that are specific to Numerai Signals.

### 1.2.1. TA-Lib Features (TalibFeatureGenerator)

[TA-Lib](https://mrjbq7.github.io/ta-lib) is an optimized technical analysis library. It is based on Cython and includes 150+ indicators. We have selected features based on feature importances, SHAP and correlation with the Numerai Signals target. If you want to implement other features check out the [TA-Lib documentation](https://mrjbq7.github.io/ta-lib/index.html).

Installation of TA-Lib is a bit more involved than just a pip install and is an optional dependency for this library. Visit the [installation documentation](https://mrjbq7.github.io/ta-lib/install.html) for instructions.

In [38]:
# export
class TalibFeatureGenerator(BaseProcessor):
    """
    Generate relevant features available in TA-Lib. \n
    More info: https://mrjbq7.github.io/ta-lib \n
    Input DataFrames for these functions should have the following columns defined:
    ['open', 'high', 'low', 'close', 'volume'] \n
    Make sure that all values are sorted in chronological order (by ticker). \n
    :param windows: List of ranges for window features.
    Windows will be applied for all features specified in self.window_features. \n
    :param ticker_col: Which column to groupby for feature generation.
    """

    def __init__(self, windows: List[int], ticker_col: str = "bloomberg_ticker"):
        self.__check_talib_import()
        super().__init__()

        self.windows = windows
        self.ticker_col = ticker_col
        self.window_features = [
            "NATR",
            "ADXR",
            "AROONOSC",
            "DX",
            "MFI",
            "MINUS_DI",
            "MINUS_DM",
            "MOM",
            "ROCP",
            "ROCR100",
            "PLUS_DI",
            "PLUS_DM",
            "BETA",
            "RSI",
            "ULTOSC",
            "TRIX",
            "ADXR",
            "CCI",
            "CMO",
            "WILLR",
        ]
        self.no_window_features = ["AD", "OBV", "APO", "MACD", "PPO"]
        self.hlocv_cols = ["open", "high", "low", "close", "volume"]

    def get_no_window_features(self, dataf: pd.DataFrame):
        for func in tqdm(self.no_window_features, desc="No window features"):
            dataf.loc[:, f"feature_{func}"] = (
                dataf.groupby(self.ticker_col)
                .apply(lambda x: pd.Series(self._no_window(x, func)).bfill())
                .values.astype(np.float32)
            )
        return dataf

    def get_window_features(self, dataf: pd.DataFrame):
        for win in tqdm(self.windows, position=0, desc="Window features"):
            for func in tqdm(self.window_features, position=1):
                dataf.loc[:, f"feature_{func}_{win}"] = (
                    dataf.groupby(self.ticker_col)
                    .apply(lambda x: pd.Series(self._window(x, func, win)).bfill())
                    .values.astype(np.float32)
                )
        return dataf

    def get_all_features(self, dataf: pd.DataFrame) -> pd.DataFrame:
        dataf = self.get_no_window_features(dataf)
        dataf = self.get_window_features(dataf)
        return dataf

    def transform(self, dataf: pd.DataFrame, *args, **kwargs) -> NumerFrame:
        return NumerFrame(self.get_all_features(dataf=dataf))

    def _no_window(self, dataf: pd.DataFrame, func) -> pd.Series:
        from talib import abstract as tab

        inputs = self.__get_inputs(dataf)
        if func in ["MACD"]:
            # MACD outputs tuple of 3 elements (value, signal and hist)
            return tab.Function(func)(inputs["close"])[0]
        else:
            return tab.Function(func)(inputs)

    def _window(self, dataf: pd.DataFrame, func, window: int) -> pd.Series:
        from talib import abstract as tab

        inputs = self.__get_inputs(dataf)
        if func in ["ULTOSC"]:
            # ULTOSC requires 3 timeperiods as input
            return tab.Function(func)(
                inputs["high"],
                inputs["low"],
                inputs["close"],
                timeperiod1=window,
                timeperiod2=window * 2,
                timeperiod3=window * 4,
            )
        else:
            return tab.Function(func)(inputs, timeperiod=window)

    def __get_inputs(self, dataf: pd.DataFrame) -> dict:
        return {col: dataf[col].values.astype(np.float64) for col in self.hlocv_cols}

    @staticmethod
    def __check_talib_import():
        try:
            from talib import abstract as tab
        except ImportError:
            raise ImportError(
                "TA-Lib is not installed for this environment. If you are using this class make sure to have TA-Lib installed. check https://mrjbq7.github.io/ta-lib/install.html for instructions on installation."
            )

<IPython.core.display.Javascript object>

In [39]:
# hide
# Example usage
# dataf = pd.DataFrame() # Your Signals DataFrame here.
# tfg = TalibFeatureGenerator(windows=[10, 20, 40], ticker_col="bloomberg_ticker")
# ta_dataf = tfg.transform(dataf=dataf)
# ta_dataf.head(2)

<IPython.core.display.Javascript object>

### 1.2.2. KatsuFeatureGenerator

[Katsu1110](https://www.kaggle.com/code1110) provides an excellent and fast feature engineering scheme in his [Kaggle notebook on starting with Numerai Signals](https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners). It is surprisingly effective, fast and works well for modeling. This preprocessor is based on his feature engineering setup in that notebook.

Features generated:
1. MACD and MACD signal
2. RSI
3. Percentage rate of return
4. Volatility
5. MA (moving average) gap


In [40]:
# export
class KatsuFeatureGenerator(BaseProcessor):
    """
    Effective feature engineering setup based on Katsu's starter notebook.
    Based on source by Katsu1110: https://www.kaggle.com/code1110/numeraisignals-starter-for-beginners

    :param windows: Time interval to apply for window features: \n
    1. Percentage Rate of change \n
    2. Volatility \n
    3. Moving Average gap \n
    :param ticker_col: Columns with tickers to iterate over. \n
    :param close_col: Column name where you have closing price stored.
    """

    warnings.filterwarnings("ignore")

    def __init__(
        self,
        windows: list,
        ticker_col: str = "ticker",
        close_col: str = "close",
        num_cores: int = None,
    ):
        super().__init__()
        self.windows = windows
        self.ticker_col = ticker_col
        self.close_col = close_col
        self.num_cores = num_cores if num_cores else os.cpu_count()

    @display_processor_info
    def transform(self, dataf: Union[pd.DataFrame, NumerFrame]) -> NumerFrame:
        """Multiprocessing feature engineering."""
        tickers = dataf.loc[:, self.ticker_col].unique().tolist()
        rich_print(
            f"Feature engineering for {len(tickers)} tickers using {self.num_cores} CPU cores."
        )
        dataf_list = [
            x
            for _, x in tqdm(
                dataf.groupby(self.ticker_col), desc="Generating ticker DataFrames"
            )
        ]
        dataf = self._generate_features(dataf_list=dataf_list)
        return NumerFrame(dataf)

    def feature_engineering(self, dataf: pd.DataFrame) -> pd.DataFrame:
        """Feature engineering for single ticker."""
        close_series = dataf.loc[:, self.close_col]
        for x in self.windows:
            dataf.loc[
                :, f"feature_{self.close_col}_ROCP_{x}"
            ] = close_series.pct_change(x)

            dataf.loc[:, f"feature_{self.close_col}_VOL_{x}"] = (
                np.log1p(close_series).pct_change().rolling(x).std()
            )

            dataf.loc[:, f"feature_{self.close_col}_MA_gap_{x}"] = (
                close_series / close_series.rolling(x).mean()
            )

        dataf.loc[:, "feature_RSI"] = self._rsi(close_series)
        macd, macd_signal = self._macd(close_series)
        dataf.loc[:, "feature_MACD"] = macd
        dataf.loc[:, "feature_MACD_signal"] = macd_signal
        return dataf.bfill()

    def _generate_features(self, dataf_list: list) -> pd.DataFrame:
        """Add features for list of ticker DataFrames and concatenate."""
        with Pool(self.num_cores) as p:
            feature_datafs = list(
                tqdm(
                    p.imap(self.feature_engineering, dataf_list),
                    desc="Generating features",
                    total=len(dataf_list),
                )
            )
        return pd.concat(feature_datafs)

    @staticmethod
    def _rsi(close: pd.Series, period: int = 14) -> pd.Series:
        """
        See source https://github.com/peerchemist/finta
        and fix https://www.tradingview.com/wiki/Talk:Relative_Strength_Index_(RSI)
        """
        delta = close.diff()
        up, down = delta.copy(), delta.copy()
        up[up < 0] = 0
        down[down > 0] = 0

        gain = up.ewm(com=(period - 1), min_periods=period).mean()
        loss = down.abs().ewm(com=(period - 1), min_periods=period).mean()

        rs = gain / loss
        return pd.Series(100 - (100 / (1 + rs)))

    def _macd(
        self, close: pd.Series, span1=12, span2=26, span3=9
    ) -> Tuple[pd.Series, pd.Series]:
        """Compute MACD and MACD signal."""
        exp1 = self.__ema1(close, span1)
        exp2 = self.__ema1(close, span2)
        macd = 100 * (exp1 - exp2) / exp2
        signal = self.__ema1(macd, span3)
        return macd, signal

    @staticmethod
    def __ema1(series: pd.Series, span: int) -> pd.Series:
        """Exponential moving average"""
        a = 2 / (span + 1)
        return series.ewm(alpha=a).mean()

<IPython.core.display.Javascript object>

In [41]:
# other
from numerblox.download import KaggleDownloader

# Get price data from Kaggle
home_dir = "katsu_features_test/"
kd = KaggleDownloader(home_dir)
kd.download_training_data("code1110/yfinance-stock-price-data-for-numerai-signals")

<IPython.core.display.Javascript object>

In [42]:
# other
dataf = create_numerframe(f"{home_dir}/full_data.parquet")
dataf.loc[:, "friday_date"] = dataf["date"]
# Take 500 ticker sample for test
dataf = dataf[dataf["ticker"].isin(dataf["ticker"].unique()[:500])]

<IPython.core.display.Javascript object>

In [43]:
# other
kfpp = KatsuFeatureGenerator(windows=[20, 40, 60], num_cores=8)
new_dataf = kfpp.transform(dataf)

Generating ticker DataFrames:   0%|          | 0/500 [00:00<?, ?it/s]

2022-05-26 18:35:32,931 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:32,937 INFO numexpr.utils: NumExpr defaulting to 8 threads.


Generating features:   0%|          | 0/500 [00:00<?, ?it/s]

2022-05-26 18:35:32,937 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:32,942 INFO numexpr.utils: NumExpr defaulting to 8 threads.
2022-05-26 18:35:32,945 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:32,951 INFO numexpr.utils: NumExpr defaulting to 8 threads.
2022-05-26 18:35:32,953 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:32,960 INFO numexpr.utils: NumExpr defaulting to 8 threads.
2022-05-26 18:35:32,962 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:32,968 INFO numexpr.utils: NumExpr defaulting to 8 threads.
2022-05-26 18:35:32,973 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe

<IPython.core.display.Javascript object>

12 features are generated in this test (3*3 window features + 3 non window features).

In [44]:
# other
new_dataf.sort_values(["ticker", "date"]).get_feature_data.tail(2)

Unnamed: 0,feature_close_ROCP_20,feature_close_VOL_20,feature_close_MA_gap_20,feature_close_ROCP_40,feature_close_VOL_40,feature_close_MA_gap_40,feature_close_ROCP_60,feature_close_VOL_60,feature_close_MA_gap_60,feature_RSI,feature_MACD,feature_MACD_signal
1977264,0.077144,0.001958,1.033635,0.067426,0.001993,1.06379,-0.023033,0.002412,1.059153,60.233996,1.701916,1.214393
1977265,0.051667,0.001904,1.026899,0.070268,0.001989,1.057691,-0.038643,0.002404,1.055659,58.878408,1.692849,1.310084


<IPython.core.display.Javascript object>

### 1.2.3. EraQuantileProcessor

Numerai Signals' objective is predicting a ranking of equities. Therefore, we can benefit from creating rankings out of the features. Doing this reduces noise and works as a normalization mechanism for your features. `EraQuantileProcessor` bins features in a given number of quantiles for each era in the dataset.

In [45]:
# export
class EraQuantileProcessor(BaseProcessor):
    """
    Transform features into quantiles on a per-era basis

    :param num_quantiles: Number of buckets to split data into. \n
    :param era_col: Era column name in the dataframe to perform each transformation. \n
    :param features: All features that you want quantized. All feature cols by default. \n
    :param num_cores: CPU cores to allocate for quantile transforming. All available cores by default. \n
    :param random_state: Seed for QuantileTransformer.
    """

    def __init__(
        self,
        num_quantiles: int = 50,
        era_col: str = "friday_date",
        features: list = None,
        num_cores: int = None,
        random_state: int = 0,
    ):
        super().__init__()
        self.num_quantiles = num_quantiles
        self.era_col = era_col
        self.num_cores = num_cores if num_cores else os.cpu_count()
        self.features = features
        self.random_state = random_state

    def _process_eras(self, groupby_object):
        quantizer = QuantileTransformer(
            n_quantiles=self.num_quantiles, random_state=self.random_state
        )
        qt = lambda x: quantizer.fit_transform(x.values.reshape(-1, 1)).ravel()

        column = groupby_object.transform(qt)
        return column

    @display_processor_info
    def transform(
        self,
        dataf: Union[pd.DataFrame, NumerFrame],
    ) -> NumerFrame:
        """Multiprocessing quantile transforms by era."""
        self.features = self.features if self.features else dataf.feature_cols
        rich_print(
            f"Quantiling for {len(self.features)} features using {self.num_cores} CPU cores."
        )

        date_groups = dataf.groupby(self.era_col)
        groupby_objects = [date_groups[feature] for feature in self.features]

        with Pool() as p:
            results = list(
                tqdm(
                    p.imap(self._process_eras, groupby_objects),
                    total=len(groupby_objects),
                )
            )

        quantiles = pd.concat(results, axis=1)
        dataf[
            [f"{feature}_quantile{self.num_quantiles}" for feature in self.features]
        ] = quantiles
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [46]:
# other
new_dataf = new_dataf.sample(10000)
era_quantiler = EraQuantileProcessor(num_quantiles=50)
era_dataf = era_quantiler.transform(new_dataf)

  0%|          | 0/12 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

In [47]:
# other
era_dataf.get_feature_data.tail(2)

Unnamed: 0,feature_close_ROCP_20,feature_close_VOL_20,feature_close_MA_gap_20,feature_close_ROCP_40,feature_close_VOL_40,feature_close_MA_gap_40,feature_close_ROCP_60,feature_close_VOL_60,feature_close_MA_gap_60,feature_RSI,...,feature_close_MA_gap_20_quantile50,feature_close_ROCP_40_quantile50,feature_close_VOL_40_quantile50,feature_close_MA_gap_40_quantile50,feature_close_ROCP_60_quantile50,feature_close_VOL_60_quantile50,feature_close_MA_gap_60_quantile50,feature_RSI_quantile50,feature_MACD_quantile50,feature_MACD_signal_quantile50
940379,0.145235,0.003546,1.104706,0.109971,0.003225,1.116272,0.143504,0.002889,1.113181,74.316053,...,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
1742422,0.069231,0.00756,1.028868,0.125506,0.007451,1.055629,0.007246,0.008667,1.053163,59.981264,...,1.0,0.5,1.0,0.5,0.5,1.0,0.5,1.0,0.5,0.5


<IPython.core.display.Javascript object>

In [48]:
# other
# hide
kd.remove_base_directory()

<IPython.core.display.Javascript object>

### 1.2.4. TickerMapper

Numerai Signals data APIs may work with different ticker formats. Our goal with `TickerMapper` is to map `ticker_col` to `target_ticker_format`.

In [49]:
# export
class TickerMapper(BaseProcessor):
    """
    Map ticker from one format to another. \n
    :param ticker_col: Column used for mapping. Must already be present in the input data. \n
    :param target_ticker_format: Format to map tickers to. Must be present in the ticker map. \n
    For default mapper supported ticker formats are: ['ticker', 'bloomberg_ticker', 'yahoo'] \n
    :param mapper_path: Path to CSV file containing at least ticker_col and target_ticker_format columns. \n
    Can be either a web link of local path. Numerai Signals mapping by default.
    """

    def __init__(
        self, ticker_col: str = "ticker", target_ticker_format: str = "bloomberg_ticker",
        mapper_path: str = "https://numerai-signals-public-data.s3-us-west-2.amazonaws.com/signals_ticker_map_w_bbg.csv"
    ):
        super().__init__()
        self.ticker_col = ticker_col
        self.target_ticker_format = target_ticker_format

        self.signals_map_path = mapper_path
        self.ticker_map = pd.read_csv(self.signals_map_path)

        assert (
            self.ticker_col in self.ticker_map.columns
        ), f"Ticker column '{self.ticker_col}' is not available in ticker mapping."
        assert (
            self.target_ticker_format in self.ticker_map.columns
        ), f"Target ticker column '{self.target_ticker_format}' is not available in ticker mapping."

        self.mapping = dict(
            self.ticker_map[[self.ticker_col, self.target_ticker_format]].values
        )

    @display_processor_info
    def transform(
        self, dataf: Union[pd.DataFrame, NumerFrame], *args, **kwargs
    ) -> NumerFrame:
        dataf[self.target_ticker_format] = dataf[self.ticker_col].map(self.mapping)
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

Use default signals mapping to convert between Numerai ticker, Bloomberg ticker and Yahoo ticker formats.

In [50]:
test_dataf = pd.DataFrame(["AAPL", "MSFT"], columns=["ticker"])
mapper = TickerMapper()
mapper.transform(test_dataf)

Unnamed: 0,ticker,bloomberg_ticker
0,AAPL,AAPL US
1,MSFT,MSFT US


<IPython.core.display.Javascript object>

You can also use a CSV file for mapping. For example, the mapping Numerai user degerhan provides in [dsignals](https://github.com/degerhan/dsignals) for EOD data.

In [51]:
test_dataf = pd.DataFrame(["LLB SW", "DRAK NA", "SWB MK", "ELEKTRA* MF", "NOT_A_TICKER"], columns=["bloomberg_ticker"])
mapper = TickerMapper(ticker_col="bloomberg_ticker", target_ticker_format="signals_ticker",
                      mapper_path="test_assets/eodhd-map.csv")
mapper.transform(test_dataf)

Unnamed: 0,bloomberg_ticker,signals_ticker
0,LLB SW,LLB.SW
1,DRAK NA,DRAK.AS
2,SWB MK,5211.KLSE
3,ELEKTRA* MF,ELEKTRA.MX
4,NOT_A_TICKER,


<IPython.core.display.Javascript object>

### 1.2.5. SignalsTargetProcessor


Numerai provides [targets for 5000 stocks](https://docs.numer.ai/numerai-signals/signals-overview#universe) that are neutralized against all sorts of factors. However, it can be helpful to experiment with creating your own targets. You might want to explore different windows, different target binning and/or neutralization. `SignalsTargetProcessor` engineers 3 different targets for every given windows:
- `_raw`: Raw return based on price movements.
- `_rank`: Ranks of raw return.
- `_group`: Binned returns based on rank.

Note that Numerai provides targets based on 4-day returns and 20-day returns. While you can explore any window you like, it makes sense to start with `windows` close to these timeframes.

For the `bins` argument there are also many options possible. The followed are commonly used binning:
- Nomi bins: `[0, 0.05, 0.25, 0.75, 0.95, 1]`
- Uniform bins: `[0, 0.20, 0.40, 0.60, 0.80, 1]`

In [52]:
# export
class SignalsTargetProcessor(BaseProcessor):
    """
    Engineer targets for Numerai Signals. \n
    More information on implements Numerai Signals targets: \n
    https://forum.numer.ai/t/decoding-the-signals-target/2501

    :param price_col: Column from which target will be derived. \n
    :param windows: Timeframes to use for engineering targets. 10 and 20-day by default. \n
    :param bins: Binning used to create group targets. Nomi binning by default. \n
    :param labels: Scaling for binned target. Must be same length as resulting bins (bins-1). Numerai labels by default.
    """

    def __init__(
        self,
        price_col: str = "close",
        windows: list = None,
        bins: list = None,
        labels: list = None,
    ):
        super().__init__()
        self.price_col = price_col
        self.windows = windows if windows else [10, 20]
        self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1]
        self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1]

    @display_processor_info
    def transform(self, dataf: NumerFrame) -> NumerFrame:
        for window in tqdm(self.windows, desc="Signals target engineering windows"):
            dataf.loc[:, f"target_{window}d_raw"] = (
                dataf[self.price_col].pct_change(periods=window).shift(-window)
            )
            era_groups = dataf.groupby(dataf.meta.era_col)

            dataf.loc[:, f"target_{window}d_rank"] = era_groups[
                f"target_{window}d_raw"
            ].rank(pct=True, method="first")
            dataf.loc[:, f"target_{window}d_group"] = era_groups[
                f"target_{window}d_rank"
            ].transform(
                lambda group: pd.cut(
                    group, bins=self.bins, labels=self.labels, include_lowest=True
                )
            )
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [53]:
# other
stp = SignalsTargetProcessor()
era_dataf.meta.era_col = "date"
new_target_dataf = stp.transform(era_dataf)
new_target_dataf.get_target_data.head(2)

Signals target engineering windows:   0%|          | 0/2 [00:00<?, ?it/s]

2022-05-26 18:35:52,888 INFO numexpr.utils: Note: NumExpr detected 16 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
2022-05-26 18:35:52,889 INFO numexpr.utils: NumExpr defaulting to 8 threads.


Unnamed: 0,target_10d_raw,target_10d_rank,target_10d_group,target_20d_raw,target_20d_rank,target_20d_group
607733,-0.993852,0.5,0.5,-0.986229,0.5,0.5
719896,2160.214111,1.0,1.0,2419.86499,1.0,1.0


<IPython.core.display.Javascript object>

### 1.2.6. LagPreProcessor

Many models like Gradient Boosting Machines (GBMs) don't learn any time-series patterns by itself. However, if we create lags of our features the models will pick up on time dependencies between features. `LagPreProcessor` create lag features for given features and windows.

In [54]:
# export
class LagPreProcessor(BaseProcessor):
    """
    Add lag features based on given windows.

    :param windows: All lag windows to process for all features. \n
    [5, 10, 15, 20] by default (4 weeks lookback) \n
    :param ticker_col: Column name for grouping by tickers. \n
    :param feature_names: All features for which you want to create lags. All features by default.
    """

    def __init__(
        self,
        windows: list = None,
        ticker_col: str = "bloomberg_ticker",
        feature_names: list = None,
    ):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        self.ticker_col = ticker_col
        self.feature_names = feature_names

    @display_processor_info
    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        feature_names = self.feature_names if self.feature_names else dataf.feature_cols
        ticker_groups = dataf.groupby(self.ticker_col)
        for feature in tqdm(feature_names, desc="Lag feature generation"):
            feature_group = ticker_groups[feature]
            for day in self.windows:
                shifted = feature_group.shift(day, axis=0)
                dataf.loc[:, f"{feature}_lag{day}"] = shifted
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [55]:
# other
lpp = LagPreProcessor(ticker_col="ticker", feature_names=["close", "volume"])
dataf = lpp(dataf)

Lag feature generation:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

All lag features will contain `lag` in the column name.

In [56]:
# other
dataf.get_pattern_data("lag").tail(2)

Unnamed: 0,close_lag5,close_lag10,close_lag15,close_lag20,volume_lag5,volume_lag10,volume_lag15,volume_lag20
1977264,2224.0,2117.0,2152.0,2087.0,333400.0,305900.0,449200.0,206600.0
1977265,2232.0,2094.0,2180.0,2129.0,285400.0,502000.0,341100.0,294200.0


<IPython.core.display.Javascript object>

### 1.2.7. DifferencePreProcessor

After creating lags with the `LagPreProcessor`, it may be useful to create new features that calculate the difference between those lags. Through this process in `DifferencePreProcessor`, we can provide models with more time-series related patterns.

In [57]:
# export
class DifferencePreProcessor(BaseProcessor):
    """
    Add difference features based on given windows. Run LagPreProcessor first.

    :param windows: All lag windows to process for all features. \n
    :param feature_names: All features for which you want to create differences. All features that also have lags by default. \n
    :param pct_change: Method to calculate differences. If True, will calculate differences with a percentage change. Otherwise calculates a simple difference. Defaults to False \n
    :param abs_diff: Whether to also calculate the absolute value of all differences. Defaults to True \n
    """

    def __init__(
        self,
        windows: list = None,
        feature_names: list = None,
        pct_diff: bool = False,
        abs_diff: bool = False,
    ):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]
        self.feature_names = feature_names
        self.pct_diff = pct_diff
        self.abs_diff = abs_diff

    @display_processor_info
    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        feature_names = self.feature_names if self.feature_names else dataf.feature_cols
        for feature in tqdm(self.feature_names, desc="Difference feature generation"):
            lag_columns = dataf.get_pattern_data(f"{feature}_lag").columns
            if not lag_columns.empty:
                for day in self.windows:
                    differenced_values = (
                        (dataf[feature] / dataf[f"{feature}_lag{day}"]) - 1
                        if self.pct_diff
                        else dataf[feature] - dataf[f"{feature}_lag{day}"]
                    )
                    dataf[f"{feature}_diff{day}"] = differenced_values
                    if self.abs_diff:
                        dataf[f"{feature}_absdiff{day}"] = np.abs(
                            dataf[f"{feature}_diff{day}"]
                        )
            else:
                rich_print(
                    f":warning: WARNING: Skipping {feature}. Lag features for feature: {feature} were not detected. Have you already run LagPreProcessor? :warning:"
                )
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

In [58]:
# other
dpp = DifferencePreProcessor(
    feature_names=["close", "volume"], windows=[5, 10, 15, 20], pct_diff=True
)
dataf = dpp.transform(dataf)

Difference feature generation:   0%|          | 0/2 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

All difference features will contain `diff` in the column name.

In [59]:
# other
dataf.get_pattern_data("diff").tail(2)

Unnamed: 0,close_diff5,close_diff10,close_diff15,close_diff20,volume_diff5,volume_diff10,volume_diff15,volume_diff20
1977264,0.010791,0.06188,0.04461,0.077144,-0.105879,-0.025499,-0.336376,0.442885
1977265,0.003136,0.069245,0.027064,0.051667,-0.113875,-0.496215,-0.258575,-0.140381


<IPython.core.display.Javascript object>

## 2. Custom preprocessors

There are an almost unlimited number of ways to preprocess (selection, engineering and manipulation). We have only scratched the surface with the preprocessors currently implemented. We invite the Numerai community to develop Numerai Classic and Numerai Signals preprocessors.

A new Preprocessor should inherit from `BaseProcessor` and implement a `transform` method. For efficient implementation, we recommend you use `NumerFrame` functionality for preprocessing. You can also support Pandas DataFrame input as long as the `transform` method returns a `NumerFrame`. This ensures that the Preprocessor still works within a full `numerai-blocks` pipeline. A template for new preprocessors is given below.

To enable fancy logging output. Add the `@display_processor_info` decorator to the `transform` method.

In [60]:
# export
class AwesomePreProcessor(BaseProcessor):
    """ TEMPLATE - Do some awesome preprocessing. """
    def __init__(self):
        super().__init__()

    @display_processor_info
    def transform(self, dataf: NumerFrame, *args, **kwargs) -> NumerFrame:
        # Do processing
        ...
        # Parse all contents of NumerFrame to the next pipeline step
        return NumerFrame(dataf)

<IPython.core.display.Javascript object>

-------------------------------------------

In [61]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()

Converted 00_misc.ipynb.
Converted 01_download.ipynb.
Converted 02_numerframe.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04_model.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_modelpipeline.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staking.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>