In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp numerframe

In [None]:
#| include: false
from nbdev.showdoc import *

In [None]:
#| export
import uuid
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from typing import Union, Tuple, Any, List

from numerblox.misc import AttrDict

2022-11-11 12:15:07.038392: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-11 12:15:07.234871: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-11 12:15:08.008130: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2022-11-11 12:15:08.008257: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer

## Overview: The NumerFrame

`NumerFrame` is a data structure that extends `pd.DataFrame` with functionality convenient for Numerai users. The main benefits include:
1. Automatically track features, targets, prediction and other columns + easily retrieve these data slices.
2. Other library functionality automatically recognizes era column (`era`, `friday_date` or `date`).
3. Integrations with other library components (i.e. `preprocessing`, `model`, `modelpipeline`, `postprocessing`, `evaluation` and `submission`) to create more solid inference pipelines and increase reliability.

Besides, all functionality of Pandas DataFrames is still available in the `NumerFrame`. You therefore don't have to create new pipelines to process your data when using `NumerFrame`.

We adopt the convention:
 1. All feature column names should start with `'feature'`.
 2. All target column names should start with `'target'`.
 3. All prediction column names should start with `'prediction'`.
 4. Data should contain an `'era'`, `'friday_date'` or `'date'` column, as is almost always the case with Numerai datasets.

Every column for which these conditions do not hold will be classified as an `'aux'` column.

In [None]:
#| export
class NumerFrame(pd.DataFrame):
    """
    Data structure which extends Pandas DataFrames and
    allows for additional Numerai specific functionality.

    On initialization the columns are grouped by name prefix: \n
    - `feature_cols`: names starting with 'feature'. \n
    - `target_cols`: names starting with 'target'. \n
    - `prediction_cols`: names starting with 'prediction'. \n
    - `aux_cols`: every other column. \n
    The detected era column name ('era', 'friday_date' or 'date', in that
    order of priority) is stored in `meta.era_col` (None if absent).
    """
    # Custom attributes that pandas should carry over to derived objects.
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]


    def __init__(self, *args, **kwargs):
        """ Build the underlying DataFrame, then detect era column and column groups. """
        super().__init__(*args, **kwargs)
        self.meta = AttrDict()
        self.__set_era_col()
        self.__init_meta_attrs()

    @property
    def _constructor(self):
        # Makes pandas operations return NumerFrame instead of a plain DataFrame.
        return NumerFrame

    def __init_meta_attrs(self):
        """ Dynamically track column groups. """
        self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
        self.target_cols = [col for col in self.columns if str(col).startswith("target")]
        self.prediction_cols = [
            col for col in self.columns if str(col).startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        # Set lookup avoids scanning the (potentially long) list for every column.
        tracked = set(self.not_aux_cols)
        self.aux_cols = [col for col in self.columns if col not in tracked]

    def __set_era_col(self):
        """ Each NumerFrame should have an era column to benefit from all functionality. """
        # First match wins: 'era' takes priority over 'friday_date', then 'date'.
        self.meta.era_col = None
        for candidate in ("era", "friday_date", "date"):
            if candidate in self.columns:
                self.meta.era_col = candidate
                break

    def get_column_selection(self, cols: Union[str, list]):
        """
        Return NumerFrame from selection of columns.
        :param cols: A single column name or a list of column names.
        """
        return self.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self):
        """ All columns for which name starts with 'feature'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self):
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self):
        """ Column with name 'target' (Main Numerai target column). """
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self):
        """ All columns for which name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self):
        """ All columns that are not features, targets or predictions. """
        return self.get_column_selection(cols=self.aux_cols)

    @property
    def get_prediction_aux_data(self):
        """ All predictions columns and aux columns (for ensembling, etc.). """
        return self.get_column_selection(cols=self.prediction_cols+self.aux_cols)

    def get_pattern_data(self, pattern: str):
        """
        Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
        :param pattern: A 'like' pattern (pattern in column_name == True)
        """
        return self.filter(like=pattern)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[Any, Any]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any],
                      convert_to_tf = False,
                      aemlp_batch = False,
                      features: list = None,
                      targets: list = None,
                      *args, **kwargs) -> tuple:
        """
        Get feature target pair batch of 1 or multiple eras. \n
        :param eras: Selection of era names that should be present in era_col. \n
        :param convert_to_tf: Convert to tf.Tensor. \n
        :param aemlp_batch: Specific target batch for autoencoder training. \n
        `y` output will contain three components: features, targets and targets. \n
        :param features: List of features to select. All by default \n
        :param targets: List of targets to select. All by default. \n
        *args, **kwargs are passed to initialization of Tensor. \n
        :return: Tuple (X, y) of NumPy arrays, or tf.Tensors if convert_to_tf is set.
        `y` is a list of three arrays/tensors when aemlp_batch is set. \n
        Raises KeyError if no era column was detected and
        AssertionError if a requested era is not present in the era column.
        """
        era_col = self.meta.era_col
        if era_col is None:
            # Same exception type the bare column lookup would raise, with an actionable message.
            raise KeyError("No era column ('era', 'friday_date' or 'date') found in NumerFrame.")
        era_series = self[era_col]
        # Compute the distinct eras once instead of once per requested era.
        known_eras = era_series.unique()
        valid_eras = []
        for era in eras:
            assert era in known_eras, f"Era '{era}' not found in era column ({self.meta.era_col})"
            valid_eras.append(era)
        # Falsy selections (None or empty list) fall back to all tracked columns.
        features = features if features else self.feature_cols
        targets = targets if targets else self.target_cols
        # Filter the rows once and reuse the slice for both features and targets.
        era_rows = self.loc[era_series.isin(valid_eras)]
        X = era_rows[features].values
        y = era_rows[targets].values
        if aemlp_batch:
            # Independent copies so the three outputs do not share memory.
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

`create_numerframe` automatically recognizes your data file format, loads it into a `NumerFrame`, allows for column selection before loading and optionally adds metadata.

Supported file formats are `.csv`, `.parquet`, `.pkl`, `.pickle`, `.xls`, `.xlsx`, `.xlsm`, `.xlsb`, `.odf`, `.ods` and `.odt`. If the file format for your use case is missing, feel free to create a GitHub issue or submit a pull request. See `README.md` for more information on contributing.

In [None]:
#| export
def create_numerframe(file_path: str, metadata: dict = None, columns: list = None, *args, **kwargs) -> NumerFrame:
    """
    Convenient function to initialize NumerFrame.
    Support most used file formats for Pandas DataFrames \n
    (.csv, .parquet, .xls, .pkl, etc.).
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file. \n
    :param metadata: Metadata to be stored in NumerFrame.meta.
    Each key is set as an attribute on `.meta`, so keys must be strings. \n
    :param columns: Which columns to read (All by default). \n
    *args, **kwargs will be passed to Pandas loading function. \n
    :return: NumerFrame with the loaded data. \n
    Raises AssertionError if file_path is not an existing file and
    NotImplementedError if the file suffix is not supported.
    """
    path = Path(file_path)
    assert path.is_file(), f"{file_path} does not point to file."
    suffix = path.suffix
    # Match case-insensitively so e.g. 'DATA.CSV' is recognized as well.
    suffix_key = suffix.lower()
    if suffix_key == ".csv":
        df = pd.read_csv(file_path, *args, usecols=columns, **kwargs)
    elif suffix_key == ".parquet":
        df = pd.read_parquet(file_path, *args, columns=columns, **kwargs)
    elif suffix_key in (".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"):
        df = pd.read_excel(file_path, *args, usecols=columns, **kwargs)
    elif suffix_key in (".pkl", ".pickle"):
        # NOTE: unpickling can execute arbitrary code; only load pickle files from trusted sources.
        df = pd.read_pickle(file_path, *args, **kwargs)
        # read_pickle has no column filter, so select after loading.
        df = df.loc[:, columns] if columns else df
    else:
        raise NotImplementedError(f"Suffix '{suffix}' is not supported.")
    num_frame = NumerFrame(df)
    if metadata:
        # Attribute assignment mirrors how NumerFrame itself populates `.meta`.
        for key, value in metadata.items():
            setattr(num_frame.meta, key, value)
    return num_frame

## NumerFrame Usage

A `NumerFrame` object can be initialized from memory just like you would with a Pandas DataFrame.
Metadata is stored in the `meta` attribute. On initialization it holds the detected era column as `meta.era_col`, and you can add your own entries by assigning attributes on it (e.g. `dataf.meta.version = 1`).

### 1. Initialize from memory

In [None]:
# Ten synthetic feature names: feature_A ... feature_K (the letter J is skipped).
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
# One unique hexadecimal identifier per row.
id_col = [uuid.uuid4().hex for _ in range(100)]

# Random DataFrame: 100 rows of uniform features, normal targets and an integer 'date' era column.
feature_values = np.random.uniform(size=(100, 10))
dataf = pd.DataFrame(feature_values, columns=test_features)
dataf["id"] = id_col
target_names = ["target", "target_1", "target_2"]
dataf[target_names] = np.random.normal(size=(100, 3))
dataf["date"] = range(100)

In [None]:
# Wrap the in-memory DataFrame. It has no 'era' or 'friday_date' column,
# so 'date' is detected as the era column.
memory_dataf = NumerFrame(dataf)
assert memory_dataf.meta.era_col == "date"

In [None]:
memory_dataf.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K,id,target,target_1,target_2,date
0,0.625467,0.585194,0.305124,0.743754,0.751707,0.982139,0.575864,0.830288,0.404422,0.948049,c8815b7344124751b5717c901359a7d2,0.433754,1.545413,1.076133,0
1,0.537607,0.567881,0.105472,0.273122,0.164984,0.321191,0.127097,0.27918,0.050139,0.200424,2551b6e1d0b94668a25ad2a35c9bb30b,-0.5716,1.463836,-0.771368,1


In [None]:
memory_dataf.meta

{'era_col': 'date'}

### 2. Initialize from file path

You can also use the convenience function `create_numerframe` so `NumerFrame` can be easily initialized. Think of it as a dynamic `pd.read_csv`, `pd.read_parquet`, etc.

In [None]:
# Load the sample CSV into a NumerFrame and check that 'era' is detected as the era column.
sample_path = "test_assets/mini_numerai_version_1_data.csv"
num_dataf = create_numerframe(sample_path)
assert num_dataf.meta.era_col == "era"

### 3. Example functionality

In [None]:
num_dataf.meta

{'era_col': 'era'}

`.get_feature_data` will retrieve all columns where the column name starts with `feature`.

In [None]:
num_dataf.get_feature_data.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


`.get_target_data` retrieves all columns where the column name starts with `"target"`.

In [None]:
num_dataf.get_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


`.get_single_target_data` only retrieves the column `"target"`.

In [None]:
num_dataf.get_single_target_data.head(2)

Unnamed: 0,target
0,0.5
1,0.25


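`.get_pattern_data` allows you to get columns based on a certain pattern. In this example we ask for all 20-day targets, i.e. columns whose name contains `_20`. The mini version 1 dataset used here has no such columns, so the result below is empty.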

In [None]:
num_dataf.get_pattern_data("_20").head(2)

0
1


In [None]:
num_dataf.head()

Unnamed: 0,id,era,data_type,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,...,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46,target
0,n000315175b67977,era1,train,0.0,0.5,0.25,0.0,0.5,0.25,0.25,...,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75,0.5
1,n0014af834a96cdd,era1,train,0.0,0.0,0.0,0.25,0.5,0.0,0.0,...,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0,0.25
2,n001c93979ac41d4,era1,train,0.25,0.5,0.25,0.25,1.0,0.75,0.75,...,0.25,0.5,0.0,0.0,0.5,1.0,0.0,0.25,0.75,0.25
3,n0034e4143f22a13,era1,train,1.0,0.0,0.0,0.5,0.5,0.25,0.25,...,1.0,1.0,0.75,0.75,1.0,1.0,0.75,1.0,1.0,0.25
4,n00679d1a636062f,era1,train,0.25,0.25,0.25,0.25,0.0,0.25,0.5,...,0.75,0.75,0.25,0.5,0.75,0.0,0.5,0.25,0.75,0.75


`.get_era_batch` will return a `tf.Tensor` or `np.array` with feature data and target data for one or more eras. Convenient for creating neural network DataGenerators.

In [None]:
# Feature/target batch for 'era1' as tensors; `dtype` is forwarded to tf.convert_to_tensor.
X_era, y_era = num_dataf.get_era_batch(['era1'], convert_to_tf=True, dtype=tf.float16)
X_era

2022-11-11 12:15:11.932838: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-11 12:15:11.946229: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-11 12:15:11.947022: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-11 12:15:11.948523: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
       [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
       [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
       ...,
       [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
       [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
       [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>

For people training autoencoders + MLP you can get a target that contains 3 elements: features, targets and targets. Just define `aemlp_batch=True`.
More info on this setup: [AutoEncoder and multitask MLP on new dataset forum post](https://forum.numer.ai/t/autoencoder-and-multitask-mlp-on-new-dataset-from-kaggle-jane-street/4338).

In [None]:
# With aemlp_batch=True the `y` output is a list of three tensors: features, targets, targets.
_, y_era_aemlp = num_dataf.get_era_batch(['era1'], convert_to_tf=True, aemlp_batch=True, dtype=tf.float16)
y_era_aemlp

[<tf.Tensor: shape=(10, 310), dtype=float16, numpy=
 array([[0.  , 0.5 , 0.25, ..., 1.  , 0.5 , 0.75],
        [0.  , 0.  , 0.  , ..., 0.  , 0.25, 1.  ],
        [0.25, 0.5 , 0.25, ..., 0.  , 0.25, 0.75],
        ...,
        [0.25, 1.  , 1.  , ..., 0.75, 0.5 , 0.25],
        [0.5 , 0.5 , 0.5 , ..., 0.  , 0.  , 0.  ],
        [0.5 , 1.  , 1.  , ..., 1.  , 1.  , 0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>,
 <tf.Tensor: shape=(10, 1), dtype=float16, numpy=
 array([[0.5 ],
        [0.25],
        [0.25],
        [0.25],
        [0.75],
        [0.5 ],
        [0.25],
        [0.25],
        [0.5 ],
        [0.75]], dtype=float16)>]

`.aux_cols` denotes all columns that are not features, targets or prediction columns.

In [None]:
num_dataf.aux_cols

['id', 'era', 'data_type']

In [None]:
num_dataf.get_aux_data.head(2)

Unnamed: 0,id,era,data_type
0,n000315175b67977,era1,train
1,n0014af834a96cdd,era1,train


In [None]:
# Add two columns of random values; the 'prediction' name prefix marks them as prediction columns.
num_dataf['prediction_1'] = np.random.uniform(size=len(num_dataf))
num_dataf['prediction_2'] = np.random.uniform(size=len(num_dataf))

To track new columns like prediction columns, make sure to initialize a new `NumerFrame`. Prediction columns can easily be retrieved with `.get_prediction_data`, or with `.get_prediction_aux_data` if you also want columns like `era` and `data_type`. This can be handy for ensembling and submission use cases.

In [None]:
# Column groups are computed on initialization, so re-wrap to pick up the newly added columns.
num_dataf = NumerFrame(num_dataf)

In [None]:
num_dataf.get_prediction_data.head(2)

Unnamed: 0,prediction_1,prediction_2
0,0.328964,0.902012
1,0.266986,0.526574


In [None]:
num_dataf.get_prediction_aux_data.head(2)

Unnamed: 0,prediction_1,prediction_2,id,era,data_type
0,0.328964,0.902012,n000315175b67977,era1,train
1,0.266986,0.526574,n0014af834a96cdd,era1,train


In [None]:
num_dataf.meta

{'era_col': 'era'}

Because `NumerFrame` inherits from `pd.DataFrame` you still have all functionality of a normal DataFrame at your disposal, like copying.

In [None]:
# Standard DataFrame methods such as copy() and equals() work on a NumerFrame.
dataf2 = num_dataf.copy()
assert dataf2.equals(num_dataf)

`NumerFrame` dynamically tracks which feature, target, aux and prediction columns there are when initialized. For example, here we add a new prediction column. Upon initialization the column will be contained in `prediction_cols`. Prediction columns are all column names that start with `prediction`.

In [None]:
# Add a prediction column, then build a new NumerFrame so the column groups are recomputed.
num_dataf.loc[:, "prediction_test_1"] = np.random.uniform(size=len(num_dataf))
new_dataset = NumerFrame(num_dataf)
assert "prediction_test_1" in new_dataset.prediction_cols

Arbitrary columns can be retrieved with `.get_column_selection`. The input argument can be either a string or a list of column names.

In [None]:
# A single column name given as a string still yields a one-column frame.
selection1 = num_dataf.get_column_selection("era")
selection1.head(2)

Unnamed: 0,era
0,era1
1,era1


In [None]:
# A list of column names selects several columns at once.
selection2 = num_dataf.get_column_selection(["era", "prediction_test_1"])
selection2.head(2)

Unnamed: 0,era,prediction_test_1
0,era1,0.871487
1,era1,0.010738


In [None]:
#| include: false
# Column selections should stay NumerFrame objects rather than plain DataFrames.
for sel in [selection1, selection2]:
    assert isinstance(sel, NumerFrame)

For convenience we can get a feature, target pair with one method. If `multi_target=True` all columns where the column name starts with `target` will be retrieved.

In [None]:
# multi_target=False returns only the 'target' column as the second element of the pair.
features, single_target = num_dataf.get_feature_target_pair(multi_target=False)
features.head(2)

Unnamed: 0,feature_intelligence1,feature_intelligence2,feature_intelligence3,feature_intelligence4,feature_intelligence5,feature_intelligence6,feature_intelligence7,feature_intelligence8,feature_intelligence9,feature_intelligence10,...,feature_wisdom37,feature_wisdom38,feature_wisdom39,feature_wisdom40,feature_wisdom41,feature_wisdom42,feature_wisdom43,feature_wisdom44,feature_wisdom45,feature_wisdom46
0,0.0,0.5,0.25,0.0,0.5,0.25,0.25,0.25,0.75,0.75,...,1.0,1.0,1.0,0.75,0.5,0.75,0.5,1.0,0.5,0.75
1,0.0,0.0,0.0,0.25,0.5,0.0,0.0,0.25,0.5,0.5,...,0.75,1.0,1.0,0.0,0.0,0.75,0.25,0.0,0.25,1.0


In [None]:
single_target.head(2)

Unnamed: 0,target
0,0.5
1,0.25


-----------------------------------------------