In [None]:
#| include: false
%load_ext autoreload
%autoreload 2

In [None]:
#| default_exp numerframe

In [None]:
#| include: false
from nbdev.showdoc import *

In [None]:
#| export
import uuid
import json
import numpy as np
import pandas as pd
import tensorflow as tf
from pathlib import Path
from rich import print as rich_print
from typing import Union, Tuple, Any, List

from numerblox.misc import AttrDict

2022-11-10 00:48:12.370607: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-10 00:48:12.664730: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-10 00:48:12.672194: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-10 00:48:12.672219: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not ha

## Overview: The NumerFrame

`NumerFrame` is a data structure that extends `pd.DataFrame` with functionality convenient for Numerai users. The main benefits include:
1. Automatically track features, targets, prediction and other columns + easily retrieve these data slices.
2. Add, export and import metadata. Furthermore, dynamically update or manipulate metadata within your Numerai data pipeline.
3. Other library functionality automatically recognizes era column (`era`, `friday_date` or `date`).
4. Integrations with other library components (i.e. `preprocessing`, `model`, `modelpipeline`, `postprocessing`, `evaluation` and `submission`) to create more solid inference pipelines and increase reliability.

Besides, all functionality of Pandas DataFrames is still available in the `NumerFrame`. You therefore don't have to create new pipelines to process your data when using `NumerFrame`.

We adopt the convention:
 1. All feature column names should start with `'feature'`.
 2. All target column names should start with `'target'`.
 3. All prediction column names should start with `'prediction'`.
 4. Data should contain an `'era'`, `'friday_date'` or `'date'` column, as is almost always the case with Numerai datasets.

Every column for which these conditions do not hold will be classified as an `'aux'` column.

In [None]:
#| export
class NumerFrame(pd.DataFrame):
    """
    Data structure which extends Pandas DataFrames and
    allows for additional Numerai specific functionality.

    On initialization the columns are grouped by name prefix:
    'feature*', 'target*' and 'prediction*'. Every other column is
    tracked as an 'aux' column. Arbitrary metadata lives in `.meta`.
    """
    # Custom attribute names registered with Pandas through `_metadata`.
    _metadata = ["meta", "feature_cols", "target_cols",
                 "prediction_cols", "not_aux_cols", "aux_cols"]
    # NOTE(review): `meta` is a class-level object that methods mutate in place,
    # so it looks like metadata (including `era_col`) is shared between
    # NumerFrame instances unless `meta` is reassigned on an instance.
    # Confirm this is intended before changing it; callers may rely on it.
    meta = AttrDict()

    def __init__(self, *args, **kwargs):
        """ Accepts the same arguments as `pd.DataFrame`. """
        super().__init__(*args, **kwargs)
        self.__init_meta_attrs()
        # Era column detection only runs while no verification flag is stored in meta.
        if "era_col_verified" not in self.meta:
            self.__set_era_col()

    @property
    def _constructor(self):
        # Makes Pandas operations that build a new frame return a NumerFrame.
        return NumerFrame

    def __init_meta_attrs(self):
        """ Dynamically track column groups. """
        self.feature_cols = [col for col in self.columns if str(col).startswith("feature")]
        self.target_cols = [col for col in self.columns if str(col).startswith("target")]
        self.prediction_cols = [
            col for col in self.columns if str(col).startswith("prediction")
        ]
        self.not_aux_cols = self.feature_cols + self.target_cols + self.prediction_cols
        self.aux_cols = [
            col for col in self.columns if col not in self.not_aux_cols
        ]

    def __set_era_col(self):
        """
        Each NumerFrame should have an era column to benefit from all functionality.
        Candidates are checked in order: 'era', 'friday_date', 'date'.
        :raises AttributeError: If none of the candidate columns is present.
        """
        if "era" in self.columns:
            self.meta.era_col = "era"
        elif "friday_date" in self.columns:
            self.meta.era_col = "friday_date"
        elif "date" in self.columns:
            self.meta.era_col = "date"
        else:
            raise AttributeError("NumerFrame must contain either an 'era', 'friday_date' or 'date' column.")
        self.meta.era_col_verified = True

    def add_metadata(self, *args, **kwargs):
        """
        Parse arbitrary metadata (i.e. Python objects) to the meta attribute.
        *args, **kwargs are passed to the `update` method of `meta`.
        """
        self.meta.update(*args, **kwargs)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Export all attributes in NumerFrame that can be serialized to json file. \n
        :param file: Path of the json file to write. \n
        :param verbose: Also print the resulting json text. \n
        **kwargs are passed to json.dumps.
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        # Values json cannot encode are replaced by a placeholder string
        # instead of raising.
        json_txt = json.dumps(
            self.meta.__dict__, default=lambda o: "<not serializable>", **kwargs
        )
        if verbose:
            rich_print(json_txt)
        Path(file).write_text(json_txt)

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Load arbitrary data into NumerFrame object from json file. \n
        :param file: Path of the json file to read. \n
        :param verbose: Also print the loaded data. \n
        **kwargs are passed to json.load.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file) as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        self.meta.__dict__.update(json_data)

    def get_column_selection(self, cols: Union[str, list]):
        """
        Return NumerFrame from selection of columns.
        :param cols: A single column name or a list of column names.
        """
        # A single name is wrapped in a list so the result is always a frame.
        return self.loc[:, cols if isinstance(cols, list) else [cols]]

    @property
    def get_feature_data(self):
        """ All columns for which name starts with 'feature'."""
        return self.get_column_selection(cols=self.feature_cols)

    @property
    def get_target_data(self):
        """ All columns for which name starts with 'target'."""
        return self.get_column_selection(cols=self.target_cols)

    @property
    def get_single_target_data(self):
        """ Column with name 'target' (Main Numerai target column). """
        return self.get_column_selection(cols=['target'])

    @property
    def get_prediction_data(self):
        """ All columns for which name starts with 'prediction'."""
        return self.get_column_selection(cols=self.prediction_cols)

    @property
    def get_aux_data(self):
        """ All columns that are not features, targets or predictions. """
        return self.get_column_selection(cols=self.aux_cols)

    @property
    def get_prediction_aux_data(self):
        """ All predictions columns and aux columns (for ensembling, etc.). """
        return self.get_column_selection(cols=self.prediction_cols+self.aux_cols)

    def get_pattern_data(self, pattern: str):
        """
        Get columns based on pattern (for example '_20' to get all 20-day Numerai targets).
        :param pattern: A 'like' pattern (pattern in column_name == True)
        """
        return self.filter(like=pattern)

    def get_feature_target_pair(self, multi_target=False) -> Tuple[Any, Any]:
        """
        Get split of feature and target columns.
        :param multi_target: Returns only 'target' column by default.
        Returns all target columns when set to True.
        """
        X = self.get_feature_data
        y = self.get_target_data if multi_target else self.get_single_target_data
        return X, y

    def get_era_batch(self, eras: List[Any],
                      convert_to_tf = False,
                      aemlp_batch = False,
                      features: list = None,
                      targets: list = None,
                      *args, **kwargs) -> tuple:
        """
        Get feature target pair batch of 1 or multiple eras. \n
        :param eras: Selection of era names that should be present in era_col. \n
        :param convert_to_tf: Convert to tf.Tensor. \n
        :param aemlp_batch: Specific target batch for autoencoder training. \n
        `y` output will contain three components: features, targets and targets. \n
        :param features: List of features to select. All by default \n
        :param targets: List of targets to select. All by default. \n
        *args, **kwargs are passed to initialization of Tensor. \n
        :return: Tuple (X, y) of feature values and target values for the
        rows belonging to the requested eras. \n
        :raises AssertionError: If a requested era does not occur in the era column.
        """
        era_col = self.meta.era_col
        era_values = self[era_col]
        # The set of eras present does not change per requested era,
        # so it is computed once up front.
        known_eras = era_values.unique()
        valid_eras = []
        for era in eras:
            assert era in known_eras, f"Era '{era}' not found in era column ({era_col})"
            valid_eras.append(era)
        features = features if features else self.feature_cols
        targets = targets if targets else self.target_cols
        # Rows are selected once; X and y are both taken from that selection.
        batch = self.loc[era_values.isin(valid_eras)]
        X = batch[features].values
        y = batch[targets].values
        if aemlp_batch:
            # Autoencoder + MLP setup: reconstruct features, predict targets twice.
            y = [X.copy(), y.copy(), y.copy()]

        if convert_to_tf:
            X = tf.convert_to_tensor(X, *args, **kwargs)
            if aemlp_batch:
                y = [tf.convert_to_tensor(i, *args, **kwargs) for i in y]
            else:
                y = tf.convert_to_tensor(y, *args, **kwargs)
        return X, y

`create_numerframe` automatically recognizes your data file format, loads it into a `NumerFrame`, allows for column selection before loading and optionally adds metadata.

Supported file formats are `.csv`, `.parquet`, `.pkl`, `.pickle`, `.xls`, `.xlsx`, `.xlsm`, `.xlsb`, `.odf`, `.ods` and `.odt`. If the file format for your use case is missing, feel free to create a Github issue or submit a pull request. See `README.md` for more information on contributing.

In [None]:
#| export
def create_numerframe(file_path: str, metadata: dict = None, columns: list = None, *args, **kwargs) -> NumerFrame:
    """
    Convenient function to initialize NumerFrame.
    Support most used file formats for Pandas DataFrames \n
    (.csv, .parquet, .xls, .pkl, etc.).
    For more details check https://pandas.pydata.org/docs/reference/io.html

    :param file_path: Relative or absolute path to data file. \n
    :param metadata: Metadata to be stored in NumerFrame.meta. \n
    :param columns: Which columns to read (All by default). \n
    *args, **kwargs will be passed to Pandas loading function. \n
    :return: NumerFrame holding the loaded data. \n
    :raises AssertionError: If file_path does not point to an existing file. \n
    :raises NotImplementedError: If the file suffix is not supported.
    """
    path = Path(file_path)
    assert path.is_file(), f"{file_path} does not point to file."
    suffix = path.suffix
    # Match case-insensitively so that e.g. 'data.CSV' is recognized too.
    file_type = suffix.lower()
    if file_type in [".csv"]:
        df = pd.read_csv(file_path, usecols=columns, *args, **kwargs)
    elif file_type in [".parquet"]:
        df = pd.read_parquet(file_path, columns=columns, *args, **kwargs)
    elif file_type in [".xls", ".xlsx", ".xlsm", ".xlsb", ".odf", ".ods", ".odt"]:
        df = pd.read_excel(file_path, usecols=columns, *args, **kwargs)
    elif file_type in ['.pkl', '.pickle']:
        # NOTE: unpickling can execute arbitrary code; only load pickle
        # files that come from a trusted source.
        df = pd.read_pickle(file_path, *args, **kwargs)
        # read_pickle is not given a column filter here, so select after loading.
        df = df.loc[:, columns] if columns else df
    else:
        raise NotImplementedError(f"Suffix '{suffix}' is not supported.")
    num_frame = NumerFrame(df)
    if metadata:
        num_frame.add_metadata(metadata)
    return num_frame

## NumerFrame Usage

A `NumerFrame` object can be initialized from memory just like you would with a Pandas DataFrame.
You then have the option to add metadata with `.add_metadata`. All metadata will be stored in the `meta` attribute.

### 1. Initialize from memory

In [None]:
# Ten synthetic feature column names, one per letter.
test_features = ["feature_" + letter for letter in "ABCDEFGHIK"]
# One random hex identifier per row.
id_col = [uuid.uuid4().hex for _ in range(100)]

# Random DataFrame
feature_values = np.random.uniform(size=(100, 10))
dataf = pd.DataFrame(feature_values, columns=test_features)
dataf["id"] = id_col
# Three target columns drawn from a normal distribution.
target_values = np.random.normal(size=(100, 3))
dataf[["target", "target_1", "target_2"]] = target_values
# An increasing 'date' column serves as the era column.
dataf["date"] = range(100)

In [None]:
# Example metadata to attach to the in-memory NumerFrame.
metadata = {
    "version": 42,
    "additional_info": "test_model",
    "multi_target": False,
    "tournament_type": "random",
}
# Wrap the plain DataFrame and store the metadata in `.meta`.
memory_dataf = NumerFrame(dataf)
memory_dataf.add_metadata(metadata)
# Metadata entries are readable as attributes on `.meta`.
assert memory_dataf.meta.version == 42
assert memory_dataf.meta.tournament_type == "random"

Metadata is stored in `.meta` and can be accessed as a dictionary or as attributes.

In [None]:
memory_dataf.meta

{'era_col': 'date',
 'era_col_verified': True,
 'version': 42,
 'additional_info': 'test_model',
 'multi_target': False,
 'tournament_type': 'random'}

In [None]:
memory_dataf.meta.version

42

In [None]:
memory_dataf.meta['version']

42

In [None]:
assert memory_dataf.meta.version == memory_dataf.meta['version']

### 2. Initialize from file path

You can also use the convenience function `create_numerframe` so `NumerFrame` can be easily initialized. Think of it as a dynamic `pd.read_csv`, `pd.read_parquet`, etc. where you can also directly pass metadata.

In [None]:
# Metadata passed along while loading from file.
# NOTE(review): `era_col` is set explicitly here; presumably because `meta`
# still holds the 'date' era column from the in-memory example above and
# era detection is skipped once `era_col_verified` is set. Confirm.
metadata = {
    "version": 2,
    "multi_target": False,
    "tournament_type": "classic",
    "era_col": "era"
}

# The loader is picked from the file suffix (.parquet here).
num_dataf = create_numerframe("test_assets/mini_numerai_version_2_data.parquet",
                          metadata=metadata
                          )
assert num_dataf.meta.version == 2
assert num_dataf.meta.era_col == "era"
assert not num_dataf.meta.multi_target
num_dataf.head(2)

### 3. Example functionality

In [None]:
num_dataf.meta

`.get_feature_data` will retrieve all columns where the column name starts with `feature`.

In [None]:
num_dataf.get_feature_data.head(2)

`.get_target_data` retrieves all columns where the column name starts with `"target"`.

In [None]:
num_dataf.get_target_data.head(2)

`.get_single_target_data` only retrieves the column `"target"`.

In [None]:
num_dataf.get_single_target_data.head(2)

`.get_pattern_data` allows you to get columns based on a certain pattern. In this example we retrieve all 20-day targets.

In [None]:
num_dataf.get_pattern_data("_20").head(2)

`.get_era_batch` will return a `tf.Tensor` or `np.array` with feature data and target data for one or more eras. Convenient for creating neural network DataGenerators.

In [None]:
X_era, y_era = num_dataf.get_era_batch(['0003'], convert_to_tf=True, dtype=tf.float16)
X_era

For people training autoencoders + MLP you can get a target that contains 3 elements: features, targets and targets. Just define `aemlp_batch=True`.
More info on this setup: [AutoEncoder and multitask MLP on new dataset forum post](https://forum.numer.ai/t/autoencoder-and-multitask-mlp-on-new-dataset-from-kaggle-jane-street/4338).

In [None]:
_, y_era_aemlp = num_dataf.get_era_batch(['0003'], convert_to_tf=True, aemlp_batch=True, dtype=tf.float16)
y_era_aemlp

`.aux_cols` denotes all columns that are not features, targets or prediction columns.

In [None]:
num_dataf.aux_cols

In [None]:
num_dataf.get_aux_data.head(2)

In [None]:
num_dataf['prediction_1'] = np.random.uniform(size=len(num_dataf))
num_dataf['prediction_2'] = np.random.uniform(size=len(num_dataf))

To track new columns like prediction columns, make sure to initialize a new `NumerFrame`. Prediction columns can easily be retrieved with `.get_prediction_data`, or with `.get_prediction_aux_data` if you also want columns like `era` and `data_type`. This can be handy for ensembling and submission use cases.

In [None]:
num_dataf = NumerFrame(num_dataf)

In [None]:
num_dataf.get_prediction_data.head(2)

In [None]:
num_dataf.get_prediction_aux_data.head(2)

Arbitrary `.json` metadata can be stored into the `NumerFrame`. All metadata can also be exported to a `.json` file.

In [None]:
num_dataf.export_json_metadata("config.json")

In [None]:
num_dataf.import_json_metadata("config.json")

In [None]:
assert num_dataf.meta.version == 2
assert not num_dataf.meta.multi_target

Because `NumerFrame` inherits from `pd.DataFrame` you still have all functionality of a normal DataFrame at your disposal, like copying.

In [None]:
dataf2 = num_dataf.copy()
assert dataf2.equals(num_dataf)

`NumerFrame` dynamically tracks which feature, target, aux and prediction columns there are when initialized. For example, here we add a new prediction column. Upon initialization the column will be contained in `prediction_cols`. Prediction columns are all column names that start with `prediction`.

In [None]:
# Add a new prediction column to the existing frame.
num_dataf.loc[:, "prediction_test_1"] = np.random.uniform(size=len(num_dataf))
# Column groups are computed on initialization, so re-wrap the frame
# to have the new column picked up in `prediction_cols`.
new_dataset = NumerFrame(num_dataf)
assert "prediction_test_1" in new_dataset.prediction_cols
# Previously stored metadata is still available on the new frame.
assert new_dataset.meta.version == 2

Arbitrary columns can be retrieved with `.get_column_selection`. The input argument can be either a string or a list with column names.

In [None]:
selection1 = num_dataf.get_column_selection("era")
selection1.head(2)

In [None]:
selection2 = num_dataf.get_column_selection(["era", "prediction_test_1"])
selection2.head(2)

In [None]:
#| include: false
for sel in [selection1, selection2]:
    assert isinstance(sel, NumerFrame)

For convenience we can get a feature, target pair with one method. If `multi_target=True` all columns where the column name starts with `target` will be retrieved.

In [None]:
features, single_target = num_dataf.get_feature_target_pair(multi_target=False)
features.head(2)

In [None]:
single_target.head(2)

-----------------------------------------------