In [None]:
# Pick up edits to imported modules live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): nb_black and lab_black presumably target the classic Notebook and
# JupyterLab front ends respectively — confirm that loading both is intended.
%load_ext nb_black
%load_ext lab_black

In [None]:
# default_exp dataset

# Dataset

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import uuid
import numpy as np
import pandas as pd
from copy import deepcopy
import json
from pathlib import Path
import datetime as dt
from functools import wraps
from rich import print as rich_print
from typeguard import typechecked

### Considerations

Goal: Create a dynamic Numerai dataset to which we can add metadata and Numerai-specific functionality while keeping the flexibility of pandas DataFrames.

__Options:__
__1.__ Add metadata to DataFrame through `df.attrs['META'] = "something"`
1.1. Downside: metadata stored in `attrs` is [not persisted when saving to parquet](https://stackoverflow.com/questions/14688306/adding-meta-information-metadata-to-pandas-dataframe).

__2.__ Subclass from DataFrame and add functionality.
2.1. Cumbersome for users who are accustomed to `pd.read_csv`, `pd.read_parquet`, etc.
2.2. More info: [StackOverflow](https://stackoverflow.com/questions/22155951/how-can-i-subclass-a-pandas-dataframe), [Pandas Docs](https://pandas.pydata.org/pandas-docs/stable/development/extending.html#extending-subclassing-pandas).

__3.__ Develop a dedicated `Dataset` class that holds the DataFrame as an attribute (`.dataf`)
3.1. Easy to add functionality and typecheck.
3.2. Does not work out of the box with sklearn Transformers, but can be easily made compatible with a single decorator.
3.3. Easy to export and import metadata.


In [None]:
@typechecked
class Dataset:
    """
    Wrapper that pairs a pandas DataFrame (`.dataf`) with arbitrary metadata attributes.

    :param dataf: DataFrame to wrap. It always ends up in `.dataf`,
        even if the supplied metadata contains a 'dataf' key.
    :param args: Metadata mappings (or iterables of key/value pairs) whose entries
        become attributes on the object. Later mappings win over earlier ones.
    :param kwargs: Additional metadata given as keyword arguments.

    Derived attributes (recomputed from the columns at construction time):
    `features` (labels containing "feature"), `targets` (labels containing "target"),
    `aux_columns` (everything else) and `all_columns` (the three lists concatenated).
    """

    # Placeholder written by `export_json_metadata` for values json cannot encode.
    _NOT_SERIALIZABLE = '<not serializable>'

    def __init__(self, dataf: pd.DataFrame, *args, **kwargs):
        # Metadata is applied first so that nothing in it can replace the DataFrame
        # or the derived column lists assigned below.
        for mapping in args:
            self.__dict__.update(mapping)
        self.__dict__.update(kwargs)
        self.dataf = dataf

        def _matches(label, keyword):
            # Labels that do not support `in` (ints, timestamps, ...) simply do not match.
            try:
                return keyword in label
            except TypeError:
                return False

        columns = list(self.dataf.columns)
        self.features = [col for col in columns if _matches(col, "feature")]
        self.targets = [col for col in columns if _matches(col, "target")]
        # Set lookup keeps this linear in the number of columns.
        assigned = set(self.features) | set(self.targets)
        self.aux_columns = [col for col in columns if col not in assigned]
        self.all_columns = self.features + self.targets + self.aux_columns

    def copy_dataset(self):
        """ Return a deep copy of the whole Dataset object (DataFrame and metadata). """
        return deepcopy(self)

    def copy_dataframe(self):
        """ Return a deep copy of the DataFrame part of the Dataset. """
        return self.dataf.copy(deep=True)

    def export_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Export all attributes in Dataset to a json file.

        Values json cannot encode (including the DataFrame itself) are written
        as the placeholder string '<not serializable>'.

        :param file: Path of the json file to write.
        :param verbose: Also print the generated json text.
        :param kwargs: Forwarded to `json.dumps`.
        """
        rich_print(f":file_folder: Exporting metadata to {file} :file_folder:")
        json_txt = json.dumps(self.__dict__, default=lambda o: self._NOT_SERIALIZABLE, **kwargs)
        if verbose:
            rich_print(json_txt)
        # Explicit encoding so the file does not depend on the platform default.
        Path(file).write_text(json_txt, encoding="utf-8")

    def import_json_metadata(self, file="config.json", verbose=False, **kwargs):
        """
        Load arbitrary data into Dataset object from json file.

        The 'dataf' key and any top-level value equal to the export placeholder
        are ignored, so existing attributes are never replaced by a placeholder.

        :param file: Path of the json file to read.
        :param verbose: Also print the parsed json data.
        :param kwargs: Forwarded to `json.load`.
        """
        rich_print(f":file_folder: Importing metadata from {file} :file_folder:")
        with open(file, encoding="utf-8") as json_file:
            json_data = json.load(json_file, **kwargs)
        if verbose:
            rich_print(json_data)
        # Make sure there is no overwrite on DataFrame
        json_data.pop('dataf', None)
        # Placeholders carry no information; importing them would clobber live values.
        restored = {
            key: value
            for key, value in json_data.items()
            if not (isinstance(value, str) and value == self._NOT_SERIALIZABLE)
        }
        self.__dict__.update(restored)

    @property
    def get_feature_data(self):
        """ DataFrame restricted to the columns listed in `features`. """
        return self.dataf.loc[:, [*self.features]]

    @property
    def get_target_data(self):
        """ DataFrame restricted to the columns listed in `targets`. """
        return self.dataf.loc[:, [*self.targets]]

    @property
    def get_aux_data(self):
        """ All columns that are neither features nor targets. """
        return self.dataf.loc[:, [*self.aux_columns]]

    def __repr__(self):
        return f"Dataset of shape {self.dataf.shape}. Columns: {self.all_columns}"

    def __str__(self):
        return self.__repr__()

### Tests

In [None]:
# Random DataFrame
SEED = 42  # fixed seed so the numeric columns are identical on every run
N_ROWS = 100
rng = np.random.default_rng(SEED)

test_features = [f"feature_{letter}" for letter in "ABCDEFGHIK"]
# uuid4 ids are not seeded; they only need to be unique.
id_col = [uuid.uuid4().hex for _ in range(N_ROWS)]

df = pd.DataFrame(rng.uniform(size=(N_ROWS, len(test_features))), columns=test_features)
df['id'] = id_col
df['target'] = rng.normal(size=N_ROWS)
df['era'] = range(N_ROWS)

In [None]:
# Arbitrary metadata handed to the Dataset alongside the DataFrame.
metadata = dict(
    version=2,
    additional_info="test_model",
    multi_target=False,
    tournament_type="classic",
)
dataset = Dataset(df, metadata)

In [None]:
# Preview the first two rows of the wrapped DataFrame.
dataset.dataf.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K,id,target,era
0,0.282598,0.545272,0.344197,0.127119,0.388487,0.546871,0.657288,0.400873,0.600365,0.191064,ac775f64c5884973ba1b24bdbb8be913,0.731874,0
1,0.004041,0.788963,0.052953,0.435246,0.420473,0.816777,0.310608,0.081211,0.881146,0.69792,345b7532f7f74f47aff6f96ff95ebf63,0.674846,1


In [None]:
# Preview the first two rows of the feature columns only.
dataset.get_feature_data.head(2)

Unnamed: 0,feature_A,feature_B,feature_C,feature_D,feature_E,feature_F,feature_G,feature_H,feature_I,feature_K
0,0.282598,0.545272,0.344197,0.127119,0.388487,0.546871,0.657288,0.400873,0.600365,0.191064
1,0.004041,0.788963,0.052953,0.435246,0.420473,0.816777,0.310608,0.081211,0.881146,0.69792


In [None]:
# Column names that were classified as neither feature nor target.
dataset.aux_columns

['id', 'era']

In [None]:
# Preview the first two rows of the auxiliary columns only.
dataset.get_aux_data.head(2)

Unnamed: 0,id,era
0,ac775f64c5884973ba1b24bdbb8be913,0
1,345b7532f7f74f47aff6f96ff95ebf63,1


In [None]:
# Metadata passed to the constructor is exposed as plain attributes.
assert dataset.version == 2
assert dataset.multi_target is False

In [None]:
# Write the Dataset attributes to a json file in the working directory.
dataset.export_json_metadata(file="config.json")

In [None]:
dataset.import_json_metadata("config.json")
# The round trip must restore the metadata and must not replace the DataFrame.
assert dataset.version == 2
assert isinstance(dataset.dataf, pd.DataFrame)

In [None]:
# Show the metadata attributes only; the full DataFrame under 'dataf' would flood the output.
{name: value for name, value in dataset.__dict__.items() if name != "dataf"}

{'dataf':     feature_A  feature_B  feature_C  feature_D  feature_E  feature_F  \
 0    0.282598   0.545272   0.344197   0.127119   0.388487   0.546871   
 1    0.004041   0.788963   0.052953   0.435246   0.420473   0.816777   
 2    0.441437   0.509976   0.500923   0.179379   0.521265   0.228809   
 3    0.647950   0.298224   0.147093   0.505925   0.016163   0.993806   
 4    0.653441   0.127955   0.046359   0.738709   0.413615   0.119128   
 ..        ...        ...        ...        ...        ...        ...   
 95   0.996016   0.661006   0.481067   0.197794   0.745087   0.028905   
 96   0.586763   0.332219   0.555999   0.388498   0.746459   0.483266   
 97   0.209631   0.518155   0.142643   0.538900   0.861116   0.442582   
 98   0.300793   0.703907   0.285225   0.223078   0.667507   0.104839   
 99   0.282371   0.527017   0.974048   0.259523   0.403430   0.760588   
 
     feature_G  feature_H  feature_I  feature_K  \
 0    0.657288   0.400873   0.600365   0.191064   
 1    0.310

In [None]:
dataf2 = dataset.copy_dataframe()
# A copy must hold equal data but be a distinct object from the wrapped DataFrame.
assert dataf2.equals(dataset.dataf)
assert dataf2 is not dataset.dataf

In [None]:
# String form of the Dataset: shape of the DataFrame plus its column names.
str(dataset)

"Dataset of shape (100, 13). Columns: ['feature_A', 'feature_B', 'feature_C', 'feature_D', 'feature_E', 'feature_F', 'feature_G', 'feature_H', 'feature_I', 'feature_K', 'target', 'id', 'era']"

-----------------------------------------------

In [None]:
#hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

notebook2script()