In [None]:
import pandas as pd
%load_ext autoreload
%autoreload 2
%load_ext nb_black
%load_ext lab_black

<IPython.core.display.Javascript object>

In [None]:
# default_exp preprocessing

<IPython.core.display.Javascript object>

# Preprocessing

In [None]:
# hide
from nbdev.showdoc import *

<IPython.core.display.Javascript object>

In [None]:
#export
import uuid
import inspect
import numpy as np
import pandas as pd
from functools import wraps
from typeguard import typechecked
from abc import ABC, abstractmethod

from numerai_blocks.dataset import Dataset

<IPython.core.display.Javascript object>

## 0.1. BaseProcessor

In [None]:
#export
@typechecked
class BaseProcessor(ABC):
    """
    Abstract interface shared by all preprocessing steps.

    A concrete processor implements `transform`; calling the instance
    itself simply delegates to that method.
    """

    def __init__(self):
        """ Nothing to initialize at the base level. """

    @abstractmethod
    def transform(self, dataset: Dataset) -> Dataset:
        """
        Apply the processing step.
        :param dataset: Dataset to process.
        :return: The processed Dataset.
        """

    def __call__(self, dataset: Dataset) -> Dataset:
        """ Lets a processor instance be used like a plain function. """
        processed = self.transform(dataset=dataset)
        return processed

<IPython.core.display.Javascript object>

## 0.2. Decorators

In [None]:
def support_dataf_processing(func):
    """
    Adapt a Dataset processor so that it can be fed a Pandas DataFrame.
    :param func: Some function/method whose first argument is a Dataset
    and which returns a Dataset.
    :return: Function that takes a DataFrame (plus the same extra arguments)
    and returns the `dataf` attribute of the processed Dataset.
    """

    @wraps(func)
    def wrapper(dataf: pd.DataFrame, *args, **kwargs) -> pd.DataFrame:
        # Wrap on the way in, unwrap on the way out.
        as_dataset = Dataset(dataf)
        processed = func(as_dataset, *args, **kwargs)
        return processed.dataf

    return wrapper

<IPython.core.display.Javascript object>

In [None]:
def support_dataset_processing(func):
    """
    Make existing DataFrame transformer compatible with Dataset input.
    :param func: Some function/method that takes Pandas DataFrame as input
    and returns Pandas DataFrame.
    :return: Function that takes a Dataset (plus the same extra arguments)
    and returns a new Dataset built from the transformed DataFrame and the
    remaining attributes of the input Dataset. The input Dataset is left intact.
    """

    @wraps(func)
    def wrapper(dataset: Dataset, *args, **kwargs) -> Dataset:
        dataf_transform = func(dataset.dataf, *args, **kwargs)
        # Collect the metadata in a separate dict. Popping "dataf" from
        # `dataset.__dict__` itself would delete the attribute from the
        # caller's Dataset. The copy is shallow: attribute values are shared.
        metadata = {
            key: value for key, value in dataset.__dict__.items() if key != "dataf"
        }
        return Dataset(dataf_transform, metadata)

    return wrapper

<IPython.core.display.Javascript object>

### Decorator tests

In [None]:
# Random DataFrame (drawn from a seeded generator so every run sees the same values)
test_features = [f"feature_{letter}" for letter in "ABCDEFGHIK"]
id_col = [uuid.uuid4().hex for _ in range(100)]
df = pd.DataFrame(
    np.random.default_rng(seed=42).uniform(size=(100, len(test_features))),
    columns=test_features,
)

def pandas_quad(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """
    Simple DataFrame function which raises a given column to the power of 2.
    :param df: Input DataFrame. It is copied, not modified.
    :param col: Name of the column to square.
    :return: Copy of `df` in which `col` holds the squared values.
    """
    df_copy = df.copy()
    # Vectorized power instead of one Python-level call per element.
    df_copy.loc[:, col] = df_copy[col] ** 2
    return df_copy

class QuadraticProcessor(BaseProcessor):
    """ Simple Dataset Processor which raises a given column to the power of 2. """

    def __init__(self):
        super().__init__()

    def transform(self, dataset: Dataset, col="feature_A") -> Dataset:
        """
        Square one column of the Dataset's DataFrame.
        :param dataset: Dataset to process. The work is done on the result of
        `dataset.copy_dataset()` (presumably an independent copy; confirm in Dataset).
        :param col: Name of the column to square.
        :return: New Dataset built from the attributes of the processed copy.
        """
        dataset_copy = dataset.copy_dataset()
        # Square only the requested column. Squaring the whole frame and
        # assigning that DataFrame to a single column depends on pandas
        # aligning it by column name and fails on non-numeric columns.
        dataset_copy.dataf.loc[:, col] = self._quadratic(dataset_copy.dataf[col])
        return Dataset(**dataset_copy.__dict__)

    @staticmethod
    def _quadratic(values: pd.Series) -> pd.Series:
        """ Element-wise power of 2. """
        return values ** 2


@support_dataset_processing
def test_dataf_with_dataset_input(dataset: Dataset, col="feature_A") -> Dataset:
    """
    The DataFrame function `pandas_quad`, exposed as Dataset in / Dataset out.
    The annotations describe the decorated interface; inside this body the
    decorator has already replaced `dataset` by its `dataf` DataFrame.
    """
    squared = pandas_quad(dataset, col=col)
    return squared

@support_dataf_processing
def test_dataset_with_dataf_input(dataf: pd.DataFrame, col="feature_A") -> pd.DataFrame:
    """
    The Dataset processor `QuadraticProcessor`, exposed as DataFrame in / DataFrame out.
    The annotations describe the decorated interface; inside this body the
    decorator has already wrapped `dataf` in a Dataset.
    """
    processor = QuadraticProcessor()
    return processor.transform(dataset=dataf, col=col)

<IPython.core.display.Javascript object>

In [None]:
# Untransformed values of the column that both test functions square by default.
df.loc[:, "feature_A"].head(2)

0    0.625671
1    0.952235
Name: feature_A, dtype: float64

<IPython.core.display.Javascript object>

In [None]:
# Dataset in, Dataset out: the DataFrame function runs behind the decorator.
transformed_dataset = test_dataf_with_dataset_input(Dataset(df))
# Only the default column `feature_A` is squared; `feature_B` is shown as the untouched control.
transformed_dataset.get_column_selection("feature_B").head(2)

Unnamed: 0,feature_B
0,0.659536
1,0.296588


<IPython.core.display.Javascript object>

In [None]:
# DataFrame in, DataFrame out: the Dataset processor runs behind the decorator.
transformed_df = test_dataset_with_dataf_input(df)
# Same untouched control column as shown for the Dataset route.
transformed_df[["feature_B"]].head(2)

Unnamed: 0,feature_B
0,0.659536
1,0.296588


<IPython.core.display.Javascript object>

In [None]:
# `feature_B` is not the transformed column, so it must be identical on both routes.
assert transformed_dataset.get_column_selection('feature_B').equals(transformed_df.loc[:, ['feature_B']])
# The column that is actually transformed: both routes must agree with each other
# and with the squared input (compared with the default floating-point tolerance).
pd.testing.assert_series_equal(transformed_dataset.dataf["feature_A"], transformed_df["feature_A"])
pd.testing.assert_series_equal(transformed_df["feature_A"], df["feature_A"] ** 2)

<IPython.core.display.Javascript object>

-------------------------------------------

In [None]:
# hide
# Run this cell to sync all changes with library
from nbdev.export import notebook2script

# Called without arguments; the recorded output below lists every notebook of the project as converted.
notebook2script()

Converted 00_download.ipynb.
Converted 01_dataloaders.ipynb.
Converted 02_dataset.ipynb.
Converted 03_preprocessing.ipynb.
Converted 04a_model.ipynb.
Converted 04b_modelpipeline.ipynb.
Converted 05_postprocessing.ipynb.
Converted 06_prediction_dataset.ipynb.
Converted 07_evaluation.ipynb.
Converted 08_key.ipynb.
Converted 09_submission.ipynb.
Converted 10_staker.ipynb.
Converted index.ipynb.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>