In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
# default_exp pipeline

In [None]:
# hide

from kaggle_m5_nbdev.core import test_eq, test_err, configure_logging
# Logger shared by every pipeline call in this notebook.
# NOTE(review): the arguments look like (log directory, log name, console log
# level) - confirm against core.configure_logging.
log = configure_logging('./tmp', 'test_log', con_log_lvl='DEBUG')

In [None]:
#export

# Import through the package, exactly like the test-helper cell does for
# `configure_logging`: a bare `core` is only importable when the package
# directory itself is on sys.path, which is not the case for this notebook
# (the cell used to fail with "No module named 'core'").
from kaggle_m5_nbdev.core import read_series_sample, melt_sales_series, extract_day_ids, join_w_calendar, join_w_prices
from kaggle_m5_nbdev.core import to_parquet, get_submission_template_melt
import os

def prepare_data_on_disk(log, n_sample_series, processed_dir, raw_dir, force_data_prep):
    """Build the melted sales-series parquet file unless it is already on disk.

    Parameters
    ----------
    log : logger-like object. `.info` is called here and the object is also
        forwarded to `read_series_sample` and `to_parquet`.
    n_sample_series : forwarded unchanged to `read_series_sample`.
    processed_dir : directory that is checked for `sales_series_melt.parquet`
        and passed to `to_parquet` as the output location.
    raw_dir : forwarded to `join_w_calendar` and `join_w_prices`.
    force_data_prep : when truthy the data is prepared again even if the
        parquet file already exists.

    Returns
    -------
    None
    """
    parquet_name = 'sales_series_melt.parquet'
    expected_path = f'{processed_dir}/{parquet_name}'
    already_prepared = os.path.exists(expected_path)

    if already_prepared and not force_data_prep:
        log.info(f'Found parquet file ({expected_path})- skipping the prep')
        return

    # Report the real reason for running the prep: a forced rebuild of an
    # existing file must not be logged as "not found".
    if already_prepared:
        log.info(f'Parquet file ({expected_path}) exists but force_data_prep is set - preparing the data again')
    else:
        log.info(f'Not found parquet file ({expected_path}) - preparing the data')

    sales_series = read_series_sample(log, n_sample_series)
    sales_series = melt_sales_series(sales_series)
    sales_series = extract_day_ids(sales_series)
    sales_series = join_w_calendar(sales_series, raw_dir)
    # NOTE(review): .persist() suggests a dask collection - confirm in core.
    sales_series = join_w_prices(sales_series, raw_dir).persist()
    to_parquet(sales_series, parquet_name, processed_dir, log)

ModuleNotFoundError: No module named 'core'

In [None]:
# Exercise both branches: the forced call always runs the prep, and the second
# call should then find the parquet file in ./tmp and return early.
prepare_data_on_disk(log, n_sample_series=10, processed_dir='./tmp', raw_dir='raw', force_data_prep=True)
prepare_data_on_disk(log, n_sample_series=10, processed_dir='./tmp', raw_dir='raw', force_data_prep=False)

In [None]:
# TODO: move to core (don't forget to add import here)

from sklearn.preprocessing import LabelEncoder
import dask.dataframe as dd
def load_encoders(processed):
    """Load every `.npy` file found in `processed` into a `LabelEncoder`.

    The array stored in each file becomes the encoder's `classes_`. The
    returned dict is keyed by the file name with the `.npy` suffix removed.

    Parameters
    ----------
    processed : path of the directory to scan (top level only).

    Returns
    -------
    dict mapping file stem -> LabelEncoder, in sorted file-name order.
    """
    # Imported locally: no visible cell imports numpy (the original `np.load`
    # raised NameError) and `os` is only imported by a different cell.
    import os
    import numpy as np

    suffix = '.npy'

    def _load(fn):
        encoder = LabelEncoder()
        # NOTE(review): allow_pickle=True unpickles arbitrary objects, so this
        # is only safe while `processed` holds files the project wrote itself.
        encoder.classes_ = np.load(f'{processed}/{fn}', allow_pickle=True)
        return encoder

    # sorted() keeps the result independent of the filesystem's listing order.
    return {
        fn[:-len(suffix)]: _load(fn)
        for fn in sorted(os.listdir(processed))
        if fn.endswith(suffix)
    }

def encode(log, me, processed):
    """Convert every column of `me` to a numeric column, in place.

    `sell_price` is cast to float32. Every other column is passed through the
    encoder of the same name returned by `load_encoders(processed)`; values
    missing from that encoder's `classes_` are first overwritten with its
    first class and a warning is logged.

    Parameters
    ----------
    log : logger-like object; `.debug` and `.warning` are called.
    me : frame to encode. It is mutated and returned.
        NOTE(review): boolean-mask `.loc` assignment suggests a pandas
        DataFrame - confirm with the caller.
    processed : directory handed to `load_encoders`.

    Returns
    -------
    The same `me` object that was passed in.

    Raises
    ------
    KeyError when a non-continuous column has no encoder of the same name.
    """
    label_encoders = load_encoders(processed)
    float_columns = ['sell_price']

    for name in me.columns:
        dtype_name = str(me[name].dtype)

        if name in float_columns:
            log.debug(f"Encoding {name} ({dtype_name}) as float32 just in case for pytorch")
            me[name] = me[name].astype('float32')
        else:
            log.debug(f"Encoding {name} ({dtype_name}) as categorical ")

            encoder = label_encoders[name]
            known_classes = encoder.classes_

            # Rows whose value the encoder has never seen.
            unknown_mask = ~me[name].isin(known_classes)
            unknown_total = unknown_mask.sum()
            if unknown_total > 0:
                fallback = known_classes[0]
                examples = me[unknown_mask][name][:3].values
                log.warning(f"{unknown_total} entries for {name} can't be labeled. Defaulting to {fallback} e.g.\n {examples}")
                # Replace unseen values so that transform() below cannot fail on them.
                me.loc[unknown_mask, name] = fallback

            me[name] = encoder.transform(me[name])

    return me

In [None]:
# Locations used for the test-data prep.
# NOTE(review): no visible cell reads these two names - the calls further down
# pass the same literals directly.
raw = 'raw'
processed = './tmp'

In [None]:
def prepare_test_data_on_disk(log, raw, processed, force_data_prep):
    """Build the encoded test-set parquet file unless it is already on disk.

    Parameters
    ----------
    log : logger-like object. `.info` is called here and the object is also
        forwarded to `encode` and `to_parquet`.
    raw : forwarded to `get_submission_template_melt`.
    processed : directory that is checked for `test_series_melt.parquet`,
        handed to `encode`, and passed to `to_parquet` as the output location.
    force_data_prep : when truthy the data is prepared again even if the
        parquet file already exists.

    Returns
    -------
    None
    """
    parquet_name = 'test_series_melt.parquet'
    expected_path = f'{processed}/{parquet_name}'
    already_prepared = os.path.exists(expected_path)

    if already_prepared and not force_data_prep:
        log.info(f'Found parquet file ({expected_path})- skipping the prep')
        return

    # Log why the prep runs so that a rebuild is never silent.
    if already_prepared:
        log.info(f'Parquet file ({expected_path}) exists but force_data_prep is set - preparing the data again')
    else:
        log.info(f'Not found parquet file ({expected_path}) - preparing the data')

    template = get_submission_template_melt(raw)
    test_data = encode(log, template, processed)
    to_parquet(test_data, parquet_name, processed, log)

In [None]:
# Exercise both branches: the forced call always runs the prep, and the second
# call should then find the parquet file in ./tmp and return early.
prepare_test_data_on_disk(log, raw='raw', processed='./tmp', force_data_prep=True)
prepare_test_data_on_disk(log, raw='raw', processed='./tmp', force_data_prep=False)