In [121]:
import pandas as pd
import numpy as np

# demand-forecasting-kernels-only

In [122]:
# Load the training split. No `parse_dates` is passed, so `date` stays a plain string column.
train = pd.read_csv(r"data/demand-forecasting-kernels-only/train.csv")
train

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
...,...,...,...,...
912995,2017-12-27,10,50,63
912996,2017-12-28,10,50,59
912997,2017-12-29,10,50,74
912998,2017-12-30,10,50,62


In [123]:
# Date range covered by the training split. `date` holds ISO-formatted strings here,
# so this string min/max coincides with the chronological one.
train.date.min(),train.date.max()

('2013-01-01', '2017-12-31')

In [124]:
# Load the test split: same key columns as train plus an `id` column, and no `sales`.
test = pd.read_csv(r"data/demand-forecasting-kernels-only/test.csv")
test

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1
...,...,...,...,...
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50


In [125]:
# First and last date present in the test split (computed on the distinct date strings).
test_dates = test.date.unique()
test_min_date, test_max_date = test_dates.min(), test_dates.max()
test_min_date, test_max_date

('2018-01-01', '2018-03-31')

In [559]:
# Stack train on top of test with a fresh 0..n-1 index. Columns missing from one
# split (`id` for train rows, `sales` for test rows) come out as NaN.
df = pd.concat([train, test], axis=0, ignore_index=True)
# Untouched snapshot, so later cells can restart from the raw combined frame.
df0 = df.copy(deep=True)
df

Unnamed: 0,date,store,item,sales,id
0,2013-01-01,1,1,13.0,
1,2013-01-02,1,1,11.0,
2,2013-01-03,1,1,14.0,
3,2013-01-04,1,1,13.0,
4,2013-01-05,1,1,10.0,
...,...,...,...,...,...
957995,2018-03-27,10,50,,44995.0
957996,2018-03-28,10,50,,44996.0
957997,2018-03-29,10,50,,44997.0
957998,2018-03-30,10,50,,44998.0


# Utility functions and user configuration

In [595]:
from itertools import product
from functools import reduce # Valid in Python 2.6+, required in Python 3
import operator
import sys

def print_func_name():
    """Print the name of the function that called this helper, followed by ``()``.

    Used as a lightweight execution trace: each pipeline step calls it on entry.
    Returns whatever ``print`` returns (i.e. ``None``).
    """
    # Frame 0 is this helper itself; frame 1 is its direct caller.
    caller_frame = sys._getframe(1)
    caller_name = caller_frame.f_code.co_name
    return print(f'{caller_name}()')

def get_features_time_dependence(the_df):
    """Split the original features into time-fixed and time-variant ones.

    A feature is time-fixed when, inside every ``index_cols`` group, it takes
    exactly one distinct value (NaN is counted as a value). Reads the
    module-level ``index_cols`` and ``features_orig``.

    Returns:
        (time_fixed_features, time_variant_features): two pandas Index objects
        holding the corresponding feature names.
    """
    print_func_name()
    distinct_per_group = the_df.groupby(index_cols)[features_orig].nunique(dropna=False)
    # One boolean per feature: True only if *every* group has a single value.
    is_fixed = distinct_per_group.eq(1).all()
    fixed_names = is_fixed[is_fixed].index
    variant_names = is_fixed[~is_fixed].index
    return fixed_names, variant_names

def move_cols_to_first(the_df, first_cols: list):
    """Return a frame whose columns start with ``first_cols``.

    ``first_cols`` appear in the order given; all remaining columns follow in
    their original order.
    """
    print_func_name()
    is_leading = the_df.columns.isin(first_cols)
    trailing_cols = the_df.columns[~is_leading]
    leading_part = the_df[first_cols]
    trailing_part = the_df[trailing_cols]
    return pd.concat([leading_part, trailing_part], axis=1)

def add_indexes_col(the_df, indicator=False):
    """Add a single string key column that combines all ``index_cols``.

    The new column is named ``"_".join(index_cols)`` and holds the index values
    joined with ``"_"`` (e.g. store 1 / item 7 -> ``"1_7"``). The key column and
    the index columns are moved to the front, preceded by ``time_col`` when the
    frame has it. Works on a deep copy; the caller's frame is not modified.

    Args:
        the_df: input frame containing every column in ``index_cols``.
        indicator: currently unused; kept so existing callers keep working.

    Returns:
        (the_df, indexes_col): the augmented copy and the name of the key column.
    """
    print_func_name()
    indexes_col = "_".join(index_cols)
    the_df = the_df.copy(deep=True)
    # Build the key once. The former guard compared the length of the joined
    # *name* (a string) instead of the number of index columns; iterating over
    # index_cols[1:] is already a no-op when there is only one index column.
    combined_key = the_df[index_cols[0]].astype(str)
    for col in index_cols[1:]:
        combined_key = combined_key + "_" + the_df[col].astype(str)
    the_df[indexes_col] = combined_key
    leading_cols = [indexes_col] + index_cols
    if time_col in the_df:
        leading_cols = [time_col] + leading_cols
    # With a single index column the key name equals that column's name;
    # de-duplicate so the column is not selected (and emitted) twice.
    leading_cols = list(dict.fromkeys(leading_cols))
    the_df = move_cols_to_first(the_df, leading_cols)
    return the_df, indexes_col

def is_full_time_df(the_df):
    """Tell whether every observed index combination has a row for every observed time.

    The expected row count is (number of distinct ``index_cols`` combinations
    present in the frame) x (number of distinct ``time_col`` values). Counting
    observed combinations, rather than multiplying the per-column cardinalities,
    keeps this consistent with the grid built by the gap-filling step and does
    not assume that every store/item pairing exists.

    Returns:
        True when ``len(the_df)`` equals that expected count.
    """
    print_func_name()
    # Rows with a missing index value are ignored, mirroring nunique()'s NaN handling.
    n_series = len(the_df[index_cols].dropna().drop_duplicates())
    n_times = the_df[time_col].nunique()
    return len(the_df) == n_series * n_times

def is_target_clean(the_df):
    """Return True when the ``target`` column contains no missing values."""
    print_func_name()
    return the_df[target].notna().all()

def target_fillna(the_df):
    """Replace missing target values with ``target_fillna_value``.

    Intended for cases where a missing target has a fixed default meaning
    (such as 0). The column is assigned on the frame that was passed in, and
    that same frame is returned.
    """
    print_func_name()
    assert target in the_df.columns
    filled_target = the_df[target].fillna(target_fillna_value)
    the_df[target] = filled_target
    assert is_target_clean(the_df)
    return the_df

def target_dropna(the_df):
    """Drop every row whose ``target`` value is missing and return the result.

    The input frame is left untouched; a filtered frame is returned.
    """
    print_func_name()
    the_df = the_df.dropna(subset=[target])
    assert the_df[target].isna().sum() == 0
    # Previously nothing was returned, so target_fix_na() replaced the frame
    # with None whenever target_na_decision was 'drop'.
    return the_df

def target_fix_na(the_df):
    """Handle missing target values according to ``target_na_decision``.

    ``'fill'`` delegates to ``target_fillna`` and ``'drop'`` to
    ``target_dropna``; any other setting returns the frame unchanged.
    """
    print_func_name()
    handlers = {'fill': target_fillna, 'drop': target_dropna}
    handler = handlers.get(target_na_decision)
    if handler is not None:
        the_df = handler(the_df)
    return the_df

def is_index_cols_clean(the_df):
    """Return True when none of the ``index_cols`` columns has a missing value."""
    print_func_name()
    return the_df[index_cols].notna().all().all()

def recreate_index_cols_from_indexes_col(the_df):
    """Restore index column values on rows where they are missing.

    Rows added for previously absent dates carry the combined key column
    (``"_".join(index_cols)``) but NaN in the individual index columns. The
    values are copied from another row with the same key whose index columns
    are intact, so they keep the same type as the existing values. (Splitting
    the key string instead would write strings into numeric columns, which
    makes e.g. ``1.0`` and ``'1'`` count as different stores.) Only keys that
    have no intact row fall back to splitting the key on ``"_"``.

    The frame is modified in place and also returned.
    """
    if is_index_cols_clean(the_df):
        return the_df
    the_indexes_col = "_".join(index_cols)
    assert the_indexes_col in the_df
    missing_rows = the_df[index_cols].isna().any(axis=1)
    # One intact representative row per key -> lookup table key -> index values.
    intact_rows = the_df.loc[~missing_rows].drop_duplicates(subset=the_indexes_col)
    known_values = intact_rows.set_index(the_indexes_col, drop=False)[index_cols]
    missing_keys = the_df.loc[missing_rows, the_indexes_col]
    for col in index_cols:
        the_df.loc[missing_rows, col] = missing_keys.map(known_values[col])
    # Fallback for keys never seen with intact index columns: parse the key text.
    still_missing = the_df[index_cols].isna().any(axis=1)
    if still_missing.any():
        parsed_values = the_df.loc[still_missing, the_indexes_col].str.split('_', expand=True)
        parsed_values.columns = index_cols
        the_df.loc[still_missing, index_cols] = parsed_values
    assert is_index_cols_clean(the_df)
    return the_df


def time_fixed_features_fillna(the_df):
    """Fill the time-fixed features on every row from their per-series value.

    For each combined key, the last row in which all ``time_fixed_features``
    are present supplies the values; the original feature columns are dropped
    and the per-key values are merged back, so the features end up as the last
    columns of the returned frame.
    """
    print_func_name()
    # Make sure there's one index col
    the_indexes_col = "_".join(index_cols)
    if the_indexes_col not in the_df:
        the_df, the_indexes_col = add_indexes_col(the_df)
    complete_rows = the_df.dropna(subset=time_fixed_features)
    values_per_index = (
        complete_rows.groupby(the_indexes_col)[time_fixed_features].last().reset_index()
    )
    the_df = the_df.drop(columns=time_fixed_features)
    the_df = the_df.merge(values_per_index, on=the_indexes_col, how='left')
    return the_df


def add_missing_dates_fill_target_zero(the_df):
    """Add a row for every (time, series) pair that is absent and fill it in.

    If the frame is already complete it is returned unchanged. Otherwise the
    full grid of observed ``time_col`` values x observed combined keys is built
    and right-merged onto the data; for the added rows the index columns are
    restored, the time-fixed features are filled from their series, and the
    target is set to ``target_fillna_value``.

    Raises:
        AssertionError: if the result is still not a complete grid.
    """
    print_func_name()
    if is_full_time_df(the_df):
        return the_df
    # Make sure there's one index col
    the_indexes_col = "_".join(index_cols)
    if the_indexes_col not in the_df:
        the_df, the_indexes_col = add_indexes_col(the_df)
    # Every observed time value paired with every observed series key.
    full_grid = pd.DataFrame(
        product(the_df[time_col].drop_duplicates(), the_df[the_indexes_col].drop_duplicates()),
        columns=[time_col, the_indexes_col],
    ).dropna()
    # Right merge keeps exactly the grid rows; pairs absent from the data get NaN elsewhere.
    the_df = the_df.merge(full_grid, how='right', on=[time_col, the_indexes_col])
    ## fillna's
    the_df = recreate_index_cols_from_indexes_col(the_df)
    the_df = time_fixed_features_fillna(the_df)
    the_df = target_fillna(the_df)
    # This check used to sit behind a stray early return and was never executed.
    assert is_full_time_df(the_df)
    return the_df


# Start again from the untouched train+test snapshot so this cell can be re-run.
df = df0.copy(deep=True)
# User and global configuration: the helper functions above read these
# module-level names when they are called, so they must be set before the pipeline runs.
time_col = 'date'
index_cols = ['store','item']
# Name of the combined key column built from index_cols ("store_item").
indexes_col = "_".join(index_cols)
target = 'sales'
target_na_decision = 'fill' # 'fill' or 'drop'; see target_fix_na
target_fillna_value = 0

# Some extra features derived only from the index columns.
df['storeXitem'] = df.store * df.item
df['storeX5'] = df.store*5
df['itemX5'] = df.item*5
# Every column that is not an index column, the time column or the target
# (this includes the test-only `id` column).
features_orig = df.columns[~df.columns.isin(index_cols+[time_col, target])]

# Pipeline
time_fixed_features, time_variant_features = get_features_time_dependence(df)
# With 'fill', the NaN sales of the test rows are replaced by target_fillna_value as well.
df = target_fix_na(df)
# NOTE(review): df[1:] drops the first row before gap filling, so that observation's
# sales comes back as target_fillna_value. This looks like a way to exercise the
# missing-date path; confirm it is intended before relying on the result.
df = add_missing_dates_fill_target_zero(df[1:])
df

get_features_time_dependence()
target_fix_na()
target_fillna()
is_target_clean()
add_missing_dates_fill_target_zero()
is_full_time_df()
add_indexes_col()
move_cols_to_first()
is_index_cols_clean()
is_index_cols_clean()
time_fixed_features_fillna()
target_fillna()
is_target_clean()


Unnamed: 0,date,store_item,store,item,sales,id,storeXitem,storeX5,itemX5
0,2013-01-02,1_1,1,1,11.0,,1.0,5.0,5.0
1,2013-01-03,1_1,1,1,14.0,,1.0,5.0,5.0
2,2013-01-04,1_1,1,1,13.0,,1.0,5.0,5.0
3,2013-01-05,1_1,1,1,10.0,,1.0,5.0,5.0
4,2013-01-06,1_1,1,1,12.0,,1.0,5.0,5.0
...,...,...,...,...,...,...,...,...,...
957995,2018-03-28,10_50,10,50,0.0,44996.0,500.0,50.0,250.0
957996,2018-03-29,10_50,10,50,0.0,44997.0,500.0,50.0,250.0
957997,2018-03-30,10_50,10,50,0.0,44998.0,500.0,50.0,250.0
957998,2018-03-31,10_50,10,50,0.0,44999.0,500.0,50.0,250.0


In [None]:
# fillna

# handle features NA

# Target encoding categoricals

# Feature Engineering

# TODO
## create a unified time col from multiple time cols