# Rossmann

Please manually download the competition data from https://www.kaggle.com/c/rossmann-store-sales/data into `./data/raw`. The notebook reads `store.csv.zip`, `train.csv.zip` and `test.csv.zip` from that directory.

In [None]:
%matplotlib inline
import pandas as pd
import os, sys
# Put the directory two levels above the working directory on the import
# path (presumably so the `guacml` import below can be resolved — confirm).
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)

from guacml import GuacMl

In [None]:
# Load the three competition archives in one go. `read_csv` infers the zip
# compression from the file extension, so the archives are read directly.
stores, train, test = (
    pd.read_csv(archive)
    for archive in (
        './data/raw/store.csv.zip',
        './data/raw/train.csv.zip',
        './data/raw/test.csv.zip',
    )
)

In [None]:
# Join the training rows with the store table on 'Store', draw 10 distinct
# store ids at random, and keep only those stores' rows with Sales > 0.
merged = pd.merge(train, stores, how='outer', on=['Store'])
rand_stores = merged['Store'].drop_duplicates().sample(10)
sample = merged[merged['Store'].isin(rand_stores) & (merged['Sales'] > 0)]

In [None]:
# Build the GuacMl instance on the sampled rows. 'Sales' is the second
# positional argument (presumably the target column — confirm), the metric is
# 'rmspe' and 'Customers' is excluded. `%time` reports how long this takes.
%time guac = GuacMl(sample, 'Sales', eval_metric='rmspe', exclude_cols=['Customers'])

In [None]:
# NOTE(review): presumably displays GuacMl's summary of the loaded data — confirm.
guac.info()

In [None]:
# Size of the first dimension (row count) of the data GuacMl is holding.
guac.data.df.shape[0]

In [None]:
# Configure the time-series setup (split on 'Date', one series per 'Store'),
# then plot the target.
guac.make_time_series(
    date_split_col='Date',
    series_key_cols='Store',
    prediction_length=14,
    n_offset_models=4,
)
guac.plots.target_plot()

In [None]:
# NOTE(review): presumably renders the current step tree — confirm.
guac.plots.tree()

In [None]:
from guacml.preprocessing.feature_generation.historical_medians import HistoricalMedians

# Create an extra HistoricalMedians step grouped by 'DayOfWeek' and insert it
# into the tree, then plot the tree again.
# NOTE(review): the two string arguments are passed exactly as before; confirm
# which one names the new step and which one is the anchor step.
week_day_medians = HistoricalMedians(
    [1, 5, 20],
    guac.config,
    group_keys='DayOfWeek',
)
guac.tree.insert_step_after(
    'week_day_medians',
    'historical_medians',
    week_day_medians,
)
guac.plots.tree()



In [None]:
# NOTE(review): the meaning of the argument 5 is not visible from this
# notebook (number of runs/trials?) — confirm against GuacMl.run.
guac.run(5)

In [None]:
# Plot predictions against actual values for the 'xg_boost' model.
guac.plots.predictions_vs_actual('xg_boost')

In [None]:
# Scratch check: a DateOffset can be scaled by an integer factor.
pd.DateOffset(days=1) * 2

In [None]:
# Scratch check: a DateOffset built with a one-minute step.
pd.DateOffset(minutes=1)

In [None]:
import pandas as pd

# Toy frame for the rolling-median experiments below: six consecutive days
# (2015-06-15 .. 2015-06-20), two groups of three rows, values 0..5.
# `pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so the date
# range is built from ISO date strings instead.
df = pd.DataFrame({
    'date': list(pd.date_range('2015-06-15', '2015-06-20')),
    'group_key': ['a'] * 3 + ['b'] * 3,
    'value': range(6),
})

In [None]:
# Rolling median over a 2-row window of 'value' within each 'group_key',
# indexed by date; 'group_key' is moved out of the index back into a column.
medians = (
    df.set_index('date')
    .groupby('group_key')['value']
    .rolling(2)
    .median()
    .reset_index('group_key')
)
medians

In [None]:
# Lag the rolling medians by one row within each group (a row shift, not a
# calendar shift), working on a copy so `medians` stays untouched.
shifted = medians.copy()
# Select 'value' explicitly so a Series is assigned to the column. Assigning
# the whole shifted frame only works while it has exactly one non-key column.
shifted['value'] = shifted.groupby('group_key')['value'].shift(1)
# Display the result indexed by (date, group_key).
shifted.set_index('group_key', append=True)

In [None]:
# Leave out the row at position 2 so the series has a gap.
with_gap = medians.iloc[[0, 1, 3, 4, 5]]

In [None]:
# Resample to daily frequency, taking the first row of each day.
with_gap.resample('d').first()