In [1]:
# Import DS tools
# (pandas is referenced as `pd` throughout the rest of the notebook)
import pandas as pd

# Import project specific functions
from scripts import data

# Import data: pull the two tables used below out of the dict returned
# by the project loader.
data_dict = data.get_clean_data_dict()
purchases = data_dict['purchases']
users = data_dict['users']

### Target data-frame analysis and munging
Our goal is to predict the value of the purchases each `user_id` will make on a given date. To that end, we start by reshaping the data frame into
the wide format $t|i_1|\cdots|i_N$, where $t$ is the time (at daily granularity), $i_k$ contains the amount the $k$-th `user_id` spent on that day, and $N$ is the total number of distinct `user_id` values.

In [None]:
# Join purchases with users (pandas picks the join keys: the columns both
# frames share), keep only the fields needed downstream, then:
#   * turn user_id / orig_1 into string labels, with missing values mapped
#     to the sentinel '-1' (the int cast presumably keeps float ids from
#     rendering as e.g. '12.0' -- confirm against the raw dtypes);
#   * add `t`, the calendar date of each purchase.
merged_df = pd.merge(purchases, users)[
    ['user_id', 'purchased_at', 'value', 'orig_1']
].assign(
    user_id=lambda frame: frame['user_id'].fillna(-1).astype(int).astype(str),
    orig_1=lambda frame: frame['orig_1'].fillna(-1).astype(int).astype(str),
    t=lambda frame: frame['purchased_at'].dt.date,
)

# Parent -> children mapping of the hierarchy:
#   'total'   -> every orig_1 label
#   <orig_1>  -> '<orig_1>_<user_id>' for each user seen under that orig_1
# sort=False keeps the orig_1 keys in order of first appearance, matching
# the order of the 'total' children list.
hierarchy = {'total': merged_df.orig_1.unique().tolist()}
users_by_origin = merged_df.groupby('orig_1', sort=False).user_id.unique()
for origin, members in users_by_origin.items():
    hierarchy[origin] = [origin + '_' + member for member in members.tolist()]

from hts import HTSRegressor

# --- Bottom level: one column per (orig_1, user_id), one row per purchase day
user_daily = merged_df.groupby(['t', 'orig_1', 'user_id']).value.sum().reset_index()
Y_bottom = user_daily.pivot(
    index='t', columns=['orig_1', 'user_id'], values='value'
)

# Full daily calendar from the first to the last purchase day; days with
# no purchases are filled with 0 at every level.
all_days = pd.date_range(
    start=Y_bottom.index.min(), end=Y_bottom.index.max(), freq='1d'
)
Y_bottom = Y_bottom.reindex(all_days).fillna(0)

# Flatten the two-level column index into '<orig_1>_<user_id>' labels,
# the same naming used for the leaves of `hierarchy`.
Y_bottom.columns = ["_".join(pair) for pair in Y_bottom.columns]

# --- Middle level: one column per orig_1, on the same calendar
origin_daily = merged_df.groupby(['t', 'orig_1']).value.sum().reset_index()
Y_mid = (
    origin_daily
    .pivot(index='t', columns=['orig_1'], values='value')
    .reindex(all_days)
    .fillna(0)
)

# --- Top level: row-wise sum of the orig_1 columns
Y_total = Y_mid.sum(axis=1).rename('total')

# Wide frame holding every node of the hierarchy side by side
Y = pd.concat((Y_bottom, Y_mid, Y_total), axis=1)
Y
from hts.hierarchy import HierarchyTree

# Build the tree from the parent -> children mapping and the wide frame Y,
# then print it to eyeball the structure.
tree = HierarchyTree.from_nodes(nodes=hierarchy, df=Y)
print(tree)
from hts import HTSRegressor

# Hierarchical forecaster over Y, using `hierarchy` as the node mapping and
# OLS as the revision (reconciliation) method.
# The model key is 'auto_arima': 'arima' is not among the model names
# scikit-hts accepts, so the original key would be rejected.
# NOTE(review): n_jobs=-1 is the scikit-learn convention for "all cores";
# confirm scikit-hts interprets a negative value the same way.
clf = HTSRegressor(model='auto_arima', revision_method='OLS', n_jobs=-1)
model = clf.fit(Y, hierarchy)