In [1]:
# Import DS tools
import pandas as pd

# Import project specific functions
from scripts import data

# Fetch the project's data dictionary and pull out the two tables used below
data_dict = data.get_clean_data_dict()
purchases, users = data_dict['purchases'], data_dict['users']

### Target data-frame analysis and munging
Our goal is to predict the value of the purchases each `user_id` will make on a given date. For that reason, we start by reshaping the data frame into
the wide format $t|i_1|\cdots|i_N$, where $t$ is the time (with daily granularity), $i_k$ contains the amount the $k$-th `user_id` spent on that day, and $N$ is the total number of distinct `user_id` values. The per-origin (`orig_1`) sums and the overall total are then appended as additional columns, so that the frame covers every level of the hierarchy.

In [2]:
# Merge purchases and users, keeping only the columns needed below.
# NOTE(review): no `on=` / `how=` is passed, so pandas joins on every column name
# the two frames share, using its default inner join - confirm this is the intended key.
merged_df = pd.merge(
    purchases,
    users
)[['user_id', 'purchased_at', 'value', 'orig_1']]
# Fix target categorical cols: missing ids become the sentinel '-1', everything
# else becomes an integer-formatted string usable as a column / node name.
merged_df['user_id'] = merged_df.user_id.fillna(-1).astype(int).astype(str)
merged_df['orig_1'] = merged_df.orig_1.fillna(-1).astype(int).astype(str)
# Daily granularity: keep only the calendar date of each purchase
merged_df['t'] = merged_df.purchased_at.dt.date

# Create hierarchy,
# - First creating top: 'total' -> every origin
hierarchy = {
    'total': merged_df.orig_1.unique().tolist()
}
# - Then for each origin, its user_ids, named '<orig_1>_<user_id>' so that they
#   match the bottom-level column names of Y built below.
#   A single grouped pass replaces one full-frame filter per origin; sort=False
#   keeps origins in first-appearance order, exactly like `.unique()` above.
for orig_1, user_ids in merged_df.groupby('orig_1', sort=False).user_id.unique().items():
    hierarchy[orig_1] = [orig_1 + '_' + user_id for user_id in user_ids]


def _daily_value_matrix(frame, key_cols, days=None):
    """Sum `value` per day and per `key_cols` combination, one column per combination.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns 't', 'value' and every name in `key_cols`.
    key_cols : list of str
        Columns whose value combinations are spread across the result's columns.
    days : pandas.DatetimeIndex, optional
        Rows to align the result to. When omitted, every calendar day between
        the first and the last observed 't' is used.

    Returns
    -------
    pandas.DataFrame
        Indexed by `days`; cells without any purchase hold 0.
    """
    wide = frame.groupby(
        ['t'] + key_cols
    ).value.sum().reset_index().pivot(
        index='t',
        columns=key_cols,
        values='value'
    )
    # 't' holds datetime.date objects. Convert explicitly so that the reindex
    # below matches rows against a DatetimeIndex by construction instead of via
    # implicit promotion - an unmatched row would be silently zero-filled.
    wide.index = pd.to_datetime(wide.index)
    if days is None:
        days = pd.date_range(
            start=wide.index.min(),
            end=wide.index.max(),
            freq='1d'
        )
    return wide.reindex(days).fillna(0)


# Create Y in scikit-hts format,
# - first creating bottom time series (one column per origin/user pair):
Y_bottom = _daily_value_matrix(merged_df, ['orig_1', 'user_id'])
Y_bottom.columns = ["_".join(cols) for cols in Y_bottom.columns]
# - then middle - origin_1 - columns, aligned to the same days as the bottom level
Y_mid = _daily_value_matrix(merged_df, ['orig_1'], days=Y_bottom.index)
# finally total
Y_total = Y_mid.sum(axis=1).rename('total')
# and concatenating all of them
Y = pd.concat(
    (Y_bottom, Y_mid, Y_total), axis=1
)
Y

Unnamed: 0,30_1690,30_2020,30_2022,30_2023,75_181,78_945,30_1577,30_2031,78_1850,30_1760,...,87,88,89,93,94,95,96,97,99,total
2017-08-12,0.472902,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.472902
2017-08-13,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2017-08-14,0.000000,1.212912,0.651416,0.951341,0.948683,0.437971,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.202323
2017-08-15,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2017-08-16,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.476731,0.476837,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.953569
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-07,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.802193
2021-08-08,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,62.271813
2021-08-09,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.594107
2021-08-10,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.233659


In [None]:
from hts import HTSRegressor
from hts.hierarchy import HierarchyTree

# Build the hierarchy tree from the node mapping and the wide frame
tree = HierarchyTree.from_nodes(nodes=hierarchy, df=Y)

# Fit the hierarchical regressor on the same frame and node mapping.
# NOTE(review): `tree` is not passed to the regressor in this cell; `fit` receives
# `Y` and `hierarchy` directly - confirm whether `tree` is needed further down.
clf = HTSRegressor(
    model='arima',
    revision_method='WLSV',
    low_memory=True,
    n_jobs=4,
)
model = clf.fit(Y, hierarchy)