In [5]:
# Import DS tools
import pandas as pd

# Import project specific functions
from scripts import data

# Load the cleaned project tables and pull out the two frames used below.
data_dict = data.get_clean_data_dict()
purchases, users = (data_dict[table] for table in ('purchases', 'users'))

### Target data-frame analysis and munging
Our goal is to predict the value of the purchases each `user_id` will make on a given date. To that end, we first reshape the data frame into
the wide format $t \mid i_1 \mid \cdots \mid i_N$, where $t$ is the time (with daily granularity), $i_k$ is the amount spent by the $k$-th `user_id`, and $N$ is the total number of `user_id`s.

In [14]:
# Join purchases with the user table and keep only the modelling columns.
# No `on=`/`how=` is given, so this is pandas' default inner join on the
# column names the two frames share.
merged_df = (
    purchases
    .merge(users)
    .loc[:, ['user_id', 'purchased_at', 'value', 'orig_1']]
    .assign(
        # Categorical ids become strings; missing ids are encoded as '-1'.
        user_id=lambda frame: frame['user_id'].fillna(-1).astype(int).astype(str),
        orig_1=lambda frame: frame['orig_1'].fillna(-1).astype(int).astype(str),
        # Daily time key derived from the purchase timestamp.
        t=lambda frame: frame['purchased_at'].dt.date,
    )
)

# Reduce number of user_ids by 85%: keep every purchase of a random 15% of users.
# The sample is seeded so that a fresh "Restart & Run All" selects the same
# users and therefore reproduces the same hierarchy, Y matrix and forecasts.
sampled_user_ids = merged_df.user_id.drop_duplicates().sample(
    frac=0.15,
    random_state=42
)
merged_df = merged_df[merged_df.user_id.isin(sampled_user_ids)]

# Build the two-level hierarchy: total -> origin -> "<origin>_<user_id>".
# `sort=False` keeps origins in order of first appearance, and `.unique()`
# keeps user ids in order of first appearance within each origin.
users_by_origin = merged_df.groupby('orig_1', sort=False)['user_id'].unique()

# - Top level: 'total' fans out to every origin
hierarchy = {'total': merged_df.orig_1.unique().tolist()}
# - Middle level: each origin fans out to its origin-prefixed user ids
hierarchy.update({
    origin: [f'{origin}_{uid}' for uid in uids]
    for origin, uids in users_by_origin.items()
})

# Build Y in scikit-hts format.
# - Bottom level: one column per (origin, user), one row per calendar day.
Y_bottom = (
    merged_df
    .groupby(['t', 'orig_1', 'user_id'])['value']
    .sum()
    .reset_index()
    .pivot(index='t', columns=['orig_1', 'user_id'], values='value')
    # Expand to a gap-free daily index; days without purchases become 0.
    .pipe(
        lambda wide: wide.reindex(
            pd.date_range(wide.index.min(), wide.index.max(), freq='1d')
        )
    )
    .fillna(0)
)
# Flatten the (origin, user) column pairs into "<origin>_<user_id>" names.
Y_bottom.columns = [f'{origin}_{uid}' for origin, uid in Y_bottom.columns]
# - Middle level: one column per origin (orig_1), summed over its users.
daily_origin_value = merged_df.groupby(['t', 'orig_1'])['value'].sum().reset_index()
Y_mid = (
    daily_origin_value
    .pivot(index='t', columns=['orig_1'], values='value')
    # Y_bottom was already reindexed to the complete daily range above,
    # so its index is reused to align both levels on the same days.
    .reindex(Y_bottom.index)
    .fillna(0)
)
# - Top level: the total is the row-wise sum of the origin columns.
Y_total = Y_mid.sum(axis='columns').rename('total')
# Assemble all levels side by side: bottom columns, then origins, then total.
Y = pd.concat([Y_bottom, Y_mid, Y_total], axis='columns')
Y

Unnamed: 0,30_2022,30_2031,30_1834,30_2078,30_1566,83_2047,30_2097,39_2076,34_2149,39_1637,...,77,78,79,8,80,83,85,86,96,total
2017-08-14,0.651416,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.651416
2017-08-15,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2017-08-16,0.000000,0.476837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.476837
2017-08-17,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
2017-08-18,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-07,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.760383,0.000000,0.000000,0.0,0.471405,0.0,0.0,0.0,0.0,7.971067
2021-08-08,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.419235,0.0,0.0,0.0,0.0,9.217058
2021-08-09,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.431699,0.0,0.000000,0.0,0.0,0.0,0.0,4.551616
2021-08-10,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.802126,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,9.936239


In [None]:
from hts import HTSRegressor

# Training window: everything after the first 365 daily rows of Y.
Y_train = Y.iloc[365:]

# Holt-Winters per node, reconciled with the 'WLSS' revision method.
hts_regressor = HTSRegressor(model='holt_winters', revision_method='WLSS', n_jobs=15, low_memory=True)
hts_regressor.fit(df=Y_train, nodes=hierarchy)

# Forecast 10 steps past the end of the training window.
predictions = hts_regressor.predict(steps_ahead=10)

  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
Fitting models: 100%|██████████| 75/75 [00:08<00:00,  8.54it/s]
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int

In [None]:
# Display the output of `hts_regressor.predict(steps_ahead=10)` from the cell above.
predictions

In [None]:
from hts.hierarchy import HierarchyTree

# Build the hierarchy tree from the node mapping and the full Y matrix.
# NOTE(review): the tree is built on all of Y, while the regressor above was
# fitted on Y.iloc[365:] — confirm the difference is intended.
tree = HierarchyTree.from_nodes(nodes=hierarchy, df=Y)

In [None]:
import collections

import hts.functions
import statsmodels
import tqdm
# Import the submodule explicitly: a bare `import statsmodels` does not load
# `statsmodels.tsa.holtwinters`, so the attribute chain only resolved when
# some other package happened to have imported it already.
from statsmodels.tsa.holtwinters import SimpleExpSmoothing

# Number of future steps forecast for every node.
FORECAST_STEPS = 90

# Make forecasts outside of the hts package. Could be any modeling technique.
# Each node (column of Y) gets its own independent exponential-smoothing fit.
forecast_by_node = {}
for col in tqdm.tqdm(Y.columns):
    model = SimpleExpSmoothing(Y[col].values).fit()
    forecast_by_node[col] = model.forecast(FORECAST_STEPS)

# Build the frame in one shot (one column per node, rows 0..FORECAST_STEPS-1)
# instead of filling an empty DataFrame column by column.
forecasts = pd.DataFrame(forecast_by_node, columns=Y.columns)

# Filled in the next cell, in the row order of the summing matrix.
pred_dict = collections.OrderedDict()

In [None]:
# Summing matrix of the hierarchy plus the node label for each of its rows.
sum_mat, sum_mat_labels = hts.functions.to_sum_mat(tree)

# Store each node's forecast as a one-column ('yhat') frame, inserting the
# entries in the same order as the summing-matrix labels.
pred_dict.update(
    (node, pd.DataFrame({'yhat': forecasts[node].to_numpy()}))
    for node in sum_mat_labels
)

In [None]:
# Preview only the first few entries: `pred_dict` holds one DataFrame per
# node, so displaying it whole floods the notebook output.
{label: frame.head() for label, frame in list(pred_dict.items())[:3]}

In [None]:
# Reconcile the independent forecasts so that they add up along the hierarchy.
# 'WLSS' (structural weights taken from the summing matrix) is used because no
# per-node errors are available here: `mse` is empty, and 'WLSV' builds its
# weights from `mse`. It also matches the revision method of the HTSRegressor.
# NOTE(review): to use 'WLSV', pass the in-sample MSE of every node via `mse`.
revised = hts.functions.optimal_combination(pred_dict, sum_mat, method='WLSS', mse={})

# Put reconciled forecasts in nice DataFrame form:
# one row per forecast step, one column per node.
revised_forecasts = pd.DataFrame(
    data=revised,
    index=forecasts.index,
    columns=sum_mat_labels
)

In [None]:
# Re-run the reconciliation of `pred_dict` against `sum_mat`.
# 'WLSS' is used because `mse` is empty and 'WLSV' builds its weights from `mse`.
revised = hts.functions.optimal_combination(pred_dict, sum_mat, method='WLSS', mse={})

In [13]:
# NOTE(review): repeats the 10-step `predict` call already made right after
# fitting (result stored in `predictions`); the saved run of this cell ended in
# a KeyboardInterrupt, and its execution count is out of order with the cells above.
hts_regressor.predict(steps_ahead = 10)

  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
  from pandas import Int64Index as NumericIndex
Fitting models: 100%|██████████| 50/50 [00:13<00:00,  3.65it/s]


KeyboardInterrupt: 