In [1]:
# Import DS tools
import pandas as pd

# Import project specific functions
from scripts import data

# Load the cleaned tables once, then pull out the two frames this notebook uses
data_dict = data.get_clean_data_dict()
purchases, users = (data_dict[table] for table in ('purchases', 'users'))

### Feature engineering
Our goal is to predict the value of the purchases each `user_id` will make on a given date. To that end, we first reshape the data frame into
the wide format $t|i_1|\cdots|i_N$, where $t$ is the time (daily at first; the series are then resampled to 90-day bins), $i_k$ contains the amount the $k$-th `user_id` spent, and $N$ is the total number of `user_id`s.

In [2]:
from pyhts.hierarchy import Hierarchy
# Build the two inputs of the hierarchical model from the raw tables:
#   * `hierarchy` - the orig_1 -> user_id tree
#   * `Y_bottom`  - one column per sampled user_id, one row per 90-day bin

# Fraction of distinct user_ids to KEEP (i.e. 85% of them are dropped).
# The previous value of 0. sampled no users at all and left every frame empty.
USER_SAMPLE_FRAC = 0.15

# Merge purchases and users
# NOTE(review): no `on=` is passed, so pandas joins on every column the two
# frames have in common - presumably only `user_id`; confirm.
merged_df = pd.merge(
    purchases,
    users
)[['user_id', 'purchased_at', 'value', 'orig_1']]
# Fix target categorical cols: missing ids become the string '-1', everything
# else is rendered as an integer string (no trailing '.0')
merged_df['user_id'] = merged_df.user_id.fillna(-1).astype(int).astype(str)
merged_df['orig_1'] = merged_df.orig_1.fillna(-1).astype(int).astype(str)
# Calendar day of each purchase (drops the time-of-day part)
merged_df['t'] = merged_df.purchased_at.dt.date

# Reduce number of user_ids by 85%: sample the distinct ids (seeded, so the
# subset is reproducible) and keep only the rows of the sampled users
merged_df = merged_df[
    merged_df.user_id.isin(
        merged_df.user_id.drop_duplicates().sample(
            frac = USER_SAMPLE_FRAC,
            random_state = 42
        )
    )
]
# One row per (orig_1, user_id) pair is enough to describe the tree
hierarchy = Hierarchy.from_long(
    df = merged_df.groupby(
        ['orig_1', 'user_id']
    ).value.sum().reset_index(),
    keys = ['orig_1', 'user_id']
)

# - first creating bottom time series:
# total value per (day, user), pivoted to one column per user
Y_bottom = merged_df.groupby(
    ['t', 'user_id']
).value.sum().reset_index().pivot(
    index='t',
    columns=['user_id'],
    values='value'
)
# `t` holds date objects; resample needs a DatetimeIndex
Y_bottom.index = pd.to_datetime(Y_bottom.index)
# Aggregate to 90-day bins; bins without purchases sum to 0
Y_bottom = Y_bottom.resample('90d').sum()
# NOTE(review): the 'user_id_' prefix presumably mirrors the node names that
# Hierarchy.from_long generates, and the columns here are ordered by user_id
# only while the hierarchy was built from (orig_1, user_id) - confirm that
# pyhts aligns the series with the hierarchy's bottom level as intended.
Y_bottom.columns = ['user_id_' + cols for cols in Y_bottom.columns]
Y_bottom

Unnamed: 0_level_0,user_id_1003,user_id_10044,user_id_10055,user_id_1010,user_id_10110,user_id_10143,user_id_10144,user_id_10156,user_id_10162,user_id_10187,...,user_id_99869,user_id_99881,user_id_9990,user_id_99927,user_id_99946,user_id_99948,user_id_99950,user_id_99976,user_id_99979,user_id_99983
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-05-09,0.501009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-08-07,0.459138,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-11-05,1.186296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-02-03,0.0,0.498179,0.466342,0.0,0.387298,1.432865,0.412555,0.475033,0.880083,0.493902,...,0.0,0.0,0.387298,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-05-04,2.511606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.070007,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-08-02,0.0,0.0,0.0,1.549193,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-10-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
from pyhts.HFModel import HFModel
import tqdm
from sklearn import metrics

# Train/validation folds: each of the last three rows of Y_bottom is held out
# once, and every row before it forms the training set.
train_val_dict = {
    # fold number -> (train, val); fold 1 holds out the third-to-last row
    fold_no: (Y_bottom.iloc[:-offset], Y_bottom.iloc[[-offset]])
    for fold_no, offset in enumerate((3, 2, 1), start=1)
}
# fold number -> {'model_mae': ..., 'naive_mae': ...}
results_dict = {}

for fold, (train, val) in tqdm.tqdm(train_val_dict.items()):
    # A fresh model is built for every fold so nothing carries over
    model = HFModel(
        hierarchy=hierarchy,
        base_forecasters="arima",
        hf_method="comb",
        comb_method="ols",
    )
    model.fit(train)
    # One step ahead, matching the single validation row
    predictions = model.predict(horizon=1)

    # MAE is symmetric in its arguments; (y_true, y_pred) is sklearn's order
    model_mae = metrics.mean_absolute_error(val, predictions)
    # Naive baseline: repeat the last available training row
    naive_mae = metrics.mean_absolute_error(val, train.iloc[[-1]])

    results_dict[fold] = {
        'model_mae': model_mae,
        'naive_mae': naive_mae,
    }

100%|██████████| 3/3 [14:30<00:00, 290.26s/it]


In [18]:
# One column per fold, one row per metric; label the columns '<n> fold'
results = pd.DataFrame(results_dict).rename(
    columns=lambda fold_no: str(fold_no) + ' fold'
)
results

Unnamed: 0,1 fold,2 fold,3 fold
model_mae,0.227525,0.267809,0.203177
naive_mae,0.199993,0.240753,0.189326
