In [1]:
# Import DS tools
import pandas as pd

# Import project specific functions
from scripts import data

# Import data
data_dict = data.get_clean_data_dict()
purchases = data_dict['purchases']
users = data_dict['users']

### Target data-frame analysis and munging
Our goal is to predict the value of the purchases each `user_id` will perform at a given date. For that reason, we start by fixing the data-frame so that it is given in
the format $t|i_1|\cdots|i_N$, where $t$ is the time (with a daily granularity), $i_k$ contains the amount the $k$-th user_id spent, and $N$ is the total number of user_id.

In [3]:
# Merge purchases and users
merged_df = pd.merge(
    purchases,
    users
)[['user_id', 'purchased_at', 'value', 'orig_1']]
# Fix target categorical cols
merged_df['user_id'] = merged_df.user_id.fillna(-1).astype(int).astype(str)
merged_df['orig_1'] = merged_df.orig_1.fillna(-1).astype(int).astype(str)
merged_df['t'] = merged_df.purchased_at.dt.date

# Reduce number of user_ids by 85%
merged_df = merged_df[
    merged_df.user_id.isin(
        merged_df.user_id.drop_duplicates().sample(
            frac = 0.15,
            random_state = 42
        )
    )
]

# Create hierarchy,
# - First creating top
hierarchy = {
    'total': merged_df.orig_1.unique().tolist()
}
# - Then for each possible origin, getting possible unique user_ids
for orig_1 in merged_df.orig_1.unique():
    hierarchy[orig_1] = merged_df[merged_df.orig_1 == orig_1].user_id.unique().tolist()
    hierarchy[orig_1] = [orig_1 + '_' + user_id for user_id in hierarchy[orig_1]]

# Create Y in scikit-hts format,
# - first creating bottom time series:
Y_bottom = merged_df.groupby(
    ['t', 'orig_1', 'user_id']
).value.sum().reset_index().pivot(
    index='t',
    columns=['orig_1', 'user_id'],
    values='value'
)
Y_bottom.index = pd.to_datetime(Y_bottom.index)
Y_bottom = Y_bottom.resample('90d').sum()
Y_bottom.columns = ["_".join(cols) for cols in Y_bottom.columns]
# - then middle - origin_1 - columns
Y_mid = merged_df.groupby(
    ['t', 'orig_1']
).value.sum().reset_index().pivot(
    index='t',
    columns=['orig_1'],
    values='value'
)
Y_mid.index = pd.to_datetime(Y_mid.index)
Y_mid = Y_mid.resample('90d').sum()
# finally total
Y_total = Y_mid.sum(axis=1).rename('total')
# and concatenating all of them
Y = pd.concat(
    (Y_bottom, Y_mid, Y_total), axis=1
)
Y

Unnamed: 0_level_0,30_1690,30_2020,30_2023,78_945,30_2045,39_418,30_2068,30_2083,30_639,78_2123,...,49_99881,75_30978,19_99979,30_93382,30_99946,30_99950,30_99976,30_99983,35_99948,39_87829
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-12,0.472902,1.212912,0.951341,0.891576,0.654973,0.497468,0.517082,0.633094,0.511386,1.296848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-10,0.0,0.0,0.0,0.0,0.0,0.489382,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.546782,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-05-09,0.0,0.0,0.0,0.760555,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-08-07,0.468287,0.0,0.0,0.0,0.895556,0.0,0.0,0.378861,0.948787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-11-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.895556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-02-03,0.498888,0.0,1.71011,0.0,1.02135,0.387298,0.0,0.0,0.895556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-05-04,0.387298,0.0,1.201136,0.0,0.0,0.501009,0.0,0.0,0.447778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-08-02,0.0,0.0,1.229762,0.0,0.223607,0.0,0.0,0.223607,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-10-31,0.0,0.0,0.52503,0.0,0.0,0.0,0.0,0.0,0.387298,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
from hts import HTSRegressor

predictions_dict = dict()
for threshold in [-2, -1] :
    hts_regressor = HTSRegressor(
        model = 'holt_winters',
        revision_method = 'WLSV',
        n_jobs = 10,
        low_memory = True
    )
    hts_regressor.fit(df = Y.iloc[:-2], nodes = hierarchy)
    predictions_dict[threshold] = hts_regressor.predict(steps_ahead=1).iloc[[-1]]

Fitting models: 100%|██████████| 50/50 [00:05<00:00,  9.28it/s]
Fitting models: 100%|██████████| 50/50 [00:05<00:00,  9.32it/s]
Fitting models: 100%|██████████| 50/50 [00:05<00:00,  9.18it/s]
Fitting models: 100%|██████████| 50/50 [00:05<00:00,  9.70it/s]


In [8]:
predictions_dict

{-2:              total        30       123        39       75        83        78  \
 2021-04-23  0.1202 -0.031647  0.259389  0.188253  1.01521  0.011319  0.323732   
 
                   10       103       119  ...   64_74060  64_74223  64_74440  \
 2021-04-23  0.104753 -1.338482 -7.150894  ... -20.317133 -0.473774 -2.449999   
 
              64_75133  64_78364  88_95819  88_98079  89_95856   5_97021  \
 2021-04-23 -28.410155 -0.127602  0.132813  0.446635 -4.291032 -0.074927   
 
             16_98077  
 2021-04-23  1.012049  
 
 [1 rows x 6765 columns],
 -1:                total        30       123        39        75        83  \
 2021-04-23  0.162354  0.191804  0.073137  0.136456  0.662761  0.087946   
 
                   78       10       103       119  ...  64_74060  64_74223  \
 2021-04-23 -4.502665  0.06365 -2.258184 -0.216478  ... -0.223259 -0.150615   
 
             64_74440  64_75133  64_78364  88_95819  88_98079  89_95856  \
 2021-04-23 -0.439319 -0.352625 -0.296702  0.

In [9]:
from pyhts.dataset import load_tourism

tourism_data = load_tourism()
train = tourism_data.iloc[:-12, :]
test = tourism_data.iloc[-12:, :]
tourism_data

Unnamed: 0,AAA,AAB,ABA,ABB,ACA,ADA,ADB,ADC,ADD,AEA,...,FBB,FCA,FCB,GAA,GAB,GAC,GBA,GBB,GBC,GBD
0,3749.420009,1234.153504,1332.659538,4014.959641,3571.298778,282.613549,277.258009,604.167348,312.516904,610.229788,...,531.590022,195.512426,46.957156,46.848028,2.584923,21.127603,22.672264,19.776870,9.165171,10.289907
1,1691.664436,245.565175,610.939832,909.981060,1066.882770,73.108680,160.238689,114.528983,90.981689,150.855730,...,258.569472,93.882862,28.023033,44.456176,5.687391,11.815504,1.199800,4.706087,5.296459,0.522899
2,1878.094277,248.197996,497.912226,1199.225593,1179.847256,155.646283,191.696367,277.474557,139.306907,337.220682,...,172.205507,114.067194,28.969209,204.127403,17.247262,32.853019,3.973355,0.889453,17.395196,0.000000
3,2218.486489,409.493994,1608.119556,2067.473509,1092.244810,190.093552,328.250392,332.206915,176.103433,642.248362,...,156.603061,135.067404,42.567232,148.208669,25.497905,5.355521,0.000000,15.282432,42.421868,13.270093
4,2162.827477,232.028390,543.001891,1076.512403,931.920614,203.576554,264.547293,233.874656,354.793863,355.710131,...,179.807921,59.264950,16.851261,120.926418,66.902240,26.250860,6.300017,22.815120,96.589700,0.734289
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,1823.214651,166.289940,702.726238,1150.368726,487.991745,562.380065,148.372483,129.986738,163.804676,451.210073,...,121.821310,149.666609,10.221980,576.816269,89.554305,117.776237,18.602800,54.377290,110.622438,39.155151
236,2061.810142,248.978920,756.922670,1589.967804,752.784101,522.885180,349.371128,264.164451,204.773784,751.567097,...,103.590744,210.783190,20.119593,454.383269,56.877583,352.513300,15.445858,83.999767,124.074529,281.450299
237,2626.809976,439.478789,919.337005,1819.882071,854.485841,279.892247,367.201403,344.022682,190.559343,747.499222,...,171.909631,124.437925,74.768559,553.390016,92.335509,121.393070,24.718813,65.019600,470.017728,68.018775
238,2093.527335,331.780369,677.038905,1411.298438,1016.028622,69.996512,169.264272,277.452898,217.057613,457.180448,...,174.742762,154.003227,57.681088,170.328795,39.670724,43.125875,6.692642,37.843170,21.870601,4.908592


In [16]:
from pyhts.hierarchy import Hierarchy
hierarchy = Hierarchy.from_long(
    df = merged_df.groupby(
        ['orig_1', 'user_id']
    ).value.sum().reset_index(),
    keys = ['orig_1', 'user_id']
)

<pyhts.hierarchy.Hierarchy at 0x142ae0520>

In [12]:
merged_df.groupby(
    ['orig_1', 'user_id']
).value.sum().reset_index()

Unnamed: 0,orig_1,user_id,value
0,-1,11603,0.452267
1,-1,13547,0.372542
2,-1,13731,1.200757
3,-1,13733,1.005999
4,-1,15880,0.387298
...,...,...,...
6681,88,98079,0.298481
6682,89,95856,0.413656
6683,95,34511,0.368864
6684,95,38572,0.371456


In [17]:
# - first creating bottom time series:
Y_bottom = merged_df.groupby(
    ['t', 'orig_1', 'user_id']
).value.sum().reset_index().pivot(
    index='t',
    columns=['orig_1', 'user_id'],
    values='value'
)
Y_bottom.index = pd.to_datetime(Y_bottom.index)
Y_bottom = Y_bottom.resample('90d').sum()
Y_bottom.columns = ["_".join(cols) for cols in Y_bottom.columns]
Y_bottom

Unnamed: 0_level_0,30_1690,30_2020,30_2023,78_945,30_2045,39_418,30_2068,30_2083,30_639,78_2123,...,49_99881,75_30978,19_99979,30_93382,30_99946,30_99950,30_99976,30_99983,35_99948,39_87829
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-08-12,0.472902,1.212912,0.951341,0.891576,0.654973,0.497468,0.517082,0.633094,0.511386,1.296848,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2017-11-10,0.0,0.0,0.0,0.0,0.0,0.489382,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-02-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.546782,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-05-09,0.0,0.0,0.0,0.760555,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-08-07,0.468287,0.0,0.0,0.0,0.895556,0.0,0.0,0.378861,0.948787,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-11-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.895556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-02-03,0.498888,0.0,1.71011,0.0,1.02135,0.387298,0.0,0.0,0.895556,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-05-04,0.387298,0.0,1.201136,0.0,0.0,0.501009,0.0,0.0,0.447778,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-08-02,0.0,0.0,1.229762,0.0,0.223607,0.0,0.0,0.223607,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2019-10-31,0.0,0.0,0.52503,0.0,0.0,0.0,0.0,0.0,0.387298,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
from pyhts.HFModel import HFModel
model = HFModel(hierarchy=hierarchy, base_forecasters="arima",
                hf_method="comb", comb_method="ols")

R[write to console]: Error in loadNamespace(name) : there is no package called ‘forecast’
Calls: <Anonymous> ... loadNamespace -> withRestarts -> withOneRestart -> doWithOneRestart



RRuntimeError: Error in loadNamespace(name) : there is no package called ‘forecast’
Calls: <Anonymous> ... loadNamespace -> withRestarts -> withOneRestart -> doWithOneRestart
