Added custom loss function base on @kyakvolev 's work. Credit to the author.

The forum post is here: https://www.kaggle.com/c/m5-forecasting-uncertainty/discussion/139515

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm as tqdm

from ipywidgets import widgets, interactive, interact
import ipywidgets as widgets
from IPython.display import display

import os
for dirname, _, filenames in os.walk('data/raw'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

data/raw/calendar.csv
data/raw/sell_prices.csv
data/raw/sales_train_validation.csv
data/raw/sample_submission.csv


## Reading data

In [3]:
train_sales = pd.read_csv('data/raw/sales_train_validation.csv')
calendar_df = pd.read_csv('data/raw/calendar.csv')
submission_file = pd.read_csv('data/raw/sales_train_validation.csv')
sell_prices = pd.read_csv('data/raw/sample_submission.csv')

## Variables to help with aggregation

In [4]:
total = ['Total']
train_sales['Total'] = 'Total'
train_sales['state_cat'] = train_sales.state_id + "_" + train_sales.cat_id
train_sales['state_dept'] = train_sales.state_id + "_" + train_sales.dept_id
train_sales['store_cat'] = train_sales.store_id + "_" + train_sales.cat_id
train_sales['store_dept'] = train_sales.store_id + "_" + train_sales.dept_id
train_sales['state_item'] = train_sales.state_id + "_" + train_sales.item_id
train_sales['item_store'] = train_sales.item_id + "_" + train_sales.store_id

In [5]:
val_eval = ['validation', 'evaluation']

# creating lists for different aggregation levels
total = ['Total']
states = ['CA', 'TX', 'WI']
num_stores = [('CA',4), ('TX',3), ('WI',3)]
stores = [x[0] + "_" + str(y + 1) for x in num_stores for y in range(x[1])]
cats = ['FOODS', 'HOBBIES', 'HOUSEHOLD']
num_depts = [('FOODS',3), ('HOBBIES',2), ('HOUSEHOLD',2)]
depts = [x[0] + "_" + str(y + 1) for x in num_depts for y in range(x[1])]
state_cats = [state + "_" + cat for state in states for cat in cats]
state_depts = [state + "_" + dept for state in states for dept in depts]
store_cats = [store + "_" + cat for store in stores for cat in cats]
store_depts = [store + "_" + dept for store in stores for dept in depts]
prods = list(train_sales.item_id.unique())
prod_state = [prod + "_" + state for prod in prods for state in states]
prod_store = [prod + "_" + store for prod in prods for store in stores]

In [6]:
print("Departments: ", depts)
print("Categories by state: ", state_cats)

Departments:  ['FOODS_1', 'FOODS_2', 'FOODS_3', 'HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2']
Categories by state:  ['CA_FOODS', 'CA_HOBBIES', 'CA_HOUSEHOLD', 'TX_FOODS', 'TX_HOBBIES', 'TX_HOUSEHOLD', 'WI_FOODS', 'WI_HOBBIES', 'WI_HOUSEHOLD']


In [7]:
quants = ['0.005', '0.025', '0.165', '0.250', '0.500', '0.750', '0.835', '0.975', '0.995']
days = range(1, 1913 + 1)
time_series_columns = [f'd_{i}' for i in days]

## Getting aggregated sales

In [8]:
def CreateSales(name_list, group):
    '''
    This function returns a dataframe (sales) on the aggregation level given by name list and group
    '''
    rows_ve = [(name + "_X_" + str(q) + "_" + ve, str(q)) for name in name_list for q in quants for ve in val_eval]
    sales = train_sales.groupby(group)[time_series_columns].sum() #would not be necessary for lowest level
    return sales

In [9]:
total = ['Total']
train_sales['Total'] = 'Total'
train_sales['state_cat'] = train_sales.state_id + "_" + train_sales.cat_id
train_sales['state_dept'] = train_sales.state_id + "_" + train_sales.dept_id
train_sales['store_cat'] = train_sales.store_id + "_" + train_sales.cat_id
train_sales['store_dept'] = train_sales.store_id + "_" + train_sales.dept_id
train_sales['state_item'] = train_sales.state_id + "_" + train_sales.item_id
train_sales['item_store'] = train_sales.item_id + "_" + train_sales.store_id

In [10]:
#example usage of CreateSales
sales_by_state_cats = CreateSales(state_cats, 'state_cat')
sales_by_state_cats

Unnamed: 0_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
state_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CA_FOODS,10101,9862,6944,7864,7178,8256,9005,11870,10977,8637,...,10732,13094,14855,11217,10088,9863,9479,10991,13845,14806
CA_HOBBIES,1802,1561,1472,1405,1181,1459,1314,1986,1482,1508,...,1658,2176,2330,1706,1849,1646,1590,2015,2367,2357
CA_HOUSEHOLD,2292,2382,1692,1778,1566,1607,1932,2754,2237,1677,...,3865,5294,5847,4129,3847,3639,3419,4089,5622,6024
TX_FOODS,6853,7030,5124,5470,4602,7067,4671,7055,6920,5505,...,6994,7944,8717,6834,7066,6242,6167,6839,7849,7765
TX_HOBBIES,879,870,526,809,501,831,390,785,794,524,...,1107,1124,1461,830,1025,980,882,1056,1222,1266
TX_HOUSEHOLD,1706,1730,1128,1102,809,1108,1165,1600,1662,1290,...,2699,3392,3531,2204,2831,2378,2553,2720,3195,3251
WI_FOODS,6224,5866,5106,5544,2823,6770,6814,8826,6965,4759,...,10956,10969,10925,8100,7794,7527,7671,8874,10233,10083
WI_HOBBIES,1083,926,684,455,132,930,1240,1215,623,583,...,1021,1334,1029,787,913,846,881,1014,1198,1060
WI_HOUSEHOLD,1691,1522,1107,985,354,1183,1479,1841,1076,1089,...,2757,3035,2945,2252,2157,2222,2391,2919,3431,3183


## Getting quantiles adjusted by day-of-week

In [14]:
def CreateQuantileDict(name_list = stores, group = 'store_id' ,X = False):
    '''
    This function writes creates sales data on given aggregation level, and then writes predictions to the global dictionary my_dict
    '''
    sales = CreateSales(name_list, group)
    sales = sales.iloc[:, 2:] #starting from d_3 because it is a monday, needed to make daily_factors work
    sales_quants = pd.DataFrame(index = sales.index)
    for q in quants:
        sales_quants[q] = np.quantile(sales, float(q), axis = 1)
    full_mean = pd.DataFrame(np.mean(sales, axis = 1))
    daily_means = pd.DataFrame(index = sales.index)
    for i in range(7):
        daily_means[str(i)] = np.mean(sales.iloc[:, i::7], axis = 1)
    daily_factors = daily_means / np.array(full_mean)

    daily_factors = pd.concat([daily_factors, daily_factors, daily_factors, daily_factors], axis = 1)
    daily_factors_np = np.array(daily_factors)

    factor_df = pd.DataFrame(daily_factors_np, columns = submission_file.columns[1:])
    factor_df.index = daily_factors.index

    for i,x in enumerate(tqdm(sales_quants.index)):
        for q in quants:
            v = sales_quants.loc[x, q] * np.array(factor_df.loc[x, :])
            if X:
                my_dict[x + "_X_" + q + "_validation"] = v
                my_dict[x + "_X_" + q + "_evaluation"] = v
            else:
                my_dict[x + "_" + q + "_validation"] = v
                my_dict[x + "_" + q + "_evaluation"] = v

In [17]:
my_dict = {}
#adding prediction to my_dict on all 12 aggregation levels
CreateQuantileDict(total, 'Total', X=True) #1
CreateQuantileDict(states, 'state_id', X=True) #2
CreateQuantileDict(stores, 'store_id', X=True) #3
CreateQuantileDict(cats, 'cat_id', X=True) #4
CreateQuantileDict(depts, 'dept_id', X=True) #5
CreateQuantileDict(state_cats, 'state_cat') #6
CreateQuantileDict(state_depts, 'state_dept') #7
CreateQuantileDict(store_cats, 'store_cat') #8
CreateQuantileDict(store_depts, 'store_dept') #9
CreateQuantileDict(prods, 'item_id', X=True) #10
CreateQuantileDict(prod_state, 'state_item') #11
CreateQuantileDict(prod_store, 'item_store') #12

ValueError: Shape of passed values is (3, 28), indices imply (3, 1918)

In [18]:
total

['Total']

## Creating valid prediction df from my_dict

In [12]:
pred_df = pd.DataFrame(my_dict)
pred_df = pred_df.transpose()
pred_df_reset = pred_df.reset_index()
final_pred = pd.merge(pd.DataFrame(submission_file.id), pred_df_reset, left_on = 'id', right_on = 'index')
del final_pred['index']
final_pred = final_pred.rename(columns={0: 'F1', 1: 'F2', 2: 'F3', 3: 'F4', 4: 'F5', 5: 'F6', 6: 'F7', 7: 'F8', 8: 'F9',
                                        9: 'F10', 10: 'F11', 11: 'F12', 12: 'F13', 13: 'F14', 14: 'F15', 15: 'F16',
                                        16: 'F17', 17: 'F18', 18: 'F19', 19: 'F20', 20: 'F21', 21: 'F22', 
                                        22: 'F23', 23: 'F24', 24: 'F25', 25: 'F26', 26: 'F27', 27: 'F28'})
final_pred = final_pred.fillna(0)

In [13]:
for i in range(1,29):
    final_pred['F'+str(i)] *= 1.170
final_pred.to_csv('return_of_the_blend.csv', index=False)