# Import Packages

In [1]:
from croston import croston
import sys
sys.path.append('..')

import matplotlib.pyplot as plt
import utils
import pandas as pd
import numpy as np
import math
import random
from isoweek import Week
import lightgbm as lgb
import pickle
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
from IPython.display import clear_output

# Settings

In [2]:
# Number of cross-validation splits.
n_splits = 1

# Forecast horizon handed to utils.get_splits as `fh`.
forecast_horizon = 1

# Column that is forecasted and scored: either 'Turnover' or 'Sales_QTY'.
target_variable = 'Turnover'

# Import Sales Data

- We only import data up to week 47 of 2020 (nothing later, so no 2021 data), so that re-running the notebook in the future returns exactly the same data.


- For this approach, we just need Week, Year, Site_ID, SKU and the two possible target columns, Sales_QTY and Turnover.

In [3]:
# Weekly rows from 2015 up to and including week 47 of 2020,
# ordered by sku, site_id, year, week.
query = 'SELECT Week, Year, Site_ID, SKU, Sales_QTY,Turnover FROM `globus-datahub-dev.Verteiler_PoC.timeseries_sales_weekly` where Year >= 2015 and (Year <= 2019 OR (Year = 2020 AND Week <= 47))  order by sku, site_id, year, week'

# Run the query through the project helper, then pass the result through
# utils.reduce_memory_usage (presumably a dtype downcast -- see the memory log it prints).
df_raw = (
    utils.bq_to_dataframe(query, verbose=True)
    .pipe(utils.reduce_memory_usage)
)

********************
Query Duration:  0:01:12.402272
Mem. usage decreased to 122.87 Mb (64.6% reduction)


In [4]:
# Work on an independent (deep) copy so later in-place edits leave df_raw untouched.
df = df_raw.copy(deep=True)
df

Unnamed: 0,Week,Year,Site_ID,SKU,Sales_QTY,Turnover
0,1,2015,101,1014557,5.0,129.500000
1,2,2015,101,1014557,2.0,51.799999
2,3,2015,101,1014557,0.0,0.000000
3,4,2015,101,1014557,1.0,25.900000
4,5,2015,101,1014557,0.0,0.000000
...,...,...,...,...,...,...
7578843,47,2020,101,1289328500,4.0,56.090000
7578844,47,2020,101,1289328600,2.0,29.799999
7578845,47,2020,101,1289328700,5.0,32.869999
7578846,47,2020,101,1289328800,4.0,27.600000


In [5]:
# Floor negative Sales_QTY / Turnover values at zero; non-negative values are left as they are.
df.loc[:, 'Sales_QTY'] = df['Sales_QTY'].clip(lower=0)
df.loc[:, 'Turnover'] = df['Turnover'].clip(lower=0)


# Forecasting using Croston

In [6]:
# Each distinct (Site_ID, SKU) pair is one time series; the first entry of the shape is their count.
df.loc[:, ['Site_ID', 'SKU']].drop_duplicates().shape

(57909, 2)

In [7]:
def run_croston(ts_list: list,fh: int) -> list:
    """
    Forecast a single time series with Croston's method.

    The series is fitted with ``croston.fit_croston`` using the 'adjusted'
    variant, and only the out-of-sample part of the result is handed back.
    NOTE(review): confirm that 'adjusted' is a variant name the installed
    croston package supports.

    :param ts_list: historical values of the time series to forecast (list)
    :param fh: forecast horizon, forwarded to ``fit_croston`` (int)
    :return: the 'croston_forecast' entry of the fit result as a list,
             i.e. the predictions without the historical time series (list)
    """
    return list(
        croston.fit_croston(ts_list, fh, croston_variant='adjusted')['croston_forecast']
    )

In [8]:
# One RMSE value per cross-validation split.
rmse = []

# Honour the n_splits setting instead of a hard-coded 1.
splits = utils.get_splits(df, fh=forecast_horizon, n_splits=n_splits)

for week, year in splits:
    week = int(week)
    year = int(year)

    # Build the isoweek objects once per split. Adding 1 to a Week yields the
    # following ISO week, which takes care of the year rollover.
    train_year, train_week = Week(year, week)
    target_year, target_week = Week(year, week) + 1

    results_dict = {"Site_ID": [], "SKU": [], "Forecast": []}
    # Series whose fit raised in this split (rebuilt for every split); they get a
    # zero forecast below.
    error_log = {"Site_ID": [], "SKU": []}

    # Prepare data: training rows are everything up to and including the split week,
    # targets are the rows of the week directly after it.
    in_training_window = (
        ((df.Year == train_year) & (df.Week <= train_week))
        | (df.Year < train_year)
    )
    df_train = df.loc[in_training_window]

    is_target_week = (df.Year == target_year) & (df.Week == target_week)
    targets = df.loc[is_target_week, ['SKU', 'Site_ID', 'Year', 'Week', target_variable]]

    # Forecast every (Site_ID, SKU) series on its own.
    for _, sku_df in tqdm(df_train.groupby(['Site_ID', 'SKU'])):
        site_id = sku_df.Site_ID.values[0]
        sku = sku_df.SKU.values[0]

        try:
            # Only the first forecast step is scored (against the week after the
            # training window), so a one-step forecast is requested.
            yhat = run_croston(sku_df[target_variable].to_list(), 1)
        except Exception:
            # Best effort: fall back to a zero forecast and remember the series.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
            yhat = [0]
            error_log["Site_ID"].append(site_id)
            error_log["SKU"].append(sku)

        results_dict["Site_ID"].append(site_id)
        results_dict["SKU"].append(sku)
        results_dict["Forecast"].append(yhat[0])

    # Inner join: only series that have a row in the target week are scored.
    result_df = pd.DataFrame(results_dict)
    result_df = result_df.merge(targets, on=['SKU', 'Site_ID'])

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and remove; the value is the same RMSE.
    rmse.append(np.sqrt(mean_squared_error(result_df[target_variable], result_df.Forecast)))

    # Clear Cell Output (removes the progress bar of the finished split)
    clear_output()

print('RMSE:\n', rmse)

print('\nCrossvalidated RMSE: ', np.mean(rmse))

RMSE:
 [82.4400084873183]

Crossvalidated RMSE:  82.4400084873183
