# Backtest: Forecast SS19, DAA vs. eCom 

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 800)

import scipy

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.tsatools import detrend

import datetime as dt

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# CHRIS and ARTEM code

def initialize_parameters(par = np.array([0.5, 0.9, 0, 1, 0])):
    # np.random.seed(3)
    parameters = {}

    parameters['alpha'] = par[0]
    parameters['beta'] = par[1]
    parameters['omega'] = par[2]* (1-par[1])    # one way to choose that is omega/(1-beta) = unconditional mean 
    parameters['sigma'] = par[3]
    parameters['f0'] = par[4]                   # one way to choose is unconditional mean 

    return parameters

def loglik(y, f, x, sigma):
    ll = -1/2*np.log(2*np.pi ) - 1/2*np.log(sigma) - 1/(2*sigma)*(y - x*f)**2 
    return ll


def score_compute(y, f, x, parameters, epsilon = 1e-7 ):
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    
    score = (y - x*f)/sigma
    
    return score

def filterGAS(y, x, parameters):
    
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    score0 = score_compute(y[0,:],  f0, x[0,:], parameters, epsilon = 1e-7) 
    f = np.zeros((len(y),1))
    
    f[0,:] = f0
    for t in range(1,len(y)):
        scoret = score_compute(y[t-1,:], f[t-1,:], x[t-1,:], parameters, epsilon = 1e-7) 
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:] 

    return f

def loglikest(par, y, x):
    parameters = initialize_parameters(par)
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    sigma = parameters["sigma"]
    # f0 = parameters["f0"]
    
    f = filterGAS(y, x, parameters) 
    ll = np.zeros((len(y), 1))
    m = len(y)

    for t in range(0, len(y)):
         ll[t,:] = loglik(y[t,:], f[t,:], x[t,:], sigma)
    loglik_res = -(np.sum(ll))/m

    return loglik_res

# ----------------------------------------------------------------

def score_compute_2(y, f, x, parameters, epsilon = 1e-7 ):
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    
    score = (y - x*f) 
    
    return score

def filterGAS_2(y, x, parameters):
    
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    score0 = score_compute_2(y[0,:],  f0, x[0,:], parameters, epsilon = 1e-7) 
    f = np.zeros((len(y),1))
    
    f[0,:] = f0
    for t in range(1,len(y)):
        scoret = score_compute_2(y[t-1,:], f[t-1,:], x[t-1,:], parameters, epsilon = 1e-7) 
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:] 

    return f

def loglikest_2(par, y, x):
    parameters = initialize_parameters(par)
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    sigma = parameters["sigma"]
    # f0 = parameters["f0"]
    
    f = filterGAS_2(y, x, parameters) 
    ll = np.zeros((len(y), 1))
    m = len(y)

    for t in range(0, len(y)):
         ll[t,:] = loglik(y[t,:], f[t,:], x[t,:], sigma)
    loglik_res = -(np.sum(ll))/m

    return loglik_res

# ----------------------------------------------------------------

def GAS_est(df):
    
    y = df.net_qty.values          # observed demand (response)
    x = df.buy_availability.values # buy_availability (explanatory)

    y = y.reshape((len(y),1)) 
    x = x.reshape((len(y),1))
    
    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']
        
    abc = scipy.optimize.minimize(
        loglikest,                                       # function to minimize (log likelihood y|x,theta)
        np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]), # initial parameter values (starting)
        args=(y, x), 
        options ={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
        method='L-BFGS-B', 
        bounds=((0,  None),             # alpha
                (-1, 1),                # beta
                (0.001, np.mean(y)*2),  # omega 
                (0.001, None),          # sigma
                (0.001, np.mean(y)*2)   # f
               )
    )
    
    
    # --- CONVERGENCE control flow ---
    if abc.success == True:
        
        x1par = initialize_parameters(abc.x) 
        GAS = filterGAS(y, x, x1par)
        
        ret['GAS_est'] = GAS
        ret['Convergence'] = [abc.success] * len(y)
        ret['Convg type'] = ['One'] * len(y)
        
    # **Modification if first algorithm fails
    elif abc.success == False:
        
        abc = scipy.optimize.minimize(
            loglikest_2,                                       # function to minimize (log likelihood y|x,theta)
            np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]),   # initial parameter values (starting)
            args=(y, x), 
            options ={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
            method='L-BFGS-B', 
            bounds=((0,  None),             # alpha
                    (-1, 1),                # beta
                    (0.001, np.mean(y)*2),  # omega 
                    (0.001, None),          # sigma
                    (0.001, np.mean(y)*2)   # f
                   )
                )

        x1par = initialize_parameters(abc.x) 
        GAS = filterGAS_2(y, x, x1par)
        
        ret['GAS_est'] = GAS
        ret['Convergence'] = [abc.success] * len(y)
        ret['Convg type'] = ['Two'] * len(y)

    return ret

# Data

In [38]:
# EU_seasons = pd.read_csv('data/EU_seasons.csv', low_memory = False, error_bad_lines = False, sep = ",") # 26 Aug
# SS18 = EU_seasons[(EU_seasons.season == 'SS18')]
# SS19 = EU_seasons[(EU_seasons.season == 'SS19')]

# CO = SS18[SS18.article_number.isin(SS19.article_number)]
# CO.shape
# len(CO.article_number.unique())

(5724, 3)

5722

In [4]:
SS19 = pd.read_excel('data/ecom_SS19.xlsx').dropna()
SS19_carryovers = SS19[SS19.carryover_FW18 == 'YES'] # carryovers only
len(SS19_carryovers)

1177

In [5]:
dat0 = pd.read_csv('data/ch4k_df.csv')
ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
                       usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
                                  'brand_desc', 'bus_unit_desc', 'rmh_cat_desc'])

# Remove clearance transactions!!
dat0['clearance'] = dat0.clearance.fillna(0) 
dat0['net_qty'] = (1 - dat0.clearance)*dat0.net_qty


In [32]:
dat = dat0.copy()
dat = dat[(dat.net_qty > 0) & (dat.season.isin(['SS16', 'SS17', 'SS18']))]


In [33]:
# Within article price, cost, margin averages
dat[['price', 'cost', 'margin']] = (
    dat.
    groupby('article_number', group_keys=False)[['price', 'cost', 'margin']].
    transform(lambda x: np.nanmean(x).round(2))
             )

  """


## Buy Availability Correction

In [34]:
# Instead of GAS (quicker)

dat = dat[~dat.price.isna()]
dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

# ---- Replace NAs and zeros (w/ no impact replacements) ----
dat['buy_availability'] = dat.buy_availability.fillna(1) # assume full availability 
dat['buy_availability'] = np.where(dat.buy_availability == 0, 1, dat.buy_availability) # replace 0


# ---- Smooth buy_availability ----
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

dat['buy_availability'] = dat.groupby(['article_number', 'country'])['buy_availability'].apply(roll)


# ---- Corrected net_qty ----
dat['corr_net_qty'] = (dat.net_qty / dat.buy_availability).round()
dat['corr_net_qty'] = dat.groupby(['article_number', 'year', 'week'])['corr_net_qty'].transform(lambda x: np.sum(x)) # over country


# ---- Aggregate by season, across country ----
# dat['corr_season_net_qty'] = dat.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))
# dat['corr_season_net_qty'] = np.where(dat.corr_season_net_qty > dat.season_net_qty, dat.corr_season_net_qty, dat.season_net_qty) # only if >

dat = dat[['article_number', 'season', 'year', 'week', 'corr_net_qty']]

In [None]:
# --- GAS ----

# dat_GAS = dat0.copy()[['article_number', 'year', 'week', 'country', 'season', 'net_qty', 'buy_availability']]
# dat_GAS = dat_GAS[(dat_GAS.season == 'SS18') & (dat_GAS.article_number.isin(SS19_carryovers.article_number))].sort_values(['article_number', 'country', 'year', 'week'])
# dat_GAS = dat_GAS.groupby(['article_number', 'country']).apply(GAS_est).reset_index()
# dat_GAS = pd.DataFrame(dat_GAS.groupby(['article_number', 'year', 'week'])['GAS_est'].sum()).reset_index()


# Seasonality

In [10]:
seasonality_dat = (dat0.copy()[[
    'article_number', 'year', 'week', 'country', 'season', 'net_qty', 'sports_cat_desc', 'rmh_cat_desc', 'gender_desc', 'age_group_desc', 'franchise', 'prod_grp_desc']].
                   dropna().sort_values(['article_number', 'year', 'week'])
                  )

# -- Sum over UK/EU, ADD article reference data --
# seasonality_dat = pd.merge(
#     pd.DataFrame(seasonality_dat.groupby(['article_number', 'season', 'year', 'week'])['net_qty'].sum()).reset_index(), # sum over UK & EU
#     seasonality_dat[['article_number', 'sports_cat_desc', 'rmh_cat_desc', 'gender_desc', 'age_group_desc', 'franchise', 'prod_grp_desc']].drop_duplicates() # add reference information
#     ).dropna().sort_values(['article_number', 'year', 'week'])

seasonality_dat['net_qty2'] = seasonality_dat.groupby(['article_number', 'season', 'year', 'week'])['net_qty'].transform(sum)
seasonality_dat = seasonality_dat.drop(['country', 'net_qty'], axis = 1).drop_duplicates()
seasonality_dat = seasonality_dat.rename(columns = {'net_qty2': 'net_qty'})

# -- Mirror seasons only --
seasonality_dat = seasonality_dat[seasonality_dat.season.isin(['SS16', 'SS17', 'SS18'])]


In [11]:
# ---- Calculate cat-level weekly means across *ALL SEASONS* ---- 

seasonality_sport   = pd.DataFrame(seasonality_dat.groupby(['sports_cat_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'sport_weekly_mean'})
seasonality_rmh     = pd.DataFrame(seasonality_dat.groupby(['rmh_cat_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'rmh_weekly_mean'})
seasonality_gndr    = pd.DataFrame(seasonality_dat.groupby(['gender_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'gender_weekly_mean'})
seasonality_agegrp  = pd.DataFrame(seasonality_dat.groupby(['age_group_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'age_weekly_mean'})
seasonality_frnchse = pd.DataFrame(seasonality_dat.groupby(['franchise', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'franchise_weekly_mean'})
seasonality_prdgrp  = pd.DataFrame(seasonality_dat.groupby(['prod_grp_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'prd_grp_weekly_mean'})

seasonality_dfs = [seasonality_sport, seasonality_rmh, seasonality_gndr, seasonality_agegrp, seasonality_frnchse, seasonality_prdgrp]



In [12]:
def regress(df):
    # function for regressing article net_qty on seasonalities
    # for article a's level of each category, retreive weekly means, then regress 
    
    df = df.sort_values(['article_number', 'year', 'week'])
        
    # article net_demand_qty
    y = df[['net_qty', 'week']].set_index('week')

    # article category-level weekly means; set_index() for joining
    x_sport   = seasonality_sport[seasonality_sport.sports_cat_desc == df.sports_cat_desc.unique()[0]].set_index('week')
    x_rmh     = seasonality_rmh[seasonality_rmh.rmh_cat_desc == df.rmh_cat_desc.unique()[0]].set_index('week')
    x_gndr    = seasonality_gndr[seasonality_gndr.gender_desc == df.gender_desc.unique()[0]].set_index('week')
    x_agegrp  = seasonality_agegrp[seasonality_agegrp.age_group_desc == df.age_group_desc.unique()[0]].set_index('week')
    x_frnchse = seasonality_frnchse[seasonality_frnchse.franchise == df.franchise.unique()[0]].set_index('week')
    x_prdgrp  = seasonality_prdgrp[seasonality_prdgrp.prod_grp_desc == df.prod_grp_desc.unique()[0]].set_index('week')

    # design matrix (ensure 'week' alignment)
    yX = (pd.merge(y, x_sport, left_index=True, right_index=True, how = 'outer').
          merge(x_rmh, left_index=True, right_index=True, how = 'outer').
          merge(x_gndr, left_index=True, right_index=True, how = 'outer').
          merge(x_agegrp, left_index=True, right_index=True, how = 'outer').
          merge(x_frnchse, left_index=True, right_index=True, how = 'outer').
          merge(x_prdgrp, left_index=True, right_index=True, how = 'outer').
          drop(['sports_cat_desc', 'rmh_cat_desc', 'gender_desc',
               'age_group_desc', 'franchise', 'prod_grp_desc'], axis = 1))

    # predict article 'a' net_demand_qty with 5 article 'a' category-level seasonalities

    y = yX.net_qty
    
    X = yX.drop('net_qty', axis = 1)
    X = sm.add_constant(X) # ****** ******* *******
    
    mod = sm.OLS(y, X, missing='drop').fit()
    # print(df.article_number.unique(), round(mod.rsquared, 2))

    ret = pd.DataFrame(index = X.index)
    ret['seas_preds'] = mod.predict(X).round()
    
    ret = ret.reset_index()
    ret = pd.DataFrame(ret.groupby('week')['seas_preds'].mean())

    return ret

In [13]:
# Just SS18-SS19 carryovers
seasonality_dat = seasonality_dat[(seasonality_dat.article_number.isin(SS19_carryovers.article_number))].sort_values(['article_number', 'year', 'week']) 

# Some confusion as to which articles are SS18 to SS19 carryovers; using EU_seasons and ecom_SS19.xlsx gives different
# Using ecom_SS19.xlsx, from Demand Planning SS Mike pointed me to


In [41]:
preds = seasonality_dat.groupby(['article_number']).apply(regress).reset_index() # regress articles of interest on seasonality
preds['seas_preds'] = np.where(preds.seas_preds > 0, preds.seas_preds, 0) # Zero out negative preds


# Merge back with reference data
preds = (pd.merge(preds, seasonality_dat[seasonality_dat.season == 'SS18'], how = 'left', left_on = ['article_number', 'week'], right_on=['article_number', 'week']).
         sort_values(['article_number', 'year', 'week']))[['article_number', 'year', 'week', 'net_qty', 'seas_preds']]
       

In [42]:
# Combined observed weeks (partial season)  --- AND --- regression predicted (all) weeks
preds = pd.merge(
    preds,   # all weeks
    dat,     # observed weeks
    how = 'left')

# corr_net_qty NAs where buy_availability == NA

# Aggregate to Season

In [43]:
# weekly assignment of GAS, seasonality, or combination
preds['y_hat'] = np.where(np.isnan(preds.corr_net_qty), preds.seas_preds, (preds.corr_net_qty + preds.seas_preds)/2)

In [44]:
preds['corrected'] = (preds.corr_net_qty + preds.seas_preds)/2

# Sum over season
preds_season = pd.DataFrame(preds.groupby('article_number')['y_hat'].apply(sum).round())

preds_season['y_hat'] = preds_season.y_hat * 1.1 # default growth rate

In [46]:
# Combine DAA + eCom

preds_season.shape
preds_season = pd.merge(
    preds_season.reset_index(),             # DAA forecasts
    SS19_carryovers[['article_number', 'Ecom_FC_RMA']], # eCom forecast
    how = 'left', left_on = 'article_number', right_on = 'article_number'
)
preds_season2.shape


(608, 1)

(608, 3)

# Overbuy

In [26]:
from functools import partial
from scipy import optimize
from scipy import integrate
import scipy.stats as stats

# Loss --- demand, buy, margin, cost
def L(d, b, margin, cost):
    if d > b:
        return (d - b)*margin
    elif d < b:
        return (b - d)*cost
    elif d == b:
        return 0
    else:
        print('Error')

# E[L | buy, article_mean, article_sd, article_margin, article_cost]
def EL(mu, sigma, margin, cost, b):
    I = lambda x: L(x, b, margin, cost) * stats.norm.pdf(x, mu, sigma) # I for integrand
    Exp_loss = integrate.quad(I, 0, mu + 3*sigma)/(1 - stats.norm.cdf(0, loc = mu, scale = sigma)) # Expected value of Loss function
    return round(Exp_loss[0], 2) 

def minimize_EL(mu, sigma, margin, cost):
    if(mu < 1000):
        return 1.2*mu 
    p = partial(EL, mu, sigma, margin, cost) # Make EL function of only one var: b_0
    buy_opt = optimize.minimize_scalar(p, bounds = (mu, mu + 2*sigma))
    return int(buy_opt['x']) # optimal buy quantity

In [47]:
# Load cost/margin data
cost_margin = dat0.copy()

cost_margin = pd.DataFrame(
    cost_margin[cost_margin.season.isin(['SS18', 'FW18', 'SS19', 'FW19'])].
                groupby('article_number')[['price', 'cost', 'margin']].
                mean()
)

# Add cost and margin for optimal overbuy estimation                                                                           
preds_season = pd.merge(preds_season, cost_margin, how = 'left', left_on = 'article_number', right_index=True).round()

In [None]:
# see evaluation.ipynb for sd estimation 

# ---- CURRENT ----
# opt_ovb_all = pd.DataFrame(preds_season.
#     apply(lambda row: minimize_EL(row['y_hat'], 550 + 0.2*row['y_hat'], row['margin'], row['cost']), axis=1)
#                           )

# opt_ovb_all = opt_ovb_all.rename(columns = {opt_ovb_all.columns[0]: 'Opt_Ovb'})

# preds_season = pd.merge(
#     preds_season, # everything
#     opt_ovb_all,  # optimal overbuy
#     right_index= True, left_index= True
# )

In [28]:
# ------- Try -------
Opt_Ovb = preds_season.apply(lambda row: minimize_EL(row['y_hat'], 550 + 0.2*row['y_hat'], row['margin'], row['cost']), axis=1)


In [69]:
preds_season = preds_season.set_index('article_number')
preds_season['Opt_Ovb'] = Opt_Ovb

In [39]:
# Profit
def P(d, margin, cost, b):
    if d > b:    # CANNOT satisfy demand
        return b*margin
    
    elif d <= b: # CAN satisfy demand
        return d*margin - (b - d)*cost
    
    else:
        print('Error')

In [86]:
# SS19 only, corrected

# Instead of GAS (quicker)
dat19 = dat0.copy()
dat19 = dat19[(dat19.net_qty > 0) & (dat19.season == 'SS19') & (dat19.article_number.isin(SS19_carryovers.article_number))]

dat19 = dat19[~dat19.price.isna()]
dat19 = dat19.sort_values(['article_number', 'country', 'year', 'week'])

# ---- Replace NAs and zeros (w/ no impact replacements) ----
dat19['buy_availability'] = dat19.buy_availability.fillna(1) # assume full availability 
dat19['buy_availability'] = np.where(dat19.buy_availability == 0, 1, dat19.buy_availability) # replace 0


# ---- Smooth buy_availability ----
def roll(df):
    return df.rolling(window = 5, min_periods = 1, center = True).mean()

dat19['buy_availability'] = dat19.groupby(['article_number', 'country'])['buy_availability'].apply(roll)

# ---- Corrected net_qty ----
dat19['corr_net_qty'] = (dat19.net_qty / dat19.buy_availability).round()

dat19['corr_net_qty'] = dat19.groupby(['article_number', 'year', 'week'])['corr_net_qty'].transform(lambda x: np.sum(x)) # over country

dat19 = dat19[['article_number', 'season', 'year', 'week', 'corr_net_qty']]


dat19['corr_season_net_qty'] = dat19.groupby(['article_number', 'season'])['corr_net_qty'].transform(lambda x: np.sum(x))

dat19 = dat19[['article_number', 'season', 'corr_season_net_qty']].drop_duplicates()

In [99]:
preds_season[['y_hat', 'Ecom_FC_RMA', 'Opt_Ovb']].head()
dat19.head()

Unnamed: 0_level_0,y_hat,Ecom_FC_RMA,Opt_Ovb
article_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11040,207.0,600.0,248.4
15110,1107.0,800.0,1723.0
19000,2878.0,2000.0,3663.0
19228,891.0,500.0,1069.2
19310,319.0,500.0,382.8


Unnamed: 0,article_number,season,corr_season_net_qty
2843310,11040,SS19,209.0
3773754,15110,SS19,1590.0
21785,19000,SS19,2507.0
1411561,19228,SS19,1027.0
2532499,19310,SS19,486.0


In [None]:
# Do some merging...
# ...calculate profits -->

In [None]:
# Profit example
eCom_profit = dat.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Ecom_FC_RMA']), axis=1)
DAA_profit = dat.apply(lambda row: P(row['corr_season_net_qty'], row['margin'], row['cost'], row['Opt_Ovb']), axis=1)
