# Evaluate model improvements

The steps:

1. Calculate 'true' SS19 net demand quantity.
2. Forecast SS19 true net demand with pre-SS19 data, using the methodologies of our previous delivery.
3. Forecast SS19 true net demand with pre-SS19 data, using the improved/current methodologies.
4. Evaluate the two approaches.

In [None]:
import numpy as np
import pandas as pd
import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline

%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

# Use the fully qualified option names: the bare 'max_columns' / 'max_rows'
# patterns are ambiguous on pandas versions that also define
# styler.render.max_columns / max_rows, and set_option then raises.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 5000)

import scipy

import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.tsatools import detrend

import datetime as dt


In [None]:
# Raw sales table; later cells always start from `dat0.copy()`.
dat0 = pd.read_csv('data/ch4k_df_eu.csv')

# Article reference table, limited to the descriptive columns listed here.
ref_dat0 = pd.read_csv(
    'data/Article reference data.csv',
    usecols=[
        'article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc',
        'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc',
    ],
    low_memory=False,
    error_bad_lines=False,
)



In [None]:
# Working copy: complete rows only, season_net_qty above 100,
# ordered by article / country / year / week.
dat = dat0.copy().dropna()
dat = dat.loc[dat['season_net_qty'] > 100]
dat = dat.sort_values(by=['article_number', 'country', 'year', 'week'])



In [None]:
# Articles present in both SS18 and SS19 ("carryovers").
a = set(dat.loc[dat['season'] == 'SS18', 'article_number'])
b = set(dat.loc[dat['season'] == 'SS19', 'article_number'])

carryovers = a & b


len(carryovers)

## Method 0

In [None]:
# Keep only the columns needed for the baseline ("Method 0") forecast.
dat = dat[['article_number', 'year', 'week', 'country', 'season', 'net_qty', 'clearance', 'season_net_qty', 'buy_availability']]

# Collapse the country dimension: one row per article / season / year / week.
#   * net_qty is summed across countries
#   * buy_availability, clearance and season_net_qty are averaged (rounded to 2 dp)
# The averaged columns are selected with a *list*: tuple-style selection
# (`['a', 'b', 'c']` without the inner brackets) is deprecated and was removed
# in pandas 2.0.
dat = pd.merge(
    dat.groupby(['article_number', 'season', 'year', 'week'])['net_qty'].sum().reset_index(),
    dat.groupby(['article_number', 'season', 'year', 'week'])[
        ['buy_availability', 'clearance', 'season_net_qty']
    ].mean().round(2).reset_index(),
)

# For sorting
# key = {'FW16': 1, 'SS17': 2, 'FW17': 3, 'SS18': 4, 'FW18': 5, 'SS19': 6, 'FW19': 7, 'SS20': 8}
# dat['order_key'] = [key[s] for s in dat.season]

In [None]:
# Weekly demand with the clearance share removed, floored at zero.
non_clearance_qty = dat['net_qty'] * (1 - dat['clearance'])
dat['adj_net_qty'] = np.where(non_clearance_qty > 0, non_clearance_qty, 0)

# Season-level adjusted demand: mean adjusted weekly demand over weeks with
# buy_availability above 0.35, scaled by 26
# (presumably the number of weeks in a season -- confirm).
well_stocked_weeks = dat[dat['buy_availability'] > 0.35]
adj_season = (
    well_stocked_weeks
    .groupby(['article_number', 'season'])['adj_net_qty']
    .mean()
    .round(1)
    .mul(26)
    .rename('adj_seas_net_qty')
    .reset_index()
)
# Default (inner) merge on the shared keys article_number / season.
dat = dat.merge(adj_season)

# Never go below the originally reported season total.
dat['adj_seas_net_qty'] = [
    max(reported, adjusted)
    for reported, adjusted in zip(dat['season_net_qty'], dat['adj_seas_net_qty'])
]



In [None]:
# dat[['net_qty', 'adj_net_qty']].describe()
     
# dat[['season_net_qty', 'adj_seas_net_qty']].drop_duplicates().describe()

In [None]:
# Baseline forecast: SS18 adjusted season demand of each carryover article, uplifted by 10%.
ss18_rows = dat['season'] == 'SS18'
dat_delivery = dat.loc[ss18_rows, ['article_number', 'season', 'adj_seas_net_qty']].drop_duplicates()
#pd.crosstab(index = dat[['season', 'article_number']].drop_duplicates().season, columns = 'count')

dat_delivery = dat_delivery.loc[dat_delivery['article_number'].isin(carryovers)]
dat_delivery['DAA_0'] = 1.1 * dat_delivery['adj_seas_net_qty']
dat_delivery.shape

# `mothership` collects one forecast column per method, keyed by article_number.
mothership = dat_delivery[['article_number', 'DAA_0']].copy()

In [None]:
# Preview the baseline (DAA_0) forecast table.
mothership.head()

## Buyers

In [None]:
# Buyers' own forecasts, restricted to SS19 and keyed like the other tables.
buyer_table = pd.read_csv('data/Buyers Predictions.csv', sep=",", low_memory=False, error_bad_lines=False)
buyer_table = (
    buyer_table
    .loc[buyer_table['season'] == 'SS19', ['article', 'season', 'ecom_marketing_forecast']]
    .rename(columns={'article': 'article_number'})
)



In [None]:
# Default (inner) merge on the shared key article_number: articles without a
# buyer forecast drop out of `mothership`.
mothership = mothership.merge(buyer_table[['article_number', 'ecom_marketing_forecast']])

## GAS w/ buy_availability (only)

In [None]:
def initialize_parameters(par = np.array([0.5, 0.9, 0, 1, 0])):
    """Unpack a raw parameter vector into a named dictionary.

    Parameters
    ----------
    par : sequence of 5 numbers
        ``[alpha, beta, mean, sigma, f0]``. The third entry is the
        unconditional mean; the stored intercept is ``mean * (1 - beta)`` so
        that ``omega / (1 - beta)`` equals that mean.

    Returns
    -------
    dict
        Keys ``alpha``, ``beta``, ``omega``, ``sigma``, ``f0`` (in that order).
    """
    alpha, beta, mean, sigma, f0 = par[0], par[1], par[2], par[3], par[4]
    return {
        'alpha': alpha,
        'beta': beta,
        'omega': mean * (1 - beta),
        'sigma': sigma,
        'f0': f0,
    }

def loglik(y, f, x, sigma):
    """Gaussian log-density of ``y`` with mean ``x*f``; ``sigma`` enters as the variance.

    Works element-wise on scalars or equally shaped arrays.
    """
    residual = y - x*f
    constant = -1/2*np.log(2*np.pi)
    return constant - 1/2*np.log(sigma) - 1/(2*sigma)*residual**2


def score_compute(y, f, x, parameters, epsilon = 1e-7 ):
    """Return the scaled prediction error ``(y - x*f) / sigma``.

    Parameters
    ----------
    y, f, x : scalar or ndarray
        Observation, current filtered state and explanatory variable.
    parameters : dict
        Only ``parameters["sigma"]`` is read.
    epsilon : float
        Unused; kept so existing call sites keep working.
    """
    # Only sigma is needed; the former alpha/beta/omega/f0 look-ups were
    # unused and merely forced callers to supply those keys.
    return (y - x*f) / parameters["sigma"]

def filterGAS(y, x, parameters):
    """Run the score-driven recursion and return the filtered path.

    ``f[0] = f0`` and, for ``t >= 1``,
    ``f[t] = omega + alpha * score(y[t-1], f[t-1], x[t-1]) + beta * f[t-1]``.

    Parameters
    ----------
    y, x : ndarray of shape (n, 1)
        Observations and explanatory variable.
    parameters : dict
        As produced by ``initialize_parameters``.

    Returns
    -------
    ndarray of shape (n, 1)
    """
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']

    n_obs = len(y)
    f = np.zeros((n_obs, 1))
    f[0,:] = parameters["f0"]

    # The former `score0` at t = 0 was computed and never used, so it is dropped.
    for t in range(1, n_obs):
        scoret = score_compute(y[t-1,:], f[t-1,:], x[t-1,:], parameters)
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:]

    return f

def loglikest(par, y, x):
    """Objective for the optimizer: average negative log-likelihood.

    Parameters
    ----------
    par : sequence of 5 numbers
        Raw parameter vector, see ``initialize_parameters``.
    y, x : ndarray of shape (n, 1)
        Observations and explanatory variable.

    Returns
    -------
    float
        ``-sum(loglik) / n`` along the path produced by ``filterGAS``.
    """
    parameters = initialize_parameters(par)
    f = filterGAS(y, x, parameters)

    # loglik is element-wise, so one call over the whole arrays gives the same
    # values as the former per-observation loop.
    ll = loglik(y, f, x, parameters["sigma"])

    return -(np.sum(ll))/len(y)

def score_compute_2(y, f, x, parameters, epsilon = 1e-7 ):
    """Return the raw prediction error ``y - x*f`` (the 'type = 2' score).

    Unlike ``score_compute`` the error is not divided by sigma.
    ``parameters`` and ``epsilon`` are not read; they are kept so the
    signature stays parallel to ``score_compute``.
    """
    # The former alpha/beta/omega/sigma/f0 look-ups were unused.
    return y - x*f

def filterGAS_2(y, x, parameters):
    """Same recursion as ``filterGAS`` but driven by the unscaled score.

    ``f[0] = f0`` and, for ``t >= 1``,
    ``f[t] = omega + alpha * score_compute_2(y[t-1], f[t-1], x[t-1]) + beta * f[t-1]``.

    Parameters
    ----------
    y, x : ndarray of shape (n, 1)
    parameters : dict
        As produced by ``initialize_parameters``.

    Returns
    -------
    ndarray of shape (n, 1)
    """
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']

    n_obs = len(y)
    f = np.zeros((n_obs, 1))
    f[0,:] = parameters["f0"]

    # The former `score0` at t = 0 was computed and never used, so it is dropped.
    for t in range(1, n_obs):
        scoret = score_compute_2(y[t-1,:], f[t-1,:], x[t-1,:], parameters)
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:]

    return f

def loglikest_2(par, y, x):
    """Fallback objective: average negative log-likelihood along ``filterGAS_2``.

    Parameters
    ----------
    par : sequence of 5 numbers
        Raw parameter vector, see ``initialize_parameters``.
    y, x : ndarray of shape (n, 1)

    Returns
    -------
    float
        ``-sum(loglik) / n``.
    """
    parameters = initialize_parameters(par)
    f = filterGAS_2(y, x, parameters)

    # loglik is element-wise, so one call over the whole arrays gives the same
    # values as the former per-observation loop.
    ll = loglik(y, f, x, parameters["sigma"])

    return -(np.sum(ll))/len(y)

def GAS_est(df):
    """Fit the availability-driven GAS filter to one group and return its filtered path.

    The parameters are estimated with L-BFGS-B on ``loglikest``. If that run
    does not report success, the fit is repeated with ``loglikest_2``
    (unscaled score) and that result is used whether or not it converged.

    Parameters
    ----------
    df : DataFrame
        Needs the columns ``net_qty``, ``buy_availability``, ``year`` and ``week``.

    Returns
    -------
    DataFrame
        Indexed like ``df`` with the columns ``year``, ``week``, ``GAS_est``,
        ``Convergence`` (optimizer success flag) and ``Convg type``
        (``'One'`` for the first fit, ``'Two'`` for the fallback).
    """
    demand = df.net_qty.values                 # observed demand (response)
    n_obs = len(demand)
    y = demand.reshape((n_obs, 1))
    x = df.buy_availability.values.reshape((n_obs, 1))   # explanatory variable

    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']

    def _fit(objective):
        # Same start values, bounds and optimizer settings for both attempts.
        return scipy.optimize.minimize(
            objective,
            np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]),
            args=(y, x),
            options={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
            method='L-BFGS-B',
            bounds=(
                (0, None),        # alpha
                (-1, 1),          # beta
                (0.001, None),    # omega
                (0.001, None),    # sigma
                (0.001, None),    # f
            ),
        )

    abc = _fit(loglikest)
    run_filter, label = filterGAS, 'One'

    if not abc.success:
        # First attempt failed: retry with the unscaled-score variant.
        print('Convergence failure notification')
        abc = _fit(loglikest_2)
        if not abc.success:
            print('Unresolved convergence failure')
        run_filter, label = filterGAS_2, 'Two'

    ret['GAS_est'] = run_filter(y, x, initialize_parameters(abc.x))
    ret['Convergence'] = [abc.success] * n_obs
    ret['Convg type'] = [label] * n_obs

    return ret

In [None]:
# dat0 = pd.read_csv('data/ch4k_df_eu.csv')

# ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, # index_col = 0, 
#                        error_bad_lines = False,
#                        usecols = ['article_no', 'model_no', 'art_desc', 
#                                   'key_cat_desc', 'sports_cat_desc', 'rmh_cat_desc', 
#                                   'franchise', 'franchise_family',
#                                   'prod_grp_desc', 'prod_type_desc']                      
#                       )

In [None]:
# SS18 rows of the carryover articles, per article / country / week,
# as input for the per-series GAS fit.
dat = dat0.copy().dropna()

keep_rows = (
    (dat['season'] == 'SS18')
    & (dat['season_net_qty'] > 100)          # > 100 already rules out zero
    & dat['article_number'].isin(carryovers)
)
dat = (
    dat.loc[keep_rows, ['article_number', 'year', 'week', 'country', 'season',
                        'net_qty', 'clearance', 'season_net_qty', 'buy_availability']]
    .sort_values(by=['article_number', 'country', 'year', 'week'])
)



In [None]:
# dat['adj_net_qty'] = np.where(dat.net_qty * (1 - dat.clearance) > 0, 
#                               dat.net_qty * (1 - dat.clearance),
#                               0)

In [None]:
%%time 

# Fit one GAS filter per article / country series.
# Filter on `carryovers` instead of the scratch variable `a`: `a` is rebound
# further down to a random sample of articles, so re-running this cell would
# silently fit a different population. On a top-to-bottom run the result is
# unchanged, because every carryover is also in the SS18 set `a` held here.
dat_GAS = dat[dat.article_number.isin(carryovers)].groupby(['article_number', 'country']).apply(GAS_est)
dat_GAS = dat_GAS.reset_index()

In [None]:
# Total filtered demand per article (summed over countries and weeks),
# added to the comparison table with the same 10% uplift as the baseline.
gas_totals = dat_GAS.groupby(['article_number'])['GAS_est'].apply(sum).round(2)
mothership = mothership.merge(gas_totals.reset_index())
mothership['GAS_est'] = 1.1 * mothership['GAS_est']

In [None]:
# Show the comparison table so far (DAA_0, ecom_marketing_forecast, GAS_est).
mothership

In [None]:
# dat = pd.merge(
#     dat,
#     dat_GAS[['article_number', 'country', 'year', 'week', 'GAS_est']]
# )
# dat = dat.sort_values(['article_number', 'country', 'year', 'week'])

In [None]:
# Plot -- EDA

# a = np.random.choice(list(carryovers), size = 1, replace = False)[0]

# dat_a = dat[(dat.article_number == a) & (dat.country == 'EU')]
# dat_a.head()
# dat_a.shape

# plt.rcParams['font.size'] = 11
# plt.rcParams['legend.fontsize'] = 'medium'
# plt.rcParams['figure.titlesize'] = 'medium'
# plt.rcParams["figure.figsize"] = [18,12]

# plt.subplot(3,1,1)
# plt.plot(dat_a.week.round(0).astype(str), dat_a[['net_qty', 'GAS_est']], linewidth = 2.5)

# plt.subplot(3,1,2)
# plt.ylim(0,1)
# plt.plot(dat_a.week.round(0).astype(str), dat_a[['buy_availability']], linewidth = 2.5)


## Seasonality approach

In [None]:
# dat0 = pd.read_csv('data/ch4k_df_eu.csv')
# ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, error_bad_lines = False, 
#                        usecols = ['article_no', 'model_no', 'art_desc', 'sports_cat_desc', 'rmh_cat_desc', 
#                                   'franchise', 'gender_desc', 'age_group_desc', 'prod_grp_desc', 'prod_type_desc'])


In [None]:
# Restart from the raw data: SS17 and SS18 rows with season_net_qty above 100.
dat = dat0.copy().dropna()
dat = dat.loc[(dat['season_net_qty'] > 100) & dat['season'].isin(['SS17', 'SS18'])]

# For constructing seasonality reference by product_type
ref_dat = ref_dat0.copy()
ref_dat = ref_dat[['article_no', 'sports_cat_desc', 'rmh_cat_desc', 'gender_desc', 'age_group_desc', 'franchise', 'prod_grp_desc']].drop_duplicates()

# Attach the article attributes (left join keeps articles without reference data).
dat = dat.merge(ref_dat, how='left', left_on='article_number', right_on='article_no')
dat = dat.sort_values(by=['article_number', 'year', 'week'])





In [None]:
# ---- All-seasons cat-level means ----
seasonality_dat = dat.copy()
seasonality_dat.shape

# Weekly demand per article, summed over countries ...
weekly_demand = (
    seasonality_dat
    .groupby(['article_number', 'season', 'year', 'week'])['net_qty']
    .sum()
    .reset_index()
)
# ... joined with one attribute row per article; rows with any missing value are dropped.
article_attributes = seasonality_dat[
    ['article_number', 'sports_cat_desc', 'rmh_cat_desc', 'gender_desc',
     'age_group_desc', 'franchise', 'prod_grp_desc']
].drop_duplicates()

seasonality_dat = weekly_demand.merge(article_attributes).dropna()


# seasonality_dat[seasonality_dat[['article_number', 'season', 'year', 'week', 'net_qty']].duplicated()].article_number.unique()

# seasonality_dat[seasonality_dat.isna().any(axis=1)]





In [None]:
# Mean weekly net_qty for each level of every article attribute (all seasons in seasonality_dat).
seasonality_dfs = [
    seasonality_dat.groupby([category, 'week'])['net_qty']
    .mean()
    .reset_index()
    .rename(columns={'net_qty': mean_col})
    for category, mean_col in [
        ('sports_cat_desc', 'sport_weekly_mean'),
        ('rmh_cat_desc', 'rmh_weekly_mean'),
        ('gender_desc', 'gender_weekly_mean'),
        ('age_group_desc', 'age_weekly_mean'),
        ('franchise', 'franchise_weekly_mean'),
        ('prod_grp_desc', 'prd_grp_weekly_mean'),
    ]
]

# Individual names are still needed: `regress` looks them up directly.
(seasonality_sport, seasonality_rmh, seasonality_gndr,
 seasonality_agegrp, seasonality_frnchse, seasonality_prdgrp) = seasonality_dfs

In [None]:
# ---- SS18 seasonality (cat-level means) ----
seasonality_dat_SS18 = seasonality_dat.loc[seasonality_dat['season'] == 'SS18']

# Same attribute-level weekly means as above, restricted to SS18.
seasonality_dfs_SS18 = [
    seasonality_dat_SS18.groupby([category, 'week'])['net_qty']
    .mean()
    .reset_index()
    .rename(columns={'net_qty': mean_col})
    for category, mean_col in [
        ('sports_cat_desc', 'sport_weekly_mean_SS18'),
        ('rmh_cat_desc', 'rmh_weekly_mean_SS18'),
        ('gender_desc', 'gender_weekly_mean_SS18'),
        ('age_group_desc', 'age_weekly_mean_SS18'),
        ('franchise', 'franchise_weekly_mean_SS18'),
        ('prod_grp_desc', 'prd_grp_weekly_mean_SS18'),
    ]
]

(seasonality_sport_SS18, seasonality_rmh_SS18, seasonality_gndr_SS18,
 seasonality_agegrp_SS18, seasonality_frnchse_SS18, seasonality_prdgrp_SS18) = seasonality_dfs_SS18

In [None]:
# ---- SS18-SS19 carryovers: SS18 data ----
is_ss18_carryover = (
    (seasonality_dat['season'] == 'SS18')
    & seasonality_dat['article_number'].isin(carryovers)
)
# net_qty is renamed so it cannot be confused with the attribute-level means.
carryovers_SS18 = (
    seasonality_dat.loc[is_ss18_carryover]
    .rename(columns={'net_qty': 'article_net_qty'})
    .sort_values(by=['article_number', 'season', 'year', 'week'])
)

In [None]:
# Add all-season mean column to SS18 df --- to calculate SS19 deseas

net_qty_cols = ['sport_weekly_mean', 'rmh_weekly_mean', 'gender_weekly_mean', 'age_weekly_mean', 'franchise_weekly_mean', 'prd_grp_weekly_mean'] # seasonality_df[<net_qty_by_another_name>]
abbrevs = ['sp', 'rmh', 'gndr', 'age', 'frnchse', 'prd_grp']

# For every attribute: put the all-season mean next to the SS18 mean and store
# their difference (SS18 minus all-season) as '<abbrev>_deseas'.
for i, (mean_col, abbrev) in enumerate(zip(net_qty_cols, abbrevs)):
    combined = pd.merge(seasonality_dfs[i], seasonality_dfs_SS18[i])
    combined[abbrev + '_deseas'] = combined[mean_col + '_SS18'] - combined[mean_col]
    seasonality_dfs_SS18[i] = combined



In [None]:
# function for regressing article net_qty on seasonalities

def regress(df):
    """Regress one article's weekly demand on its six attribute-level weekly means.

    Reads the module-level frames ``seasonality_sport``, ``seasonality_rmh``,
    ``seasonality_gndr``, ``seasonality_agegrp``, ``seasonality_frnchse`` and
    ``seasonality_prdgrp``. The attribute level is taken from the first unique
    value of each attribute column in ``df``.

    Parameters
    ----------
    df : DataFrame
        Needs ``year``, ``week``, ``article_net_qty`` and the six attribute columns.

    Returns
    -------
    DataFrame
        Indexed like ``df`` with ``year``, ``week``, ``seas_preds`` (in-sample
        OLS fit, no intercept is added) and ``deseas_net_qty``
        (``article_net_qty - seas_preds``).
    """
    # for article a's level of each category, retrieve weekly means, then regress 

#     print(df.columns)
#     print(df.iloc[:5,:5])
    
    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']
    
    # article net_demand_qty
    y = df[['article_net_qty', 'week']].set_index('week')

    # article category-level combination weekly means
    # set_index() for joining
    x_sport   = seasonality_sport[seasonality_sport.sports_cat_desc == df.sports_cat_desc.unique()[0]].set_index('week')
    x_rmh     = seasonality_rmh[seasonality_rmh.rmh_cat_desc == df.rmh_cat_desc.unique()[0]].set_index('week')
    x_gndr    = seasonality_gndr[seasonality_gndr.gender_desc == df.gender_desc.unique()[0]].set_index('week')
    x_agegrp  = seasonality_agegrp[seasonality_agegrp.age_group_desc == df.age_group_desc.unique()[0]].set_index('week')
    x_frnchse = seasonality_frnchse[seasonality_frnchse.franchise == df.franchise.unique()[0]].set_index('week')
    x_prdgrp  = seasonality_prdgrp[seasonality_prdgrp.prod_grp_desc == df.prod_grp_desc.unique()[0]].set_index('week')

    # design matrix (ensure 'week' alignment); the attribute label columns are
    # dropped so only article_net_qty and the six weekly means remain
    yX = (pd.merge(y, x_sport, left_index=True, right_index=True).
          merge(x_rmh, left_index=True, right_index=True).
          merge(x_gndr, left_index=True, right_index=True).
          merge(x_agegrp, left_index=True, right_index=True).
          merge(x_frnchse, left_index=True, right_index=True).
          merge(x_prdgrp, left_index=True, right_index=True).
          drop(['sports_cat_desc', 'rmh_cat_desc', 'gender_desc',
               'age_group_desc', 'franchise', 'prod_grp_desc'], axis = 1))

    # predict article 'a' net_demand_qty with 6 article 'a' category-level seasonalities
    # print(df.article_number.unique())
    
    mod = sm.OLS(yX.article_net_qty, yX.drop('article_net_qty', axis = 1), missing='drop').fit()
#     print(df.article_number.unique())
#     print(round(mod.rsquared, 2))
#     print()
    
    # NOTE(review): predict() is assigned positionally, which assumes the merged
    # design matrix has exactly one row per row of `df`, in the same order, and
    # that missing='drop' removed nothing -- confirm.
    ret['seas_preds'] = mod.predict()
    ret['deseas_net_qty'] = df['article_net_qty'] - ret['seas_preds']

    return ret

In [None]:
%%time
carryovers_SS18.shape

# Component seasonality predictions: one regression per article, merged back
# on the shared columns.
seasonal_fit = (
    carryovers_SS18
    .groupby(['article_number'])
    .apply(regress)
    .reset_index()
    .drop(columns='level_1')
)
carryovers_SS18 = carryovers_SS18.merge(seasonal_fit)
carryovers_SS18.shape



In [None]:
def initialize_parameters_0(par = np.array([0.5, 0.9, 0, 1, 0])):
    """Unpack ``[alpha, beta, mean, sigma, f0]`` into a named dictionary.

    The intercept is stored as ``omega = mean * (1 - beta)``.

    Returns
    -------
    dict
        Keys ``alpha``, ``beta``, ``omega``, ``sigma``, ``f0`` (in that order).
    """
    alpha, beta, mean, sigma, f0 = par[0], par[1], par[2], par[3], par[4]
    return {
        'alpha': alpha,
        'beta': beta,
        'omega': mean * (1 - beta),
        'sigma': sigma,
        'f0': f0,
    }


def loglik_0(y, f, sigma):
    """Gaussian log-density of ``y`` with mean ``f``; ``sigma`` enters as the variance.

    Works element-wise on scalars or equally shaped arrays.
    """
    residual = y - f
    constant = -1/2*np.log(2*np.pi)
    return constant - 1/2*np.log(sigma) - 1/(2*sigma)*residual**2


def score_compute_0(y, f, parameters, epsilon = 1e-7 ):
    """Return the scaled prediction error ``(y - f) / sigma``.

    Parameters
    ----------
    y, f : scalar or ndarray
        Observation and current filtered state.
    parameters : dict
        Only ``parameters["sigma"]`` is read.
    epsilon : float
        Unused; kept so existing call sites keep working.
    """
    # Only sigma is needed; the former alpha/beta/omega/f0 look-ups were unused.
    return (y - f) / parameters["sigma"]


def filterGAS_0(y, parameters):
    """Run the score-driven recursion without an explanatory variable.

    ``f[0] = f0`` and, for ``t >= 1``,
    ``f[t] = omega + alpha * score_compute_0(y[t-1], f[t-1]) + beta * f[t-1]``.

    Parameters
    ----------
    y : ndarray of shape (n, 1)
    parameters : dict
        As produced by ``initialize_parameters_0``.

    Returns
    -------
    ndarray of shape (n, 1)
    """
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']

    n_obs = len(y)
    f = np.zeros((n_obs, 1))
    f[0,:] = parameters["f0"]

    # The former `score0` at t = 0 was computed and never used, so it is dropped.
    for t in range(1, n_obs):
        scoret = score_compute_0(y[t-1,:], f[t-1,:], parameters)
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:]

    return f


def loglikest_0(par, y):
    """Objective for the optimizer: average negative log-likelihood along ``filterGAS_0``.

    Parameters
    ----------
    par : sequence of 5 numbers
        Raw parameter vector, see ``initialize_parameters_0``.
    y : ndarray of shape (n, 1)

    Returns
    -------
    float
        ``-sum(loglik_0) / n``.
    """
    parameters = initialize_parameters_0(par)
    f = filterGAS_0(y, parameters)

    # loglik_0 is element-wise, so one call over the whole arrays gives the
    # same values as the former per-observation loop.
    ll = loglik_0(y, f, parameters["sigma"])

    return -(np.sum(ll))/len(y)


def GAS_est_0(df, col):
    """Fit the plain GAS filter to column ``col`` of ``df`` and return the filtered path.

    The fitted parameters are used even when the optimizer reports failure;
    in that case a message is printed.

    Parameters
    ----------
    df : DataFrame
        Needs ``week`` and the column named by ``col``.
    col : str
        Column holding the series to filter.

    Returns
    -------
    DataFrame
        Indexed like ``df`` with the columns ``week`` and ``GAS_est``.
    """
    series = df.loc[:, col].values
    y = series.reshape((len(series), 1))

    ret = pd.DataFrame()
    ret['week'] = df['week']

    start_values = np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)])
    abc = scipy.optimize.minimize(
        loglikest_0,
        start_values,
        args=(y,),
        options={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
        method='L-BFGS-B',
        bounds=(
            (0, None),        # alpha
            (-1, 1),          # beta
            (0.001, None),    # omega
            (0.001, None),    # sigma
            (0.001, None),    # f
        ),
    )

    if not abc.success:
        print('C1 failed for article', abc.message)

    ret['GAS_est'] = filterGAS_0(y, initialize_parameters_0(abc.x))

    return ret

In [None]:
%%time 

categories =  ['sports_cat_desc', 'rmh_cat_desc', 'gender_desc', 'age_group_desc', 'franchise',      'prod_grp_desc']
deseas_cols = ['sp_deseas',       'rmh_deseas',   'gndr_deseas', 'age_deseas',     'frnchse_deseas', 'prd_grp_deseas'] # seasonality_df[<net_qty_by_another_name>]
abbrevs =     ['sp',              'rmh',          'gndr',        'agegrp',         'frnchse',        'prdgrp']

# Filter every attribute level's deseasonalised series with the plain GAS model
# and store the path under a distinct 'GAS_est_<abbrev>_deseas' column.
for i, (category, deseas_col, abbrev) in enumerate(zip(categories, deseas_cols, abbrevs)):
    frame = seasonality_dfs_SS18[i]
    gas_path = (
        frame.groupby(category)
        .apply(GAS_est_0, col=deseas_col)
        .reset_index()
        .drop(columns='level_1')
    )
    seasonality_dfs_SS18[i] = frame.merge(gas_path).rename(
        columns={'GAS_est': 'GAS_est_' + abbrev + '_deseas'}
    )
  

In [None]:
def initialize_parameters_mv(par = np.array([0.5, 0.9, 0, 1, 0, 0, 0, 0, 0, 0, 0])):
    """Unpack the 11-element multi-factor parameter vector into a named dictionary.

    Layout of ``par``: ``[alpha, beta, mean, sigma, f0]`` followed by the six
    factor loadings (sport, rmh, gender, age, franchise, prod_grp).
    The intercept is stored as ``omega = mean * (1 - beta)``.

    Returns
    -------
    dict
        Keys ``alpha``, ``beta``, ``omega``, ``sigma``, ``f0``, ``l_sport``,
        ``l_rmh``, ``l_gender``, ``l_age``, ``l_franchise``, ``l_prod_grp``.
    """
    parameters = {
        'alpha': par[0],
        'beta': par[1],
        'omega': par[2] * (1 - par[1]),
        'sigma': par[3],
        'f0': par[4],
    }

    # l for lambda: loadings sit at positions 5..10
    loading_names = ('l_sport', 'l_rmh', 'l_gender', 'l_age', 'l_franchise', 'l_prod_grp')
    for position, name in enumerate(loading_names, start=5):
        parameters[name] = par[position]

    return parameters


def loglik_mv(y, f, 
              f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr, 
              l_sport, l_rmh, l_gender, l_age, l_franchise, l_prod_grp, 
              sigma):
    """Gaussian log-density of ``y`` around the multi-factor mean; ``sigma`` enters as the variance.

    The mean is ``f`` plus the six factor paths, each weighted by its loading.
    Works element-wise on scalars or equally shaped arrays.
    """
    mean = f + l_sport*f_sp + l_rmh*f_rmh + l_gender*f_gndr + l_age*f_age + l_franchise*f_fr + l_prod_grp*f_pr
    residual = y - mean
    constant = -1/2*np.log(2*np.pi)

    return constant - 1/2*np.log(sigma) - 1/(2*sigma)*residual**2


def score_compute_mv(y, f, 
                     f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr, 
                     parameters, epsilon = 1e-7):
    """Return the scaled prediction error of the multi-factor model.

    The prediction is ``f`` plus the six factor paths weighted by their
    loadings; the error is divided by ``sigma``.

    Parameters
    ----------
    y, f, f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr : scalar or ndarray
        Observation, current state and the six factor values.
    parameters : dict
        Reads ``sigma`` and the six ``l_*`` loadings only.
    epsilon : float
        Unused; kept so existing call sites keep working.
    """
    # The former alpha/beta/omega/f0 look-ups were unused and are dropped.
    mean = (f
            + parameters['l_sport']*f_sp
            + parameters['l_rmh']*f_rmh
            + parameters['l_gender']*f_gndr
            + parameters['l_age']*f_age
            + parameters['l_franchise']*f_fr
            + parameters['l_prod_grp']*f_pr)

    return (y - mean) / parameters["sigma"]


def filterGAS_mv(y, f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr, parameters):
    """Run the score-driven recursion of the multi-factor model.

    ``f[0] = f0`` and, for ``t >= 1``,
    ``f[t] = omega + alpha * score_compute_mv(... at t-1 ...) + beta * f[t-1]``.

    Parameters
    ----------
    y, f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr : ndarray of shape (n, 1)
        Observations and the six factor paths.
    parameters : dict
        As produced by ``initialize_parameters_mv``.

    Returns
    -------
    ndarray of shape (n, 1)
    """
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']

    n_obs = len(y)
    f = np.zeros((n_obs, 1))
    f[0,:] = parameters["f0"]

    # The former `score0` at t = 0 and the local copies of sigma and the
    # loadings were never used, so they are dropped.
    for t in range(1, n_obs):
        prev = t - 1
        scoret = score_compute_mv(y[prev,:], f[prev,:],
                                  f_sp[prev,:], f_rmh[prev,:], f_gndr[prev,:],
                                  f_age[prev,:], f_fr[prev,:], f_pr[prev,:],
                                  parameters)
        f[t,:] = omega + alpha*scoret + beta*f[prev,:]

    return f


def loglikest_mv(parameters, y, f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr):
    """Objective for the optimizer: average negative log-likelihood of the multi-factor model.

    Parameters
    ----------
    parameters : sequence of 11 numbers
        Raw parameter vector, see ``initialize_parameters_mv``.
    y : ndarray of shape (n,) or (n, 1)
        Observations; a 1-D array is reshaped to a column.
    f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr : ndarray of shape (n, 1)
        The six factor paths.

    Returns
    -------
    float
        ``-sum(loglik_mv) / n`` along the path produced by ``filterGAS_mv``.
    """
    parameters = initialize_parameters_mv(parameters)

    m = len(y)
    if y.ndim == 1:
        y = y.reshape(m, 1)

    f = filterGAS_mv(y, f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr, parameters)

    # loglik_mv is element-wise, so one call over the whole arrays gives the
    # same values as the former per-observation loop.
    ll = loglik_mv(y, f,
                   f_sp, f_rmh, f_gndr, f_age, f_fr, f_pr,
                   parameters['l_sport'], parameters['l_rmh'], parameters['l_gender'],
                   parameters['l_age'], parameters['l_franchise'], parameters['l_prod_grp'],
                   parameters["sigma"])

    return -(np.sum(ll))/m

In [None]:
# ---- create df for next GAS-ing ----
# Attach each attribute-level frame (with its GAS path) to the article rows.
# Left joins keep every article row.
dat_factors = carryovers_SS18
for category_frame in seasonality_dfs_SS18:
    dat_factors = pd.merge(dat_factors, category_frame, how='left')

# dat_factors.columns
# Keep the keys, the article's deseasonalised demand and the six factor paths.
dat_factors = dat_factors[[
    'article_number', 'year', 'week', 'season',
    'deseas_net_qty',
    'GAS_est_sp_deseas', 'GAS_est_rmh_deseas', 'GAS_est_gndr_deseas',
    'GAS_est_agegrp_deseas', 'GAS_est_frnchse_deseas', 'GAS_est_prdgrp_deseas',
]]

In [None]:
# Preview the article rows together with their six factor paths.
dat_factors.head()

In [None]:
# GAS behemoth

def GAS_est_mv(df):
    """Fit the multi-factor GAS model to one article and return the filtered path.

    For the article in ``df``:
      * take its deseasonalised demand ``deseas_net_qty`` as the response,
      * retrieve the six attribute-level GAS factor paths,
      * estimate the 11 parameters with L-BFGS-B on ``loglikest_mv``,
      * run ``filterGAS_mv`` with the fitted parameters (also when the
        optimizer reports failure; a message is printed in that case).

    Parameters
    ----------
    df : DataFrame
        Needs ``week``, ``article_number``, ``deseas_net_qty`` and the six
        ``GAS_est_*_deseas`` columns.

    Returns
    -------
    DataFrame
        Indexed like ``df`` with the columns ``week`` and ``deseas_GAS_est``.
    """
    ret = pd.DataFrame()
    ret['week'] = df['week']

    n_obs = len(df)
    y = df['deseas_net_qty'].values.reshape(n_obs, 1)

    # Factor paths in the order expected by loglikest_mv / filterGAS_mv:
    # sport, rmh, gender, age group, franchise, product group.
    factor_columns = [
        'GAS_est_sp_deseas', 'GAS_est_rmh_deseas', 'GAS_est_gndr_deseas',
        'GAS_est_agegrp_deseas', 'GAS_est_frnchse_deseas', 'GAS_est_prdgrp_deseas',
    ]
    factors = tuple(df[name].values.reshape((n_obs, 1)) for name in factor_columns)

    core_bounds = (
        (0, None),        # alpha
        (-1, 1),          # beta
        (0.001, None),    # omega
        (0.001, None),    # sigma
        (0.001, None),    # f
    )
    loading_bounds = ((None, None),) * 6   # the six lambdas are unconstrained

    abc = scipy.optimize.minimize(
        loglikest_mv,
        np.array([0.8, 0.9, np.mean(y), 1, np.mean(y), 0, 0, 0, 0, 0, 0]),
        args=(y,) + factors,
        options={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
        method='L-BFGS-B',
        bounds=core_bounds + loading_bounds,
    )

    # --- CONVERGENCE check message ---
    if not abc.success:
        print('Convergence failed', abc.message)
        print('Article:', df.article_number.unique()[0])
        print()

    ret['deseas_GAS_est'] = filterGAS_mv(y, *factors, initialize_parameters_mv(abc.x))

    return ret

In [None]:
# Random sample of 50 articles to keep the multi-factor fit fast.
# Note: this rebinds `a`, which held the set of SS18 article numbers earlier in the notebook.
all_articles = dat_factors['article_number'].unique()
a = np.random.choice(all_articles, size=50, replace=False)
dat_factors_subset = dat_factors.loc[dat_factors['article_number'].isin(a)]

In [None]:
%%time

# 10 = 1 min
# 20 = 1 min 20s
# 50 = 2 min 45s

dat_factors_subset

# One multi-factor fit per sampled article, merged back on the shared columns.
mv_paths = (
    dat_factors_subset
    .groupby('article_number')
    .apply(GAS_est_mv)
    .reset_index()
    .drop(columns='level_1')
)
dat_factors_subset = dat_factors_subset.merge(mv_paths)

dat_factors_subset.shape
dat_factors_subset.head()

In [None]:
# y_est = seas + deseas = 
#       = carryover_SS18['seas_preds'] + dat_factors['deseas_GAS_est']

# Default (inner) merge: only the sampled articles remain in carryovers_SS18.
carryovers_SS18 = carryovers_SS18.merge(
    dat_factors_subset[['article_number', 'season', 'year', 'week', 'deseas_GAS_est']]
)

In [209]:
# Seasonal-only forecast per article (sum of weekly fits, 10% uplift) next to the other methods.
mothership.merge(
    (carryovers_SS18.groupby('article_number')['seas_preds'].apply(sum) * 1.1).reset_index()
)

Unnamed: 0,article_number,DAA_0,ecom_marketing_forecast,GAS_est,seas_preds
0,807295,860.2,900.0,2376.297,878.016382
1,AJ5881,258.5,407.0,382.404,236.854112
2,BD5321,293.7,250.0,789.118,293.830528
3,BJ9174,549.12,489.0,926.541,496.170246
4,BK3161,552.2,0.0,700.469,545.993015
5,BP6233,220.22,998.0,341.451,204.931391
6,BR1065,223.08,440.0,236.137,222.729893
7,BR5110,294.8,0.0,342.837,294.257353
8,BY9434,532.4,0.0,981.068,567.415294
9,BY9544,6755.32,6831.0,8351.53,6113.531021


In [None]:
# NOTE(review): `y_star` is only assigned in a later cell, so on a top-to-bottom
# run this line raises NameError. The addition also aligns the two Series by
# index, not by article / week. The later `y_est` column (built after merging
# `y_star` on its keys) appears to supersede this -- confirm and consider removing.
carryovers_SS18['net_qty_est'] = carryovers_SS18.seas_preds + y_star.y_star 
# y^* = f + lambda1*f_s1 + lambda2*f_s2 + ... + f_s5
# y_hat = y_hat_seas + y_star

In [None]:
# OLS to reverse engineer coefficients and predict!!

# Combine into one df for regression trick
# Factor paths plus the fitted multi-factor state, joined on the shared key columns.
dat_factors2 = dat_factors.merge(
    carryovers_SS18[['article_number', 'season', 'year', 'week', 'deseas_GAS_est']]
)

# recreate coefficients, predict
def regress2(df):
    """Regress deseasonalised demand on the fitted state and the six factor paths.

    An OLS without intercept is fitted on ``deseas_GAS_est`` and the six
    ``GAS_est_*_deseas`` columns; its in-sample predictions, rounded to two
    decimals, are returned.

    Parameters
    ----------
    df : DataFrame
        Needs ``week``, ``deseas_net_qty`` and the seven regressor columns.

    Returns
    -------
    DataFrame
        Indexed like ``df`` with the columns ``week`` and ``y_star``.
    """
    regressors = df[['deseas_GAS_est',
                     'GAS_est_sp_deseas', 'GAS_est_rmh_deseas', 'GAS_est_gndr_deseas',
                     'GAS_est_agegrp_deseas', 'GAS_est_frnchse_deseas', 'GAS_est_prdgrp_deseas']]

    ret = pd.DataFrame()
    ret['week'] = df.week

    fitted = sm.OLS(df.deseas_net_qty, regressors).fit().predict()
    ret['y_star'] = fitted.round(2)

    return ret

# true-er deseasonalized 
y_star = (
    dat_factors2
    .groupby('article_number')
    .apply(regress2)
    .reset_index()
    .drop(columns='level_1')
)

# Merge on the shared columns (article_number, week).
carryovers_SS18 = carryovers_SS18.merge(y_star)

In [None]:
# seasonal component + true-er deseasonalized component
carryovers_SS18['y_est'] = carryovers_SS18['seas_preds'] + carryovers_SS18['y_star']

In [210]:
# Full forecast per article (seasonal + deseasonalised part, 10% uplift) next to the other methods.
mothership.merge(
    (carryovers_SS18.groupby('article_number')['y_est'].sum() * 1.1).reset_index()
)


Unnamed: 0,article_number,DAA_0,ecom_marketing_forecast,GAS_est,y_est
0,807295,860.2,900.0,2376.297,872.747382
1,AJ5881,258.5,407.0,382.404,253.255112
2,BD5321,293.7,250.0,789.118,296.976528
3,BJ9174,549.12,489.0,926.541,552.875246
4,BK3161,552.2,0.0,700.469,546.609015
5,BP6233,220.22,998.0,341.451,213.291391
6,BR1065,223.08,440.0,236.137,223.169893
7,BR5110,294.8,0.0,342.837,292.893353
8,BY9434,532.4,0.0,981.068,552.917294
9,BY9544,6755.32,6831.0,8351.53,5995.864021


## Actuals

In [None]:
# Realised SS19 net demand per article, from complete rows with
# season_net_qty above 100. The result is a Series indexed by article_number.
dat = dat0.copy()

dat = dat.dropna()
dat = dat[(dat.season_net_qty > 100) & (dat.season == 'SS19')]
# Aggregate with `.sum()` directly: `.apply` is documented to take a callable,
# and passing the string 'sum' relies on string dispatch that is not
# guaranteed across pandas versions.
dat = dat.groupby('article_number')['net_qty'].sum()

## Appendix

In [None]:
# dat_SS18 = dat[dat.season == 'SS18'].copy().rename(columns = {'net_qty': 'article_net_qty'}).drop(['clearance', 'margin', 'gross_demand_quantity', 'season_gross_demand_quantity'], axis = 1)

# # ---- For development select subset of articles ----
# articles = dat_SS18.article_number.unique()
# articles_subset = np.random.choice(articles, size = 5, replace = False)

# dat_SS18 = dat_SS18[dat_SS18.article_number.isin(articles_subset)]



In [None]:
# a = np.random.choice(articles_subset, size = 1, replace = False)
# dat_SS18[dat_SS18.article_number == a[0]][['article_net_qty', 'seas_preds']].plot(linewidth = 3)