In [None]:
import numpy as np
import pandas as pd
import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

# init_notebook_mode()

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd
#from chest import Chest

InteractiveShell.ast_node_interactivity = "all"
#InteractiveShell.ast_node_interactivity = "last"

# Magic function to make matplotlib inline
%matplotlib inline

%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 800)

import scipy

import statsmodels.api as sm

# Functions

In [None]:
def initialize_parameters(par = np.array([0.5, 0.9, 0, 1, 0])):
    # np.random.seed(3)
    parameters = {}

    parameters['alpha'] = par[0]
    parameters['beta'] = par[1]
    parameters['omega'] = par[2]* (1-par[1])    # one way to choose that is omega/(1-beta) = unconditional mean 
    parameters['sigma'] = par[3]
    parameters['f0'] = par[4]                   # one way to choose is unconditional mean 

    return parameters

def loglik(y, f, x, sigma):
    ll = -1/2*np.log(2*np.pi ) - 1/2*np.log(sigma) - 1/(2*sigma)*(y - x*f)**2 
    return ll


def score_compute(y, f, x, parameters, epsilon = 1e-7 ):
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    
    score = (y - x*f)/sigma
    # score = (y - x*f)
    
    return score

def filterGAS(y, x, parameters):
    
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    score0 = score_compute(y[0,:],  f0, x[0,:], parameters, epsilon = 1e-7) 
    f = np.zeros((len(y),1))
    
    f[0,:] = f0
    for t in range(1,len(y)):
        scoret = score_compute(y[t-1,:], f[t-1,:], x[t-1,:], parameters, epsilon = 1e-7) 
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:] 

    return f

def loglikest(par, y, x):
    parameters = initialize_parameters(par)
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    sigma = parameters["sigma"]
    # f0 = parameters["f0"]
    
    f = filterGAS(y, x, parameters) 
    ll = np.zeros((len(y), 1))
    m = len(y)

    for t in range(0, len(y)):
         ll[t,:] = loglik(y[t,:], f[t,:], x[t,:], sigma)
    loglik_res = -(np.sum(ll))/m
        
#     else:
#         loglik_res=10**9 # causing gradient problems??

    return loglik_res

In [None]:
def score_compute_2(y, f, x, parameters, epsilon = 1e-7 ):
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    
    score = (y - x*f) # ** The 'type = 2' modification **
    
    return score

def filterGAS_2(y, x, parameters):
    
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    score0 = score_compute_2(y[0,:],  f0, x[0,:], parameters, epsilon = 1e-7) 
    f = np.zeros((len(y),1))
    
    f[0,:] = f0
    for t in range(1,len(y)):
        scoret = score_compute_2(y[t-1,:], f[t-1,:], x[t-1,:], parameters, epsilon = 1e-7) 
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:] 

    return f

def loglikest_2(par, y, x):
    parameters = initialize_parameters(par)
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    sigma = parameters["sigma"]
    # f0 = parameters["f0"]
    
    f = filterGAS_2(y, x, parameters) 
    ll = np.zeros((len(y), 1))
    m = len(y)

    for t in range(0, len(y)):
         ll[t,:] = loglik(y[t,:], f[t,:], x[t,:], sigma)
    loglik_res = -(np.sum(ll))/m
        
#     else:
#         loglik_res=10**9 # causing gradient problems??

    return loglik_res

In [None]:
def GAS_est(df):
    
    y = df.net_qty.values          # observed demand (response)
    x = df.buy_availability.values # buy_availability (explanatory)

    y = y.reshape((len(y),1)) 
    x = x.reshape((len(y),1))
    
    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']
        
    abc = scipy.optimize.minimize(
        loglikest,                                       # function to minimize (log likelihood y|x,theta)
        np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]), # initial parameter values (starting)
        args=(y, x), 
        options ={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
        method='L-BFGS-B', 
        bounds=((0,  None),             # alpha
                (-1, 1),                # beta
                (0.001, np.mean(y)*2),  # omega 
                (0.001, None),          # sigma
                (0.001, np.mean(y)*2)   # f
               )
    )
    
    
    # --- CONVERGENCE control flow ---
    if abc.success == True:
        
        x1par = initialize_parameters(abc.x) 
        GAS = filterGAS(y, x, x1par)
        
        ret['GAS_est'] = GAS
        ret['Convergence'] = [abc.success] * len(y)
        ret['Convg type'] = ['One'] * len(y)
        
    # **Modification if first algorithm fails
    elif abc.success == False:
        
        abc = scipy.optimize.minimize(
            loglikest_2,                                       # function to minimize (log likelihood y|x,theta)
            np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]),   # initial parameter values (starting)
            args=(y, x), 
            options ={'eps':1e-09, 'maxiter': 600, 'ftol': 1e-12},
            method='L-BFGS-B', 
            bounds=((0,  None),             # alpha
                    (-1, 1),                # beta
                    (0.001, np.mean(y)*2),  # omega 
                    (0.001, None),          # sigma
                    (0.001, np.mean(y)*2)   # f
                   )
                )

        x1par = initialize_parameters(abc.x) 
        GAS = filterGAS_2(y, x, x1par)
        
        ret['GAS_est'] = GAS
        ret['Convergence'] = [abc.success] * len(y)
        ret['Convg type'] = ['Two'] * len(y)

    return ret

## Carryover 

In [None]:
rma1_0 = pd.read_csv('data/article_range_rma1_adidas_fw20.csv', low_memory = False,
                  error_bad_lines = False, sep = ";")

rma1_0 = rma1_0[rma1_0['eCom Range'] == 'YES']

In [None]:
rma1_rbk = pd.read_csv('data/reebok_FW20range.csv') # Reebok FW20 range

In [None]:
buyer_table = pd.read_csv('data/buyer_table.csv', low_memory = False,
                  error_bad_lines = False, sep = ",")

In [None]:
buyer_table2 = buyer_table[(buyer_table.season == 'FW19')]# &
                           # (buyer_table.ecom_marketing_forecast > 1)]
buyer_table2.brand.value_counts()
buyer_table2.head()

In [None]:
rma1 = rma1_0.copy()

# FW19 to FW20 carryovers
carryovers = set(buyer_table2.article).intersection(set(rma1['Article Number']))
len(carryovers)

In [None]:
rbk_carryovers = set(buyer_table2.article).intersection(set(rma1_rbk.iloc[:,0]))
len(rbk_carryovers)

## Data

In [187]:
dat0 = pd.read_csv('data/ch4k_df_eu.csv')

ref_dat0 = pd.read_csv('data/Article reference data.csv', low_memory = False, # index_col = 0, 
                       error_bad_lines = False,
                       usecols = ['article_no', 'model_no', 'art_desc', 
                                  'sports_cat_desc', 'rmh_cat_desc', 
                                  'franchise', 'gender_desc', 'age_group_desc',
                                  'prod_grp_desc', 'prod_type_desc']                      
                      )



In [188]:
dat = dat0.copy()
ref_dat = ref_dat0.copy()

dat.dropna(inplace = True)
dat = dat[(dat.season_net_qty > 100) &
          (dat.country == 'EU')] # temporary

# For constructing seasonality reference by product_type
ref_dat = ref_dat[['article_no', 
                   'sports_cat_desc', 'rmh_cat_desc', 
                   'gender_desc', 'age_group_desc',
                   'franchise', 'prod_grp_desc']].drop_duplicates() 

dat = pd.merge(dat, ref_dat, left_on='article_number', right_on='article_no', how = 'left')

dat = dat.sort_values(['article_number', 'year', 'week'])

# For sorting
# key = {'FW16': 1, 'SS17': 2, 'FW17': 3, 'SS18': 4, 'FW18': 5, 'SS19': 6, 'FW19': 7, 'SS20': 8}
# dat['order_key'] = [key[s] for s in dat.season]

In [203]:
# for c in ref_dat.columns:
#     print(c, len(ref_dat[c].unique()))
#     # print(ref_dat[c].unique())
#     print()

# pd.crosstab(index = [dat['prod_grp_desc']], columns = 'count')

In [192]:
dat[dat.season == 'SS19']

Unnamed: 0,article_number,brand,year,week,country,season,gross_demand_quantity,net_qty,clearance,margin,season_gross_demand_quantity,season_net_qty,buy_availability,cost,price,article_no,sports_cat_desc,rmh_cat_desc,gender_desc,age_group_desc,franchise,prod_grp_desc
687305,011040,adidas,2018.0,48.0,EU,SS19,1,1,0.00,79.37,161.0,116.0,1.00,40.03,119.40,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
897327,011040,adidas,2018.0,49.0,EU,SS19,10,8,0.00,68.24,161.0,116.0,1.00,40.02,108.26,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
336930,011040,adidas,2018.0,50.0,EU,SS19,6,5,0.00,73.21,161.0,116.0,1.00,40.03,113.24,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
902066,011040,adidas,2018.0,52.0,EU,SS19,3,2,0.00,83.82,161.0,116.0,1.00,40.12,123.94,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
906720,011040,adidas,2019.0,1.0,EU,SS19,1,1,0.00,86.28,161.0,116.0,1.00,40.15,126.43,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
98298,011040,adidas,2019.0,2.0,EU,SS19,3,3,0.00,87.11,161.0,116.0,1.00,40.12,127.23,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
332299,011040,adidas,2019.0,3.0,EU,SS19,5,5,0.00,80.39,161.0,116.0,1.00,40.12,120.51,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
729827,011040,adidas,2019.0,4.0,EU,SS19,10,7,0.00,55.25,161.0,116.0,1.00,39.97,95.22,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
191868,011040,adidas,2019.0,5.0,EU,SS19,3,3,0.00,60.66,161.0,116.0,1.00,40.01,100.67,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
4774,011040,adidas,2019.0,6.0,EU,SS19,3,3,0.00,74.59,161.0,116.0,1.00,40.06,114.65,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES


## Attribute-level Specific  Seasonality 

In [214]:
# Calculate weekly means, by category, for well selling articles, pre-FW19
# Use SS seasons for now, b/c there is no FW19 in my data to experiment on
seasonality_dat = dat[(dat.season_net_qty > 500) &
                      # (dat.season.isin(['FW16', 'FW17', 'FW18']))
                        (dat.season.isin(['SS16', 'SS17', 'SS18'])) # will use these to 'predict' SS19
                       ]

seasonality_sport   = pd.DataFrame(seasonality_dat.groupby(['sports_cat_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'sport_weekly_mean'})
seasonality_rmh     = pd.DataFrame(seasonality_dat.groupby(['rmh_cat_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'rmh_weekly_mean'})
seasonality_gndr    = pd.DataFrame(seasonality_dat.groupby(['gender_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'gender_weekly_mean'})
seasonality_agegrp  = pd.DataFrame(seasonality_dat.groupby(['age_group_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'age_weekly_mean'})
seasonality_frnchse = pd.DataFrame(seasonality_dat.groupby(['franchise', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'franchise_weekly_mean'})
seasonality_prdgrp  = pd.DataFrame(seasonality_dat.groupby(['prod_grp_desc', 'week'])['net_qty'].mean()).reset_index().rename(columns = {'net_qty': 'prd_grp_weekly_mean'})



In [208]:
seasonality_sport.head()
seasonality_dat.head()


Unnamed: 0,sports_cat_desc,week,sport_weekly_mean
0,BASKETBALL,1.0,29.263158
1,BASKETBALL,2.0,23.85
2,BASKETBALL,3.0,22.615385
3,BASKETBALL,4.0,21.875
4,BASKETBALL,5.0,23.219512


Unnamed: 0,article_number,brand,year,week,country,season,gross_demand_quantity,net_qty,clearance,margin,season_gross_demand_quantity,season_net_qty,buy_availability,cost,price,article_no,sports_cat_desc,rmh_cat_desc,gender_desc,age_group_desc,franchise,prod_grp_desc
729828,15110,adidas,2016.0,49.0,EU,SS17,5,5,0.0,65.21,649.0,507.0,0.0,32.93,98.14,15110,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
677959,15110,adidas,2016.0,50.0,EU,SS17,21,12,0.0,71.45,649.0,507.0,0.0,32.37,103.82,15110,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
210382,15110,adidas,2016.0,51.0,EU,SS17,38,24,0.0,70.71,649.0,507.0,0.0,32.28,102.99,15110,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
168462,15110,adidas,2016.0,52.0,EU,SS17,24,19,0.0,79.16,649.0,507.0,0.0,31.78,110.94,15110,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
757995,15110,adidas,2017.0,1.0,EU,SS17,26,18,0.0,80.86,649.0,507.0,0.0,30.61,111.47,15110,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES


In [251]:
dat_SS19 = dat[dat.season == 'SS19'].copy().rename(columns = {'net_qty': 'article_net_qty'})

# SS19 articles
# For: article ~ seasonality_(article_attribute_level)

articles = dat_SS19.article_number.unique()
# a = np.random.choice(articles, size = 1, replace = False)[0]
# a

In [212]:
dat_SS19

Unnamed: 0,article_number,brand,year,week,country,season,gross_demand_quantity,article_net_qty,clearance,margin,season_gross_demand_quantity,season_net_qty,buy_availability,cost,price,article_no,sports_cat_desc,rmh_cat_desc,gender_desc,age_group_desc,franchise,prod_grp_desc
687305,011040,adidas,2018.0,48.0,EU,SS19,1,1,0.00,79.37,161.0,116.0,1.00,40.03,119.40,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
897327,011040,adidas,2018.0,49.0,EU,SS19,10,8,0.00,68.24,161.0,116.0,1.00,40.02,108.26,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
336930,011040,adidas,2018.0,50.0,EU,SS19,6,5,0.00,73.21,161.0,116.0,1.00,40.03,113.24,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
902066,011040,adidas,2018.0,52.0,EU,SS19,3,2,0.00,83.82,161.0,116.0,1.00,40.12,123.94,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
906720,011040,adidas,2019.0,1.0,EU,SS19,1,1,0.00,86.28,161.0,116.0,1.00,40.15,126.43,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
98298,011040,adidas,2019.0,2.0,EU,SS19,3,3,0.00,87.11,161.0,116.0,1.00,40.12,127.23,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
332299,011040,adidas,2019.0,3.0,EU,SS19,5,5,0.00,80.39,161.0,116.0,1.00,40.12,120.51,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
729827,011040,adidas,2019.0,4.0,EU,SS19,10,7,0.00,55.25,161.0,116.0,1.00,39.97,95.22,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
191868,011040,adidas,2019.0,5.0,EU,SS19,3,3,0.00,60.66,161.0,116.0,1.00,40.01,100.67,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES
4774,011040,adidas,2019.0,6.0,EU,SS19,3,3,0.00,74.59,161.0,116.0,1.00,40.06,114.65,011040,FOOTBALL/SOCCER,FOOTBALL GENERIC,MEN,ADULT,NOT APPLICABLE,SHOES


In [228]:
# function for regressing article on seasonalities

def process(df):
    # ---- for article a's level of each attribute, calculate weekly means -----

    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']
    ret['country'] = df['country']
    
    # article net_demand_qty
    y = df[['article_net_qty', 'week']].set_index('week')

    # article category+level weekly means
    # set_index() for joining
    x_sport   = seasonality_sport[seasonality_sport.sports_cat_desc == dat_a.sports_cat_desc.unique()[0]].set_index('week')
    x_rmh     = seasonality_rmh[seasonality_rmh.rmh_cat_desc == dat_a.rmh_cat_desc.unique()[0]].set_index('week')
    x_gndr    = seasonality_gndr[seasonality_gndr.gender_desc == dat_a.gender_desc.unique()[0]].set_index('week')
    x_agegrp  = seasonality_agegrp[seasonality_agegrp.age_group_desc == dat_a.age_group_desc.unique()[0]].set_index('week')
    x_frnchse = seasonality_frnchse[seasonality_frnchse.franchise == dat_a.franchise.unique()[0]].set_index('week')
    x_prdgrp  = seasonality_prdgrp[seasonality_prdgrp.prod_grp_desc == dat_a.prod_grp_desc.unique()[0]].set_index('week')

    # design matrix (ensure 'week' alignment)
    yX = (pd.merge(y, x_sport, left_index=True, right_index=True).
          merge(x_rmh, left_index=True, right_index=True).
          merge(x_gndr, left_index=True, right_index=True).
          merge(x_agegrp, left_index=True, right_index=True).
          merge(x_frnchse, left_index=True, right_index=True).
          merge(x_prdgrp, left_index=True, right_index=True).
          drop(['sports_cat_desc', 'rmh_cat_desc', 'gender_desc',
               'age_group_desc', 'franchise', 'prod_grp_desc'], axis = 1))

    # predict article 'a' net_demand_qty with 5 article a category-level seasonalities
    mod = sm.OLS(yX.article_net_qty, yX.drop('article_net_qty', axis = 1), missing='drop').fit()
    
    ret['seas_preds'] = mod.predict()

    return ret


In [247]:
# ~12000 total SS19 articles
articles_test = np.random.choice(articles, size = 50, replace = False)

dat_SS19_test = dat_SS19[dat_SS19.article_number.isin(articles_test)]


In [249]:
%%time
# 500 -- 8 seconds
dat_SS19_test.groupby(['article_number']).apply(process).reset_index().drop('level_1', axis = 1)

CPU times: user 844 ms, sys: 5.87 ms, total: 850 ms
Wall time: 847 ms


Unnamed: 0,article_number,year,week,country,seas_preds
0,AC7932,2018.0,48.0,EU,2.594896
1,AC7932,2018.0,49.0,EU,11.410541
2,AC7932,2018.0,50.0,EU,14.873851
3,AC7932,2018.0,51.0,EU,10.902777
4,AC7932,2018.0,52.0,EU,6.887653
5,AC7932,2019.0,1.0,EU,8.158033
6,AC7932,2019.0,2.0,EU,8.829637
7,AC7932,2019.0,3.0,EU,5.617653
8,AC7932,2019.0,4.0,EU,9.563550
9,AC7932,2019.0,5.0,EU,4.548046


In [222]:
yX

Unnamed: 0_level_0,article_net_qty,sport_weekly_mean,rmh_weekly_mean,gender_weekly_mean,age_weekly_mean,franchise_weekly_mean,prd_grp_weekly_mean
week,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
48.0,1,7.162047,7.147436,7.0176,7.033191,5.0,5.48227
49.0,2,35.061224,35.035484,28.270504,27.713314,19.727273,24.955263
50.0,3,68.049407,68.031652,54.517702,53.995577,23.666667,51.854531
51.0,1,66.095331,66.110029,53.01043,52.195397,16.75,52.056031
52.0,1,56.927184,56.941691,43.007138,44.705095,23.416667,49.674419
1.0,6,55.773407,55.93207,43.22093,47.399334,21.5,48.016351
2.0,1,52.046681,52.141288,40.953286,46.475601,32.384615,47.571516
3.0,4,43.02953,43.173414,35.380753,41.083248,28.076923,42.005396
4.0,2,41.653581,41.782548,32.213263,37.967485,36.818182,38.661604
5.0,1,44.285714,44.39775,34.056295,38.99259,23.846154,42.227325


In [None]:
# Need to decide how to collect/store predictions, merge with data -- be sure week and season are aligned
yX
mod.predict()


# GAS

In [None]:
dat_FW18 = dat[dat.season == 'FW18'].copy()
dat_FW18.sort_values(['article_number', 'year', 'week'], inplace = True)


#   --- RANDOM SUBSET OF ARTICLES ---
dat_FW18.set_index(['article_number'], inplace = True)
a = np.random.choice(dat_FW18.index.unique(), size = 100, replace = False)
# dat_FW18 = dat_FW18.loc[np.append(a, ['D96635']) ,:].copy() # *********
dat_FW18 = dat_FW18.loc[a ,:].copy()
dat_FW18.reset_index(inplace = True)
#   ---- ---- ---- ---- ---- ---- 

In [None]:
%%time 

dat_GAS = dat_FW18.groupby('article_number').apply(GAS_est)
dat_GAS.reset_index(inplace = True)


## Combine: observed + GAS + seasonal

In [None]:
# Add GAS estimates to data
dat_FW18 = dat_FW18.merge(
    dat_GAS[['article_number', 'year', 'week', 'GAS_est']], 
    how = 'left', 
    on = ['article_number', 'year', 'week']
                          ).sort_values(['article_number', 'year', 'week'])

# tidy
dat_FW18 = dat_FW18[['article_number', 'year', 'week', 'net_qty', 'GAS_est', 'buy_availability', 'prod_grp_desc']]
seasonality.rename(columns = {'net_qty': 'seasonal_week_avg'}, inplace = True)

In [None]:
# Two part combine: merge + concatenate

df = dat_FW18.copy()
container = pd.DataFrame(data = None, columns = dat_FW18.columns)

for a in df.article_number.unique():
    
    dat_a = df[df.article_number == a]
    seasonality_a = seasonality[seasonality.prod_grp_desc == dat_a.prod_grp_desc.unique()[0]]
    
    dat_a = pd.merge(dat_a, seasonality_a[['week', 'seasonal_week_avg']], 
                     how = 'right', 
                     on = 'week').sort_values('week')
        
    dat_a[['year', 'article_number', 'prod_grp_desc']] = dat_a[['year', 'article_number', 'prod_grp_desc']].fillna(method='ffill')
    dat_a[['year', 'article_number', 'prod_grp_desc']] = dat_a[['year', 'article_number', 'prod_grp_desc']].fillna(method='bfill')

    container = pd.concat([container, dat_a], sort = True)

dat_FW18 = container.copy()
del container, df

## Seasonality Prediction Column

In [None]:
def ols_preds(df):
    
    if(df.seasonal_week_avg.isna().mean() == 1):
        ret['preds'] = df['seasonal_week_avg']
        print('Uh oh:', df.article_number.unique())
        return ret
    
    ret = pd.DataFrame()

    df = df.sort_values(['article_number', 'week'])
    ret['week'] = df['week']
    
    # df.net_qty: only observed weeks
    # df.seasonal_week_avg: all weeks, b/c historical avgs
    
    mod = sm.OLS(df.GAS_est, df.seasonal_week_avg, missing = 'drop').fit() 
    
    mod_predict = mod.predict(df.seasonal_week_avg) # predict for all weeks

    ret['preds'] = mod_predict
    
    
    return ret  

In [None]:
dat_FW18.head()

In [None]:
preds = dat_FW18.groupby(['article_number']).apply(ols_preds)
preds.reset_index(inplace = True)

In [None]:
dat_FW18 = pd.merge(dat_FW18, preds[['article_number', 'week', 'preds']], how = 'outer')

dat_FW18 = dat_FW18[['article_number', 'prod_grp_desc', 'year', 'week', 'net_qty', 
                     'buy_availability', 'GAS_est', 
                     'seasonal_week_avg', 'preds']]


In [None]:
dat_FW18 = dat_FW18.set_index('week')

## Plots

In [None]:
#   --- CHOOSE RANDOM ARTICLE TO PLOT---
# a = np.random.choice(dat_FW18.article_number.unique(), size = 1, replace = False)[0]
a = 'CQ2659'
dat_a = dat_FW18[dat_FW18.article_number == a].copy()
# dat_a.sort_values('week', inplace = True)
# dat_a.set_index(['week'], inplace = True)
#   ---- ---- ---- ---- ---- ---- 

plt.rcParams['font.size'] = 22
plt.rcParams['legend.fontsize'] = 'medium'
plt.rcParams['figure.titlesize'] = 'medium'
plt.rcParams["figure.figsize"] = [14,18]

# Plot 0 -------
plt.subplot(3,1,1)
plt.plot(dat_a.index, 
         dat_a[['GAS_est']], 
         linewidth = 4, color = 'orange')
# plt.legend(('GAS_est'))

plt.xlim((21, 50))
# plt.title('Net Demand Qty: Observed and GAS Estimated \'True\' ')
plt.title('Net Demand Qty: GAS Estimated \'True\' ')
plt.xlabel('Week')
plt.ylabel('Net Demand Qty')
plt.grid(True)
plt.subplots_adjust(hspace = 0.4) ####


# Plot -------
plt.subplot(3,1,2)
plt.plot(dat_a.index, 
         dat_a[['preds']], 
         linewidth = 4)
# plt.legend(('seasonal_week_avg', 'preds'))

plt.xlim((21, 50))
# plt.title('Net Demand Qty: Observed, GAS, Seasonality')
plt.title('Net Demand Qty: Seasonality-based Prediction')

plt.xlabel('Week')
plt.ylabel('Net Demand Qty')
plt.grid(True)
plt.subplots_adjust(hspace = 0.4) ####



# Plot -------
plt.subplot(3,1,3)
plt.plot(dat_a.index, 
         pd.concat([dat_a['GAS_est'].loc[22:35], dat_a['preds'].loc[36:49]]), 
         linewidth = 4, color = 'green')
# plt.legend(('seasonal_week_avg', 'preds'))

plt.xlim((21, 50))
# plt.title('Net Demand Qty: Observed, GAS, Seasonality')
plt.title('Net Demand Qty: GAS + Seasonality')

plt.xlabel('Week')
plt.ylabel('Net Demand Qty')
plt.grid(True)
plt.subplots_adjust(hspace = 0.4) ####


# # Plot 2 -------
# plt.subplot(3,1,3)
# plt.plot(dat_a.index, dat_a['buy_availability'], 
#          linewidth = 4, label='buy_availability')
# plt.xlim((21, 50))
# plt.xlabel('Week')
# plt.ylabel('Net Demand Qty')
# plt.title('Buy Availability')
# plt.grid(True)
# plt.subplots_adjust(hspace = 0.4) ####

# plt.legend()

# RED (preds) is GREEN's (article type weekly avg) prediction of ORANGE (GAS)

In [None]:
pd.concat([dat_a['GAS_est'].loc[22:35], dat_a['preds'].loc[36:49]])

# dat[(dat.article_number == 'CQ2659') &
#           (dat.country == 'EU') & 
#           (dat.season_net_qty != 0)].sort_values('week')[['week', 'net_qty', 'season_net_qty', 'buy_availability']]

dat_a 

# dat_a.buy_availability.iloc[16]
# type(dat_a.buy_availability.iloc[16])

# is_nan(dat_a.buy_availability.iloc[16])



# Appendix

In [None]:
# # ---- for article a's level of each attribute, calculate weekly means -----
    
# # article a
# dat_a = dat_SS19[dat_SS19.article_number == a]

# # article 'a' net_demand_qty
# y = dat_a[['article_net_qty', 'week']].set_index('week')

# # article 'a' category-level weekly means
# # set_index() for joining
# x_sport   = seasonality_sport[seasonality_sport.sports_cat_desc == dat_a.sports_cat_desc.unique()[0]].set_index('week')
# x_rmh     = seasonality_rmh[seasonality_rmh.rmh_cat_desc == dat_a.rmh_cat_desc.unique()[0]].set_index('week')
# x_gndr    = seasonality_gndr[seasonality_gndr.gender_desc == dat_a.gender_desc.unique()[0]].set_index('week')
# x_agegrp  = seasonality_agegrp[seasonality_agegrp.age_group_desc == dat_a.age_group_desc.unique()[0]].set_index('week')
# x_frnchse = seasonality_frnchse[seasonality_frnchse.franchise == dat_a.franchise.unique()[0]].set_index('week')
# x_prdgrp  = seasonality_prdgrp[seasonality_prdgrp.prod_grp_desc == dat_a.prod_grp_desc.unique()[0]].set_index('week')

# # design matrix (ensure 'week' alignment)
# yX = (pd.merge(y, x_sport, left_index=True, right_index=True).
#      merge(x_rmh, left_index=True, right_index=True).
#      merge(x_gndr, left_index=True, right_index=True).
#      merge(x_agegrp, left_index=True, right_index=True).
#      merge(x_frnchse, left_index=True, right_index=True).
#      merge(x_prdgrp, left_index=True, right_index=True).
#      drop(['sports_cat_desc', 'rmh_cat_desc', 'gender_desc',
#            'age_group_desc', 'franchise', 'prod_grp_desc'], axis = 1))

# # predict article 'a' net_demand_qty with 5 article a category-level seasonalities
# mod = sm.OLS(yX.article_net_qty, yX.drop('article_net_qty', axis = 1), missing='drop').fit()

# mod.predict()
# print(mod.summary())