In [2]:
import numpy as np
import pandas as pd
import multiprocessing

import matplotlib.pyplot as plt
import bokeh
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show

import seaborn as sns

import re
import math
import copy

from collections import defaultdict
import csv
import itertools
import datetime 
from datetime import datetime
import time
import dateutil.parser
import pickle
import random

import gc
import zipfile
import sys, getopt
import os

from IPython.core.interactiveshell import InteractiveShell
from io import StringIO

import dask.dataframe as dd

InteractiveShell.ast_node_interactivity = "all"

%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# Set up Bokeh for inline viewing
bokeh.io.output_notebook()

import dask.dataframe as ddf
import dask.array as da

pd.set_option('max_columns', 500)
pd.set_option('max_rows', 800)

import scipy


# GAS Model

In [34]:
def initialize_parameters(par = np.array([0.5, 0.9, 0, 1, 0])):
    # np.random.seed(3)
    parameters = {}

    parameters['alpha'] = par[0]
    parameters['beta'] = par[1]
    parameters['omega'] = par[2]* (1-par[1])    # one way to choose that is omega/(1-beta) = unconditional mean 
    parameters['sigma'] = par[3]
    parameters['f0'] = par[4]                   # one way to choose is unconditional mean 

    return parameters

def loglik(y, f, x, sigma):
    ll = -1/2*np.log(2*np.pi ) - 1/2*np.log(sigma) - 1/(2*sigma)*(y - x*f)**2 
    return ll


def score_compute(y, f, x, parameters, epsilon = 1e-7 ):
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    
    score = (y - x*f)/sigma
    # score = (y - x*f)
    
    return score

def filterGAS(y, x, parameters):
    
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    omega = parameters['omega']
    sigma = parameters["sigma"]
    f0 = parameters["f0"]
    score0 = score_compute(y[0,:],  f0, x[0,:], parameters, epsilon = 1e-7) 
    f = np.zeros((len(y),1))
    
    f[0,:] = f0
    for t in range(1,len(y)):
        scoret = score_compute(y[t-1,:], f[t-1,:], x[t-1,:], parameters, epsilon = 1e-7) 
        f[t,:] = omega + alpha*scoret + beta*f[t-1,:] 

    return f

def loglikest(par, y, x):
    parameters = initialize_parameters(par)
    alpha = parameters["alpha"]
    beta = parameters["beta"]
    sigma = parameters["sigma"]
    # f0 = parameters["f0"]
    
#     if  (alpha > 0) and (sigma > 0) and (beta > alpha) : # *these are problematic maybe...???

    f = filterGAS(y, x, parameters) 
    ll = np.zeros((len(y), 1))
    m = len(y)

    for t in range(0, len(y)):
         ll[t,:] = loglik(y[t,:], f[t,:], x[t,:], sigma)
    loglik_res = -(np.sum(ll))/m
        
#     else:
#         loglik_res=10**9 # causing gradient problems??

    return loglik_res

### Estimate a GAS model

In [None]:
dat0 = pd.read_csv('ch4k_df_eu.csv', low_memory = False, index_col = 0)

In [None]:
dat = dat0.copy()
dat = dat[(dat.season == 'SS19') & (dat.country == 'EU') & (dat.season_net_qty > 1000)]
dat.reset_index(inplace = True)
dat.sort_values(['article_number', 'year', 'week'], inplace = True)
dat.set_index('article_number', inplace = True)
dat.shape

In [None]:
dat.dropna(inplace=True) # ******
dat.shape

In [None]:
# dat.reset_index(inplace=True)

# a = pd.DataFrame(dat.groupby('article_number')['net_qty'].sum())
# b = dat[['article_number', 'season_net_qty']].drop_duplicates()
# c = pd.merge(a, b, left_index=True, right_on='article_number')
# c[np.abs(c.net_qty - c.season_net_qty) > 100]

In [None]:
a = np.random.choice(dat.index.unique(), size = 1, replace = False)
print('Article:', a[0])

dat_a = dat[(dat.index == a[0])].copy()

dat_a.sort_values(by = ['year', 'week'], inplace = True)

print()
print('Number of weeks:', dat_a['year'].count())

y = dat_a.net_qty.values # observed demand -- basically the response variable
x = dat_a.buy_availability.values # basically the explanatory variable

y = y.reshape((len(y),1)) 
x = x.reshape((len(y),1))

print()

# from scipy import optimize # --------------
# par_ranges = (slice(0.01, 1.01, 0.1), 
#               slice(0.01, 1.01, 0.1), 
#               slice(0.01, np.mean(y)*1.5, 5), 
#               slice(0.01, 1.01, 0.1), 
#               slice(0.01, np.mean(y)*1.5, 5)
#              )
# x0_brute = scipy.optimize.brute(loglikest, par_ranges, args = (y, x), finish = None)
# print('x0_brute', x0_brute) # ----------------

x0 = np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]) # starting value for optimisation
            # alpha, beta, omega, sigma, f0
            # See initialize_parameters for ordering
        
# # function to minimize log likelihood of observed values given parameters
res = scipy.optimize.minimize(loglikest, 
                              x0,        # initial parameter values (starting)
                              args=(y, x), 
                                  # y1: obs demand
                                  # x: buy_availability
                              options ={'eps':1e-09, 'maxiter': 200, 'ftol': 1e-12},
                              method = 'L-BFGS-B', # 'TNC', , 'SLSQP'
                              bounds =(
                                  (0,  None),             # alpha
                                  (-1, 1),                # beta
                                  (0.001, np.mean(y)*2),  # omega 
                                  (0.001, None),          # sigma
                                  (0.001, np.mean(y)*2)   # f0
                              ),
                              # constraints = ({'type': 'ineq', 'fun': lambda x: x[1] - x[0]}) # (beta > alpha)
                             )
# print(res)

x1 = res.x.round(1) # numpy.ndarray, (5,)

print('Message:', res.message)
print()
print('Iterations:', res.nit)
print()
print('Param est conv\'g:', res.success)
print()
print('alpha:', x1[0])
print('beta:', x1[1])
print('omega:', x1[2])
print('sigma:', x1[3])
print('f0:', x1[4])
print()

In [None]:
res.success == True

In [None]:
# Get estimated parameters and filter out the demand

x1par = initialize_parameters(res.x)
#x1par = initialize_parameters(x0)
f_est = filterGAS(y, x, x1par)

d = pd.DataFrame({'Obs_demand': y[:,0], 'Est_true_demand': f_est[:,0]})

# total real demand
print('Estimated true demand:', round(np.sum(f_est)))

# total observed demand
print('Observed demand:', np.sum(y))

# plt.rcParams["figure.figsize"] = [10,5]

pd.DataFrame(data = d).plot(linewidth = 5)
# plt.ylim(0, 1200)
plt.title('Net Demand Quantity: Observed & Estimated', size = 22)

pd.DataFrame(x).plot(linewidth = 5)
plt.title('Buy Availability', size = 22)
#plt.ylabel('Real Data')

plt.xlabel('Week', size = 22)

# Applied

In [92]:
dat0 = pd.read_csv('ch4k_df_eu.csv', low_memory = False, index_col = 0)

In [93]:
dat = dat0.copy()
dat.reset_index(inplace = True)
dat = dat[(dat.season == 'SS19') & (dat.country == 'EU') & (dat.season_net_qty > 1000)]

dat.sort_values(['article_number', 'year', 'week'], inplace = True)

dat.set_index(['article_number'], inplace = True)

In [95]:
dat.dropna(inplace=True) # ******

In [84]:
def GAS_est(df):
    # y: observed demand
    # x: buy_availability
    
#     y = df['net_qty'].values.reshape((df['net_qty'],1)) 
#     x = df['buy_availability'].values.reshape((len(y),1))
    
    y = df.net_qty.values # observed demand -- basically the response variable
    x = df.buy_availability.values # basically the explanatory variable

    y = y.reshape((len(y),1)) 
    x = x.reshape((len(y),1))
    
    abc = scipy.optimize.minimize(
        loglikest, # function to minimize (log likelihood y|x,theta)
        np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]), # initial parameter values (starting)
        args=(y, x), 
        options ={'eps':1e-09, 'maxiter': 200, 'ftol': 1e-12},
        method='L-BFGS-B', 
        bounds=((0,  None),             # alpha
                (-1, 1),                # beta
                (0.001, np.mean(y)*2),  # omega 
                (0.001, None),          # sigma
                (0.001, np.mean(y)*2)   # f
               )
    )
            
    x1par = initialize_parameters(abc.x) 
    f_est = filterGAS(y, x, x1par)
    
    ret = pd.DataFrame()
    ret['year'] = df['year']
    ret['week'] = df['week']
    ret['f_est'] = f_est # pd.Series(f_est.reshape(len(y))).round(2)
    ret['Convergence'] = [abc.success] * len(y)

    return ret

In [94]:
a = np.random.choice(dat.index.unique(), size = 10, replace = False)
print(a)
dat_samp = dat.loc[a ,:].copy()
dat_samp.reset_index(inplace = True)

# DNC: DW9818

# dat_samp.groupby(['article_number']).apply(
#     lambda u: GAS_est(u['net_qty'], u['buy_availability']))

dat_samp.groupby('article_number').apply(GAS_est)

['DV0823' 'B41482' 'CQ2012' 'CG6211' 'CM8410' 'DV1745' 'G15892' 'DV2881'
 'B75703' 'DV0863']


  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]


Unnamed: 0_level_0,Unnamed: 1_level_0,year,week,f_est,Convergence
article_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
B41482,22,2018.0,48.0,29.0,False
B41482,23,2018.0,49.0,7.4,False
B41482,24,2018.0,50.0,13.24,False
B41482,25,2018.0,51.0,17.024,False
B41482,26,2018.0,52.0,15.8024,False
B41482,27,2018.0,53.0,46.08024,False
B41482,28,2019.0,1.0,,False
B41482,29,2019.0,2.0,,False
B41482,30,2019.0,3.0,,False
B41482,31,2019.0,4.0,,False


In [96]:
a = np.random.choice(d.article_number, size = 1)
a[0]

d_samp = dat[dat.article_number == a[0]]
d_samp.head()


print('Estimated actual:', round(np.sum(d_samp.f_est))) # estimated total actual demand
print('Observed:', np.sum(d_samp.net_qty)) # total observed demand

pd.DataFrame(data = d_samp[['net_qty', 'f_est']]).plot(linewidth = 4)
plt.title('Observed & Estimated Gross Demand Quantity')

pd.DataFrame(d_samp.buy_availability).plot(linewidth = 4)
plt.ylim(0, 1)

plt.title('Buy Availability')
plt.ylabel('Real Data')

NameError: name 'd' is not defined

In [None]:
# ------------------------- Appendix --------------------------- 

In [None]:
# def GAS_est(y, x):
#     # y: observed demand
#     # x: buy_availability
    
#     y = y.values
#     x = x.values
    
#     y = y.reshape((len(y),1)) 
#     x = x.reshape((len(y),1))
    
#     x0 = np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]) # # initial parameter values (starting)
    
#     res = scipy.optimize.minimize(
#         loglikest, # function to minimize (log likelihood y|x,theta)
#         x0,        
#         args=(y, x), 
#         # y1: obs demand
#         # x: buy_availability
#         options ={'eps':1e-09},
#         method='L-BFGS-B', 
#         bounds=((0,  1),                # alpha
#                 (-0.99, 0.99),          # beta
#                 (0.001, np.mean(y)*2),  # omega 
#                 (0.001, 1),             # sigma
#                 (0.001, np.mean(y)*2)   # f
#                )
#     )
    
#     x1par = initialize_parameters(res.x) 
#     f_est = filterGAS(y, x, x1par)
    
#     return f_est[:,0]
    

# d = dat_samp.groupby(dat_samp.index, as_index=False).apply(lambda df: GAS_est(df['net_qty'], df['buy_availability']))

# d = d.to_dict()

# wtf = pd.DataFrame()

# for k in d.keys():
#     df = pd.DataFrame(data = d[k], index = [k] * d[k].shape[0], columns = ['f_est'])
#     wtf = wtf.append(df)
#     # wtf = wtf.append(pd.DataFrame(data = d[k], index = [k] * d[k].shape[0], columns = ['f_est']))

# dat_samp2.sort_index(inplace = True)
# wtf.sort_index(inplace = True)
# fme = pd.concat([dat_samp2, wtf], axis = 1)

# plt = fme.loc[np.random.choice(fme.index, size = 1),:]
# plt1 = plt[['net_qty', 'f_est']]

# plt1.plot(linewidth = 4)

# pd.DataFrame(plt['buy_availability']).plot(linewidth = 4)

In [None]:
# dictEst = {}
# dictEst[np.random.choice(dat.index, size = 1).item()] = f_est1.reshape(f_est1.shape[0],).tolist()

# dictEst
# pd.DataFrame(dictEst)

In [25]:
# def Benoit_GAS_est(g):
#     # y: observed demand
#     # x: buy_availability
    
#     g.sort_values(["year", "week"], inplace=True)

#     y = g['net_qty'].values
#     x = g['buy_availability'].values

#     y = y.reshape((len(y),1)) 
#     x = x.reshape((len(y),1))

#     x0 = np.array([0.8, 0.9, np.mean(y), 1, np.mean(y)]) # # initial parameter values (starting)
#     abc = scipy.optimize.minimize(
#         loglikest, # function to minimize (log likelihood y|x,theta)
#         x0,        
#         args=(y, x), 
#         # y1: obs demand
#         # x: buy_availability
#         options ={'eps':1e-09, 'maxiter': 200, 'ftol': 1e-12},
#         method='L-BFGS-B', 
#         bounds=((0,  None),             # alpha
#                 (-1, 1),                # beta
#                 (0.001, np.mean(y)*2),  # omega 
#                 (0.001, None),          # sigma
#                 (0.001, np.mean(y)*2)   # f
#                )
#     )

#     x1par = initialize_parameters(abc.x) 
#     f_est = filterGAS(y, x, x1par)

#     out = pd.DataFrame()

#     out["year"] = g["year"].values
#     out["week"] = g["week"].values
#     out['Convergence'] = [res.success]
#     out['f_est'] = f_est[:,0]
    
#     return out

In [26]:
# def test(a, b):
#     ret = pd.DataFrame()
#     ret["A"] = [1, 2, 3]
#     ret['B'] = [b] * 3
#     return ret

# test(5, 9)

Unnamed: 0,A,B
0,1,9
1,2,9
2,3,9
