In [None]:
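# Three input files (all read from the cosmosanalytics/forecast GitHub repository):
# -- Test_Forecast_5_Mil_List_v2.csv : Josh's list of Generic-Plant pairs to forecast
# -- TrueProductionData.csv          : production volumes, used when flag == 'p'
# -- All_Shipments_1_simple.csv      : shipment volumes, used for any other flag (e.g. 's')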

In [14]:
# Common parameters shared by the functions below.
# `start`: earliest date kept by load_data and first month of the calendar built in input_f.
start = '2021-01-01'
# `initial`: size of the initial training window handed to Prophet's cross_validation.
initial = '540 days'
# Choosing `initial` for a given `start` (history runs through the first half of 2022):
#   start in 2019 -> 3.5 years = 1260 days
#   start in 2020 -> 2.5 years =  900 days
#   start in 2021 -> 1.5 years =  540 days

# josh list 
import pandas as pd
from prophet import Prophet
from prophet.diagnostics import cross_validation
from prophet.diagnostics import performance_metrics
from prophet.utilities import regressor_coefficients
from prophet.plot import plot_forecast_component
import json
from prophet.serialize import model_to_json, model_from_json

# from dask.distributed import Client
# client = Client()

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation/indexing warnings — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')
# Only let ERROR-level (and above) records through from Prophet's logger.
import logging
logging.getLogger('prophet').setLevel(logging.ERROR)
# Mute cmdstanpy: attach a no-op handler, stop records propagating to the
# root logger, and raise the threshold to CRITICAL.
logger = logging.getLogger('cmdstanpy')
logger.addHandler(logging.NullHandler())
logger.propagate = False
logger.setLevel(logging.CRITICAL)

def _load_monthly_indicator(url):
    """Read an indicator CSV and forward-fill it onto a month-start index.

    The CSV's date column is expected to be named 'Unnamed: 0' (i.e. it was
    written from a DataFrame with an unnamed date index).
    """
    frame = pd.read_csv(url)
    frame['Unnamed: 0'] = pd.to_datetime(frame['Unnamed: 0'])
    return frame.set_index('Unnamed: 0').resample('MS').ffill()


def load_data(flag):
    """Download and prepare every input needed by the forecasting pipeline.

    Parameters
    ----------
    flag : str
        'p' loads TrueProductionData.csv; any other value loads
        All_Shipments_1_simple.csv.

    Returns
    -------
    GDPC1_df, GDP_df : DataFrame
        The two indicator tables, resampled to month-start and forward-filled.
    gp : DataFrame
        One row per entry of the 'Generic-Plant' list with columns
        ['Generic', 'Plant', 'KG'], where KG is the total volume found for
        that pair in `df` (0 when the pair has no data), sorted by KG
        descending.
    df : DataFrame
        Monthly KG totals per (Plant, Generic) with columns
        ['Plant', 'Generic', 'Year', 'Month', 'KG', 'ds'].

    Notes
    -----
    Reads the module-level `start` as the lower date bound and performs
    network I/O against raw.githubusercontent.com.
    """
    base_url = "https://raw.githubusercontent.com/cosmosanalytics/forecast/main/"
    plants = [3803, 3809, 3811, 3833, 3835, 3841]
    last_month = pd.to_datetime('2022-11-01')

    if flag == 'p':
        df = pd.read_csv(base_url + "TrueProductionData.csv")\
            .rename(columns={'Material_Group_Id_Mara_Matkl': 'Generic'})
    else:
        df = pd.read_csv(base_url + "All_Shipments_1_simple.csv")\
            .rename(columns={'Actual GI Date': 'Date'})

    # .copy() so the column assignments below write to an independent frame
    # instead of a slice of the raw data.
    df = df[df['Plant'].isin(plants)].copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df = df[df['Date'] >= pd.to_datetime(start)].copy()
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df = df.groupby(['Plant', 'Generic', 'Year', 'Month'])[['KG']].sum().reset_index()
    df['ds'] = pd.to_datetime(df['Year'].astype(str) + '-' + df['Month'].astype(str) + '-1')
    df = df[df['ds'] <= last_month].copy()

    j_df = pd.read_csv(base_url + "Test_Forecast_5_Mil_List_v2.csv")
    # NOTE(review): assumes neither the Generic nor the Plant code contains '-'.
    gp = pd.DataFrame({
        'Generic': j_df['Generic-Plant'].apply(lambda x: x.split('-')[0]),
        'Plant': j_df['Generic-Plant'].apply(lambda x: x.split('-')[1]),
    })
    # Total KG per (Plant, Generic), keyed by strings because the list file
    # yields string codes. One groupby replaces a full scan of `df` per row.
    kg_by_pair = (
        df.assign(Plant=df['Plant'].astype(str), Generic=df['Generic'].astype(str))
          .groupby(['Plant', 'Generic'])['KG'].sum()
          .to_dict()
    )
    gp['KG'] = [kg_by_pair.get((str(plant), str(generic)), 0)
                for plant, generic in zip(gp['Plant'], gp['Generic'])]
    gp = gp.sort_values(by='KG', ascending=False)

    GDPC1_df = _load_monthly_indicator(base_url + 'GDPC1.csv')
    GDP_df = _load_monthly_indicator(base_url + 'GDP.csv')
    return GDPC1_df, GDP_df, gp, df

def output_f(df1):
    """Fit an additive Prophet model on `df1` and return its in-sample fit.

    `df1` must carry Prophet's 'ds'/'y' columns plus the two extra regressor
    columns 'Inflation' and 'GDPC1'.

    Returns a tuple ``(forecast[['ds', 'yhat']], fitted_model)``; `predict()`
    is called without a frame, so the forecast covers the training dates.
    """
    seasonality_mode = 'additive'
    model = Prophet(
        seasonality_mode=seasonality_mode,
        daily_seasonality=False,
        weekly_seasonality=False,
        yearly_seasonality=5,
    )
    # Custom quarterly cycle on top of the order-5 yearly seasonality.
    model.add_seasonality(name='quarterly', period=365.24/4, fourier_order=5)
    for regressor_name in ('Inflation', 'GDPC1'):
        model.add_regressor(
            name=regressor_name,
            prior_scale=10,
            standardize='auto',
            mode=seasonality_mode,
        )
    model.fit(df1)
    in_sample = model.predict()
    return in_sample[['ds', 'yhat']], model

def performance_f(m1):
    """Cross-validate the fitted Prophet model `m1` and return its metrics table.

    Uses a 30-day horizon with cutoffs every 30 days, the module-level
    `initial` as the first training window, and process-based parallelism.
    """
    cv_results = cross_validation(
        m1,
        horizon='30 days',
        period='30 days',
        initial=initial,
        parallel="processes",  # alternative left by the author: parallel='dask'
    )
    return performance_metrics(cv_results)

def input_f(GDPC1_df, GDP_df, df, p, m, start_date=None, end_date='2022-09-01'):
    """Build the Prophet training frame for one (plant, generic) pair.

    Parameters
    ----------
    GDPC1_df, GDP_df : DataFrame
        Month-start indexed frames holding the 'GDPC1' and 'GDP' columns.
    df : DataFrame
        Monthly volumes with at least 'Plant', 'Generic', 'ds' and 'KG'.
    p, m : str or int
        Plant and generic codes; compared as strings.
    start_date : str or Timestamp, optional
        First month of the calendar. Defaults to the module-level `start`.
    end_date : str or Timestamp, optional
        Last month of the calendar (default '2022-09-01').

    Returns
    -------
    DataFrame
        One row per month-start between `start_date` and `end_date` with
        columns 'ds', 'y' (0 for months without data), the indicator columns,
        and 'Inflation' = 1 - GDPC1 / GDP. Missing indicator values are
        forward-filled.
    """
    if start_date is None:
        start_date = start

    # A single combined mask: chaining df[a][b] applies a full-length mask to
    # an already-filtered frame and only works through implicit reindexing.
    selected = (df['Plant'].astype(str) == str(p)) & (df['Generic'].astype(str) == str(m))
    monthly = df.loc[selected].groupby('ds')['KG'].sum().rename('y').to_frame()

    # Left-join onto a complete monthly calendar so that missing months become 0.
    calendar = pd.DataFrame(index=pd.date_range(start=start_date, end=end_date, freq="MS"))
    df1 = calendar.join(monthly).fillna(0)
    df1 = df1.join(GDP_df).join(GDPC1_df).rename_axis('ds').reset_index()
    df1['Inflation'] = 1 - df1['GDPC1'] / df1['GDP']
    # Indicators may end before the calendar does; carry the last value forward.
    df1 = df1.ffill()
    return df1

def run_josh_list(GDPC1_df, GDP_df, df, lst):
    """Fit, score and collect a Prophet model for every (Plant, Generic) in `lst`.

    Parameters
    ----------
    GDPC1_df, GDP_df, df : DataFrame
        Passed through unchanged to `input_f`.
    lst : DataFrame
        Must provide 'Plant' and 'Generic' columns; rows are processed in order.

    Returns
    -------
    output_df : DataFrame
        Indexed by month; two columns per successful pair,
        '<plant>-<generic>_actual' and
        '<plant>-<generic>_model (corr. coef=<r>)'.
    perf_s : Series
        '<median smape>_<corr>' strings keyed by '<plant>-<generic>'.
    model_s : Series
        JSON-serialized fitted models keyed by '<plant>_<generic>'. A model is
        stored as soon as it is fitted, even if scoring fails afterwards.
    failedLst : DataFrame
        ['Plant', 'Generic'] rows for pairs that raised, indexed by their
        1-based position in `lst`.
    """
    output_df = pd.DataFrame()
    failedLst = pd.DataFrame(columns=['Plant', 'Generic'])
    perf_s = pd.Series(dtype=object)
    model_s = pd.Series(dtype=object)
    for ct, (p, m) in enumerate(zip(lst['Plant'], lst['Generic'])):
        print(ct)
        pair = str(p) + '-' + str(m)
        try:
            df1 = input_f(GDPC1_df, GDP_df, df, p, m)
            fst, m1 = output_f(df1)
            model_s[str(p) + '_' + str(m)] = model_to_json(m1)

            df1_p = performance_f(m1)
            tmp = df1.set_index('ds')[['y']].join(fst.set_index('ds'))
            # Built-in round() works for both numpy scalars and a plain float
            # NaN, whereas .round() only exists on numpy scalars.
            corr_txt = str(round(tmp['y'].corr(tmp['yhat']), 3))
            smape_txt = str(round(df1_p['smape'].median(), 3))
            y_label = pair + '_actual'
            yhat_label = pair + '_model (corr. coef=' + corr_txt + ')'
            tmp = tmp.rename(columns={'y': y_label, 'yhat': yhat_label})

            # Record the forecast columns and the score together, once
            # everything that could raise has been computed.
            output_df = output_df.join(tmp, how='right')
            # Series.append was removed in pandas 2.0; assign by label instead.
            perf_s[pair] = smape_txt + '_' + corr_txt
        except Exception as exc:
            # Best effort: keep going, but say why this pair failed.
            # KeyboardInterrupt is intentionally not caught.
            print('failed', repr(exc))
            failedLst.loc[ct + 1] = [p, m]
    return output_df, perf_s, model_s, failedLst

def p_or_s(flag, n1=0, n2=600):
    """Run the whole pipeline for production ('p') or shipment (other) data.

    Parameters
    ----------
    flag : str
        Forwarded to `load_data`.
    n1, n2 : int, optional
        Slice bounds into the KG-sorted pair list; defaults process the first
        600 pairs.

    Returns
    -------
    output_df, perf_s, model_s, failedLst
        Exactly as returned by `run_josh_list`.
    dg : DataFrame
        One row per 'Plant-Generic', one column per '<date>_<actual|model>';
        columns containing any missing value are dropped. Empty when no pair
        produced a forecast.
    """
    GDPC1_df, GDP_df, gp, df = load_data(flag)
    # Number of listed pairs without any volume, and total number of pairs.
    print(len(gp[gp['KG']==0]), len(gp))

    lst = gp.iloc[n1:n2].copy()
    output_df, perf_s, model_s, failedLst = run_josh_list(GDPC1_df, GDP_df, df, lst)

    if output_df.empty:
        # Every pair failed (or the slice was empty): nothing to reshape.
        return output_df, perf_s, model_s, failedLst, pd.DataFrame()

    # run_josh_list returns the dates as the index, not as a 'ds' column, so
    # an unconditional set_index('ds') raises KeyError. Accept both layouts.
    if 'ds' in output_df.columns:
        wide = output_df.set_index('ds')
    else:
        wide = output_df.rename_axis('ds')

    long_s = wide.stack()
    long_s.index = long_s.index.set_names(['ds', 'label'])
    dg = long_s.rename('value').reset_index()

    dg['Date'] = dg['ds'].astype(str).str.split(' ').str[0]
    # Labels look like '<plant>-<generic>_actual' or
    # '<plant>-<generic>_model (corr. coef=...)'. Splitting on the LAST '_'
    # keeps generic codes containing '_' or spaces intact.
    label_parts = dg['label'].str.rsplit('_', n=1)
    dg['Plant-Generic'] = label_parts.str[0]
    dg['Actual or Model'] = label_parts.str[1].str.split(' ').str[0]

    dg = dg.set_index(['Plant-Generic', 'Actual or Model', 'Date'])['value']\
        .unstack().unstack().dropna(axis=1)
    dg.columns = ['_'.join(col) for col in dg.columns.values]
    return output_df, perf_s, model_s, failedLst, dg

In [None]:
# Run the full pipeline on production data (flag 'p' makes load_data read TrueProductionData.csv).
output_df, perf_s, model_s, failedLst, dg = p_or_s('p')

In [None]:
# Run the full pipeline on shipment data (any flag other than 'p' makes load_data read All_Shipments_1_simple.csv).
# NOTE(review): this rebinds the same five names as the production run, so those results are lost unless saved first.
output_df, perf_s, model_s, failedLst, dg = p_or_s('s')