In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.exponential_smoothing.ets import ETSModel
from scipy.stats import boxcox
from scipy.special import inv_boxcox

import warnings
warnings.filterwarnings("ignore")

In [2]:
'''
To do:

1- Apply LSTM
2- Explore the Box-Cox problem
3- Explore the GP problem

'''

'\nTo do:\n\n1- Apply LSTM\n2- Explore the Box-Cox problem\n3- Explore the GP problem\n\n'

In [3]:
def ets(deaths): # train exponential smoothing model, input is either deaths or infects
    """Forecast 30 steps with a damped-trend ETS model on Box-Cox transformed data.

    The series is Box-Cox transformed with the lambda that ``boxcox`` estimates,
    an additive-error / additive damped-trend ``ETSModel`` is fitted to the
    transformed values, and 1000 paths of 30 steps are simulated with the
    second-to-last observation as anchor. Each step is summarised by its mean
    and its 5th / 95th percentiles, and the summaries are mapped back to the
    original scale with ``inv_boxcox``.

    NOTE: a later cell of this notebook defines another function that is also
    called ``ets``; when the cells run in order that definition replaces this one.

    Parameters
    ----------
    deaths : pandas.Series
        Observed series (deaths or infections). Its index is used both for the
        simulation anchor and to date the forecast, which starts at
        ``deaths.index[-1]``.

    Returns
    -------
    tuple of pandas.Series
        ``(upper, mean, lower)``, each holding 30 values on the original scale.
    """
    transformed, maxlam = boxcox(deaths)
    transformed = pd.Series(transformed, index=deaths.index)

    fitted = ETSModel(
        transformed,
        error='add',
        trend='add',
        damped_trend=True,
        initialization_method='heuristic',
    ).fit()
    paths = fitted.simulate(anchor=deaths.index[-2], nsimulations=30, repetitions=1000)

    # One entry per simulated step: the values of every repetition at that step.
    steps = [paths.iloc[pos] for pos in range(len(paths))]
    mean_bc = [np.mean(step) for step in steps]
    bands = [np.percentile(step.values, [5, 95]) for step in steps]
    lower_bc = [band[0] for band in bands]
    upper_bc = [band[1] for band in bands]

    horizon_index = pd.date_range(start=deaths.index[-1], periods=30)

    def back_to_original_scale(values):
        # Undo the Box-Cox transform and attach the forecast dates.
        return pd.Series(inv_boxcox(values, maxlam), index=horizon_index)

    upper = back_to_original_scale(upper_bc)
    mean = back_to_original_scale(mean_bc)
    lower = back_to_original_scale(lower_bc)
    return upper, mean, lower

In [4]:
def ets(deaths, apply_boxcox=True, horizon=30, repetitions=1000,
        quantiles=(5, 95), boxcox_lambda=3, random_state=None):
    """Forecast a series with a damped-trend ETS model and simulated intervals.

    An additive-error / additive damped-trend ``ETSModel`` is fitted to the
    (optionally Box-Cox transformed) series. ``repetitions`` paths of
    ``horizon`` steps are simulated with the second-to-last observation as
    anchor, and every step is summarised by its mean and two percentiles.
    When the transform was applied, the summaries are mapped back to the
    original scale with ``inv_boxcox``.

    Parameters
    ----------
    deaths : pandas.Series
        Observed series (deaths or infections). Its index is used for the
        simulation anchor and to date the forecast.
    apply_boxcox : bool, default True
        Transform the data with a fixed-lambda Box-Cox before fitting. The
        transform is skipped automatically when the series contains a
        non-positive value.
    horizon : int, default 30
        Number of simulated steps; also the length of the returned series.
    repetitions : int, default 1000
        Number of simulated paths.
    quantiles : pair of float, default (5, 95)
        Lower and upper percentile (0-100) reported for every step.
    boxcox_lambda : float, default 3
        Lambda used for both the Box-Cox transform and its inverse.
    random_state : optional
        Forwarded to ``simulate``; pass a seed to make the result reproducible.
        ``None`` keeps the unseeded behaviour.

    Returns
    -------
    tuple of pandas.Series
        ``(upper, mean, lower)``, indexed by ``horizon`` daily dates starting
        at ``deaths.index[-1]``.
    """
    # Box-Cox is only defined for strictly positive data (zeros or negatives
    # produce -inf / NaN), so fall back to the raw series in that case.
    if apply_boxcox and (deaths <= 0).any():
        apply_boxcox = False

    endog = deaths
    if apply_boxcox:
        endog = pd.Series(boxcox(deaths, boxcox_lambda), index=deaths.index)

    fit = ETSModel(
        endog,
        error='add',
        trend='add',
        damped_trend=True,
        initialization_method='heuristic',
    ).fit()
    sims = fit.simulate(
        anchor=deaths.index[-2],
        nsimulations=horizon,
        repetitions=repetitions,
        random_state=random_state,
    )
    # NOTE(review): simulate may hand back a 1-D Series for a single
    # repetition; normalise to 2-D so the row-wise reductions below work.
    if isinstance(sims, pd.Series):
        sims = sims.to_frame()

    # Summarise all repetitions of every step in one vectorised pass.
    mean_path = sims.mean(axis=1).to_numpy()
    lower_path, upper_path = np.percentile(sims.to_numpy(), list(quantiles), axis=1)

    test_index = pd.date_range(start=deaths.index[-1], periods=horizon)

    def _to_series(values):
        # Undo the transform (if any) and attach the forecast dates.
        if apply_boxcox:
            values = inv_boxcox(values, boxcox_lambda)
        return pd.Series(values, index=test_index)

    upper_test = _to_series(upper_path)
    mean_test = _to_series(mean_path)
    lower_test = _to_series(lower_path)
    return upper_test, mean_test, lower_test


def convert_date(date): # converts date from timestamp to str
    """Format a date as ``month/day/two-digit-year`` without zero padding.

    For example, 5 March 2020 becomes ``'3/5/20'``.

    Parameters
    ----------
    date : object with ``strftime``
        The date to format (e.g. a ``datetime`` or a pandas ``Timestamp``).

    Returns
    -------
    str
        The formatted date.
    """
    # Round-tripping through int drops the leading zero of "%m" / "%d".
    month = str(int(date.strftime("%m")))
    day = str(int(date.strftime("%d")))
    year_suffix = date.strftime("%Y")[2:]
    return '/'.join((month, day, year_suffix))


def evaluate(preds_df, country_count=None, truth_d_path='all_D.csv', truth_n_path='all_N.csv'):
    """Score predictions against the ground-truth files.

    For every ground-truth region that has predictions, each of the 30
    predicted days contributes ``1 / (|log(D_pred + 1) - log(D_true + 1)| +
    |log(N_pred + 1) - log(N_true + 1)| + 1)``. The contributions are summed
    over all regions and normalised to a percentage of the best possible total
    (``30 * country_count``).

    Parameters
    ----------
    preds_df : pandas.DataFrame
        Predictions with at least the columns ``Province/State``,
        ``Country/Region``, ``t_current``, ``D`` and ``N``. The first
        ``t_current`` value marks the first evaluated day.
    country_count : int, optional
        Number of regions used for normalisation. Defaults to the number of
        distinct ``(Province/State, Country/Region)`` pairs in ``preds_df``,
        so the function no longer relies on a module-level variable.
    truth_d_path, truth_n_path : str
        CSV files holding the true ``D`` and ``N`` values, one row per region
        and one column per date labelled the way ``convert_date`` formats it.

    Returns
    -------
    float
        Normalised score; 100 means every prediction matched exactly.
    """
    horizon = 30  # number of evaluated days per region

    ground_truth_d = pd.read_csv(truth_d_path)
    ground_truth_n = pd.read_csv(truth_n_path)

    if country_count is None:
        region_keys = preds_df[['Province/State', 'Country/Region']].drop_duplicates()
        country_count = len(region_keys)

    first_day = preds_df.t_current.iloc[0]
    t_current = convert_date(first_day)
    t_last = convert_date(first_day + pd.DateOffset(days=horizon - 1))

    pred_province = preds_df['Province/State']
    pred_country = preds_df['Country/Region']

    total_score = 0.0
    for index, item in ground_truth_d.iterrows():
        province = item['Province/State']
        country = item['Country/Region']

        # Regions without a province are stored with a missing value on both sides.
        if pd.isna(province):
            region_mask = (pred_country == country) & pred_province.isna()
        else:
            region_mask = (pred_country == country) & (pred_province == province)

        region_preds = preds_df[region_mask]
        if region_preds.empty:
            continue

        d_pred = region_preds.D.to_numpy(dtype=float)
        i_pred = region_preds.N.to_numpy(dtype=float)

        # Rows mix text and numbers, so the date slice has to be cast to float.
        d_true = item.loc[t_current:t_last].to_numpy(dtype=float)
        # NOTE(review): the N file is read by row position, which assumes both
        # ground-truth files list the regions in the same order - confirm.
        n_true = ground_truth_n.iloc[index].loc[t_current:t_last].to_numpy(dtype=float)

        sc_d = np.abs(np.log(d_pred + 1.0) - np.log(d_true + 1.0))
        sc_n = np.abs(np.log(i_pred + 1.0) - np.log(n_true + 1.0))

        total_score += np.sum(1 / ((sc_d + sc_n) + 1))

    normalized_score = (total_score / (horizon * country_count)) * 100

    return normalized_score

In [5]:
# For every phase: forecast each region's deaths and infections with ets(),
# assemble the forecasts in submission format and print the phase score.
for i in [1,2,3]: # for each phase

    phase_d = pd.read_csv('train_D_phase_'+str(i)+'.csv')
    phase_n = pd.read_csv('train_N_phase_'+str(i)+'.csv')

    # Kept as a module-level name on purpose: evaluate() may read it to
    # normalise the score.
    country_count = len(phase_d)

    # Columns from position 2 onward are treated as the daily series, so the
    # date axis is identical for every region of the phase.
    date_index = pd.date_range(start=phase_d.columns[2], end=phase_d.columns[-1])

    region_frames = []  # one submission-format frame per region

    for idx in range(len(phase_d)): # iterate over each country

        province = phase_d.iloc[idx]['Province/State']
        country = phase_d.iloc[idx]['Country/Region']

        # Regions without a province carry a missing value; match those on
        # "province is missing" instead of on equality.
        if pd.isna(province):
            mask_d = (phase_d['Country/Region'] == country) & phase_d['Province/State'].isna()
            mask_n = (phase_n['Country/Region'] == country) & phase_n['Province/State'].isna()
        else:
            mask_d = (phase_d['Country/Region'] == country) & (phase_d['Province/State'] == province)
            mask_n = (phase_n['Country/Region'] == country) & (phase_n['Province/State'] == province)

        deaths = pd.Series(phase_d[mask_d].values[0][2:], index=date_index).astype('float64')
        infects = pd.Series(phase_n[mask_n].values[0][2:], index=date_index).astype('float64')

        # train and predict
        up_d, mean_d, low_d = ets(deaths, False)
        up_i, mean_i, low_i = ets(infects, False)

        # convert it to submission format (scalars are broadcast to every row)
        region_frames.append(pd.DataFrame({
            'Province/State': province,
            'Country/Region': country,
            't_current': date_index[-1],
            't_target': mean_d.index,
            'N': mean_i.to_numpy(),
            'N_low': low_i.to_numpy(),
            'N_high': up_i.to_numpy(),
            'D': mean_d.to_numpy(),
            'D_low': low_d.to_numpy(),
            'D_high': up_d.to_numpy(),
        }))

    # Concatenate once: DataFrame.append was removed in pandas 2.0 and growing
    # a frame row block by row block is quadratic.
    preds_df = pd.concat(region_frames, ignore_index=True)
    preds_df = preds_df.reset_index().rename(columns={'index':'Id'})

    print('phase '+ str(i) + ' score: ' + str(evaluate(preds_df)))

phase 1 score: 82.62196039711891
phase 2 score: 90.35242412118087
phase 3 score: 91.00790195955261
