In [1]:
import time
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

from fbprophet import Prophet
from fbprophet.diagnostics import cross_validation
from fbprophet.diagnostics import performance_metrics
from fbprophet.plot import plot_cross_validation_metric
from sklearn.model_selection import ParameterGrid
import itertools

In [20]:
# Load the round-2 training data, keep only the modelling columns and
# parse 'Date' into real timestamps.
df = (
    pd.read_csv('data/train_round2.csv', index_col=None)
    [['ID', 'Province_State', 'Date', 'Confirmed', 'Deaths']]
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
)

# Prophet expects the time column to be called 'ds' and the target 'y'.
# df1: target is the confirmed-case count
df1 = df.rename(columns={'Date': 'ds', 'Confirmed': 'y'})

# df2: target is the death count
df2 = df.rename(columns={'Date': 'ds', 'Deaths': 'y'})
df1, df2

(          ID Province_State         ds       y  Deaths
 0          0        Alabama 2020-04-12    3563      93
 1          1         Alaska 2020-04-12     272       8
 2          2        Arizona 2020-04-12    3542     115
 3          3       Arkansas 2020-04-12    1280      27
 4          4     California 2020-04-12   22795     640
 ...      ...            ...        ...     ...     ...
 11245  11245       Virginia 2020-11-22  217796    3938
 11246  11246     Washington 2020-11-22  141260    2619
 11247  11247  West Virginia 2020-11-22   40478     662
 11248  11248      Wisconsin 2020-11-22  376238    3150
 11249  11249        Wyoming 2020-11-22   28169     176
 
 [11250 rows x 5 columns],
           ID Province_State         ds  Confirmed     y
 0          0        Alabama 2020-04-12       3563    93
 1          1         Alaska 2020-04-12        272     8
 2          2        Arizona 2020-04-12       3542   115
 3          3       Arkansas 2020-04-12       1280    27
 4          4 

In [21]:
# Unique states in order of first appearance, kept as a plain list so that
# `States.index(state)` (used by generate_ForecastID) works; later cells build
# States the same way.
States = df['Province_State'].drop_duplicates().tolist()
len(States)

50

In [22]:
def generate_ForecastID(df_pred, target_str, start_date, state, States):
    '''
    Build the submission rows for one state from a forecast frame.

    Keeps the rows dated on or after start_date and numbers them so that every
    forecast day owns a contiguous run of len(States) IDs:
        ForecastID = day_offset * len(States) + position of state in States

    df_pred: predicted data of one state from fbprophet; needs 'ds' and 'yhat'
    target_str: column name that replaces 'yhat' (e.g. 'Confirmed', 'Deaths')
    start_date: desired cutoff date; rows with ds >= start_date are kept
    state: state of df_pred
    States: ordered collection of all states (list, Series, ...)

    Returns a new DataFrame with columns 'ForecastID', 'ds' and target_str.
    Raises ValueError if state is not in States.
    '''
    # Materialise once so list-like inputs (e.g. a pandas Series) also work.
    states = list(States)
    n_states = len(states)  # ID stride; previously hard-coded as 50
    state_pos = states.index(state)

    df = df_pred[['ds', 'yhat']]
    # select data on/after start_date and renumber rows 0..n-1 (day offset)
    df = df[df['ds'] >= start_date].reset_index(drop=True)

    # push the day offset into an 'index' column and turn it into the ID
    df = df.reset_index()
    df['index'] = df['index'] * n_states + state_pos

    return df.rename(columns={'index': 'ForecastID', 'yhat': target_str})

In [23]:
def get_bestParameters(df, all_params):
    '''
    Simple hyperparameter tuning: pick the Prophet parameters with the lowest
    cross-validated MAPE.

    df: data of one state, has 'ds' and 'y'
    all_params: iterable of dicts; each dict is passed as Prophet(**params)

    Returns the parameter dict with the smallest MAPE (the first one wins ties).
    Raises ValueError if all_params is empty or no candidate produced a
    comparable (non-NaN) MAPE.
    '''
    best_params = None
    # Start from +inf rather than an arbitrary cap so that a candidate is
    # always selected, even when every MAPE is large.
    min_MAPE = float('inf')

    # Use cross validation to evaluate all parameters
    for params in all_params:
        m = Prophet(**params).fit(df)  # Fit model with given params
        df_cv = cross_validation(m, initial='30 days', period='30 days', horizon = '26 days')
        # NOTE(review): rolling_window=1 is expected to aggregate the metric
        # over the whole horizon (one row), hence `.values[0]` — confirm
        # against the Prophet diagnostics docs.
        df_p = performance_metrics(df_cv, rolling_window=1)
        mape = df_p['mape'].values[0]

        # keep the candidate with the lowest MAPE (NaN never compares lower)
        if mape < min_MAPE:
            min_MAPE = mape
            best_params = params

    if best_params is None:
        raise ValueError('no usable parameter set: all_params is empty or every MAPE is NaN')

    return best_params
    

### Simple Model Fitting with default parameters

In [26]:
# Per-state forecast frames, collected separately for each target
df1_all_states = []  # confirmed cases
df2_all_states = []  # deaths
start_date = '2020-12-07'
States = df['Province_State'].drop_duplicates().tolist()

for state in States[:2]:  # States[:2] test out 2 states

    # ---- confirmed cases: fit a default Prophet model on this state's rows ----
    df1_one_state = df1.loc[df1['Province_State'] == state]

    m1 = Prophet()
    m1.fit(df1_one_state)
    # 21 daily steps past the end of the training data (11/23 - 12/13)
    future = m1.make_future_dataframe(periods=21, freq='D')
    forecast1 = m1.predict(future)

    # keep rows from start_date on and attach the ForecastID
    df1_one_state_pred = generate_ForecastID(
        forecast1, 'Confirmed', start_date, state, States
    )
    df1_all_states.append(df1_one_state_pred)

    # ---- deaths: same procedure, reusing the date frame built above ----
    df2_one_state = df2.loc[df2['Province_State'] == state]

    m2 = Prophet()
    m2.fit(df2_one_state)
    forecast2 = m2.predict(future)

    df2_one_state_pred = generate_ForecastID(
        forecast2, 'Deaths', start_date, state, States
    )
    df2_all_states.append(df2_one_state_pred)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


In [27]:
# Stack the per-state confirmed-case forecasts and order them by ForecastID
df1_pred = (
    pd.concat(df1_all_states, ignore_index=True)
    .sort_values(by='ForecastID')
)

Unnamed: 0,ForecastID,ds,Confirmed
0,0,2020-12-07,249894.229669
7,1,2020-12-07,31783.118066
1,50,2020-12-08,251248.747871
8,51,2020-12-08,32209.284705
2,100,2020-12-09,252700.279159
9,101,2020-12-09,32599.515647
3,150,2020-12-10,254438.542779
10,151,2020-12-10,32993.991929
4,200,2020-12-11,256169.526441
11,201,2020-12-11,33395.826776


In [28]:
# Stack the per-state death forecasts and order them by ForecastID
df2_pred = (
    pd.concat(df2_all_states, ignore_index=True)
    .sort_values(by='ForecastID')
)

Unnamed: 0,ForecastID,ds,Deaths
0,0,2020-12-07,3619.04539
7,1,2020-12-07,116.400396
1,50,2020-12-08,3641.545325
8,51,2020-12-08,117.701792
2,100,2020-12-09,3667.488399
9,101,2020-12-09,118.746085
3,150,2020-12-10,3688.911722
10,151,2020-12-10,119.723133
4,200,2020-12-11,3710.455247
11,201,2020-12-11,120.873338


In [32]:
# Put the 'Deaths' predictions next to the confirmed-case predictions;
# the two frames are aligned on their row index.
deaths_column = df2_pred['Deaths']
df_pred = pd.concat([df1_pred, deaths_column], axis=1)
df_pred

Unnamed: 0,ForecastID,ds,Confirmed,Deaths
0,0,2020-12-07,249894.229669,3619.04539
7,1,2020-12-07,31783.118066,116.400396
1,50,2020-12-08,251248.747871,3641.545325
8,51,2020-12-08,32209.284705,117.701792
2,100,2020-12-09,252700.279159,3667.488399
9,101,2020-12-09,32599.515647,118.746085
3,150,2020-12-10,254438.542779,3688.911722
10,151,2020-12-10,32993.991929,119.723133
4,200,2020-12-11,256169.526441,3710.455247
11,201,2020-12-11,33395.826776,120.873338


In [22]:
# Write the submission file: only the ID and the two predicted targets, no row index
submission_columns = ['ForecastID', 'Confirmed', 'Deaths']
df_pred[submission_columns].to_csv('output/submission4.csv', index=False)

### Hyperparameter Tuning

In [39]:
# Simple Version
# Two-parameter grid; each combination of values becomes one candidate dict.
param_grid = {
    'changepoint_prior_scale': [0.001, 0.01, 0.05, 0.1, 0.5],  # default 0.05
    'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],  # default 10
}
grid_names = list(param_grid)
all_params = [
    dict(zip(grid_names, combo))
    for combo in itertools.product(*(param_grid[name] for name in grid_names))
]

In [40]:
# Generate more models
# Larger grid: every combination of the four Prophet settings below is one
# candidate model. ParameterGrid is already imported in the notebook's first
# (imports) cell, so it is not re-imported here.
params_grid = {'seasonality_mode':['multiplicative','additive'], 
               'changepoint_prior_scale': [0.001, 0.002, 0.005, 0.05, 0.5],  # default 0.05, reasonable range [0.001, 0.5]
               'seasonality_prior_scale': [0.01, 0.02, 0.05, 0.1, 1, 10.0],  # default 10, reasonable range [0.01, 10]
               'changepoint_range' : [0.8, 0.85, 0.9, 0.95] 
              }

all_params = ParameterGrid(params_grid)
# ParameterGrid supports len(), so no manual counting loop is needed.
cnt = len(all_params)

print('Total Possible Models',cnt)

Total Possible Models 240


**NOTE**

Since there are up to 240 candidate models, we need to run 240 * 3 (cross validation) * 2 (confirmed/death cases) * 50 (states) fits, which takes approximately 33 hours in total. So we might want to break the following for loop into several smaller ones and run them on different kernels to save time.

In [None]:
# Per-state forecast frames, collected separately for each target
df1_all_states = []  # confirmed cases
df2_all_states = []  # deaths
States = df['Province_State'].drop_duplicates().tolist()
start_date = '2020-12-07'
for state in States[:1]: #test for just one state

    # ---- confirmed cases: tune on this state's rows, then refit with the winner ----
    df1_one_state = df1.loc[df1['Province_State'] == state]

    best_params1 = get_bestParameters(df1_one_state, all_params)
    m1 = Prophet(**best_params1)
    m1.fit(df1_one_state)

    # 26 daily steps past the end of the training data
    future = m1.make_future_dataframe(periods=26, freq='D')
    forecast1 = m1.predict(future)

    # keep rows from start_date on and attach the ForecastID
    df1_one_state_pred = generate_ForecastID(
        forecast1, 'Confirmed', start_date, state, States
    )
    df1_all_states.append(df1_one_state_pred)

    # ---- deaths: tuned separately, predicted on the date frame built above ----
    df2_one_state = df2.loc[df2['Province_State'] == state]

    best_params2 = get_bestParameters(df2_one_state, all_params)
    m2 = Prophet(**best_params2)
    m2.fit(df2_one_state)
    forecast2 = m2.predict(future)

    df2_one_state_pred = generate_ForecastID(
        forecast2, 'Deaths', start_date, state, States
    )
    df2_all_states.append(df2_one_state_pred)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))






INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.





INFO:fbprophet:Making 6 forecasts with cutoffs between 2020-05-30 00:00:00 and 2020-10-27 00:00:00


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))



In [None]:
# Stack the per-state forecasts for each target and order them by ForecastID
df1_pred = (
    pd.concat(df1_all_states, ignore_index=True)
    .sort_values(by='ForecastID')
)
df2_pred = (
    pd.concat(df2_all_states, ignore_index=True)
    .sort_values(by='ForecastID')
)
# Put the 'Deaths' predictions next to the confirmed-case predictions (index-aligned)
df_pred = pd.concat([df1_pred, df2_pred['Deaths']], axis=1)
df_pred

In [None]:
# Write the submission file: only the ID and the two predicted targets, no row index
submission_columns = ['ForecastID', 'Confirmed', 'Deaths']
df_pred[submission_columns].to_csv('output/submission4.csv', index=False)