In [1]:
# Script to test out different parameters and see what effect this has on the model

import numpy as np
import matplotlib.pyplot as plt
from datetime import date, datetime, timedelta
from pandas.io.stata import StataParser
from pytrends.request import TrendReq
import time
import pandas as pd
import matplotlib
import google_trends_daily.gtrend as gtrend

from sklearn.model_selection import train_test_split
import platform

import RF
import lstm
import linear
import postprocessing
import platform
# Making a new preprocessing file so I can mess with the inputs/outputs
import preprocessing_for_variation as prep


In [2]:

# Just for reference for the JHU data files
startOfCovid = '1/22/20' # The first day where data has been recorded
endOfCovid = '10/30/21' # The last day of relevance to this project
# NOTE delay can be a maximum of 30 because of this end date and the way I load all the data at once

# Step 1: Get the trends data for a very large timeframe so we can pre-load this and not web-scrape each time

xDateFormat = '%Y-%m-%d'
if platform.system() == 'Windows':
    yDateFormat = '%#m/%#d/%y'
else:
    yDateFormat = '%-m/%-d/%y'

# Using a delay of 16 days
startDateX = '2020-02-01'
endDateX = '2021-09-30'
delay = timedelta(days=16)
startDateY = (datetime.strptime(startDateX, xDateFormat) + delay).strftime(xDateFormat) # these should be in xdate format
endDateY = (datetime.strptime(endDateX, xDateFormat) + delay).strftime(xDateFormat)
geo = 'US-CA'
state = 'California'


X_all, Y_all_16delay, Y_all = prep.getDataNew(startDateX, endDateX, startDateY, endDateY, geo, state)
# X_all is the search term data for the entire giant period
# Y_all_16delay is the case data for the entire period, with the delay included


Fetching 'covid' for period:2021-01-04 2021-09-30
Fetching 'covid' for period:2020-07-19 2021-04-14
Normalize by overlapping period:2021-01-04 2021-04-14
Fetching 'covid' for period:2020-02-01 2020-10-27
Normalize by overlapping period:2020-07-19 2020-10-27
Fetching 'covid' for period:2019-08-16 2020-05-11
Normalize by overlapping period:2020-02-01 2020-05-11
Fetching 'coronavirus' for period:2021-01-04 2021-09-30
Fetching 'coronavirus' for period:2020-07-19 2021-04-14
Normalize by overlapping period:2021-01-04 2021-04-14
Fetching 'coronavirus' for period:2020-02-01 2020-10-27
Normalize by overlapping period:2020-07-19 2020-10-27
Fetching 'coronavirus' for period:2019-08-16 2020-05-11
Normalize by overlapping period:2020-02-01 2020-05-11
Fetching 'covid-19' for period:2021-01-04 2021-09-30
Fetching 'covid-19' for period:2020-07-19 2021-04-14
Normalize by overlapping period:2021-01-04 2021-04-14
Fetching 'covid-19' for period:2020-02-01 2020-10-27
Normalize by overlapping period:2020-07

In [3]:

# Indexing this by date as well:
sdate = datetime.strptime(startDateX, xDateFormat)
edate = datetime.strptime(endDateX, xDateFormat)
dates = pd.date_range(sdate,edate,freq='d')
dateList = dates.strftime(xDateFormat).to_list()
X_all_dateIndexed = X_all.copy(deep=True)
X_all_dateIndexed['dates'] = dateList
X_all_dateIndexed = X_all_dateIndexed.set_index('dates')
# Y is already indexed by date, but needs to be converted to datetime form
Y_all_16delay.index = pd.to_datetime(Y_all_16delay.index)
Y_all.index = pd.to_datetime(Y_all.index)



In [4]:

# Varying the split percentage for train and test ############################################################

# Get a subset of data based on a smaller timeframe (let this be a 4 month period)

# X_all is indexed by day number, rather than the date itself
# So, index X_all by the actual dates rather than just integers


startDate_subset_X = '2020-02-01'
endDate_subset_X = '2020-06-01'
startDate_subset_Y = '2/17/20'
endDate_subset_Y = '6/17/20'

X = X_all_dateIndexed[startDate_subset_X:endDate_subset_X]
Y = Y_all_16delay[startDate_subset_Y:endDate_subset_Y]

rmsdata_varyTrainPct = [] # initialize

percentages = [0.5, 0.6, 0.7, 0.8, 0.9]

for train_percentage in percentages:
    split_idx = round(len(Y)*train_percentage)
    print(f'split idx in my code: {split_idx}')
    # Linear regression
    linear_predict = linear.linear(X, Y, train_percentage)
    print('Y[splitidx:] size= ')
    print(Y[split_idx:].size)
    print('linear_predict len')
    print(len(linear_predict))
    linear_rms = postprocessing.MAPE(Y[split_idx:], linear_predict)
    # LSTM
    lstm_predict = lstm.lstm(X, Y, train_percentage)
    lstm_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)
    # RF
    RF_kwargs = {'bootstrap': True,'criterion': 'squared_error','max_depth': None,'min_samples_leaf': 1,'n_estimators': 1000,'min_samples_split': 2,'random_state': 0 }
    RFmodel, RF_predict = RF.randomForest(X[:split_idx], X[split_idx:], Y[:split_idx], RF_kwargs)
    RF_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)

    rmsdata_varyTrainPct.append([train_percentage, linear_rms, lstm_rms, RF_rms])
    # Store the rms data 

trainPct_df = pd.DataFrame(rmsdata_varyTrainPct, columns=['Train %', 'Linear Regression', 'LSTM', 'Random Forest'])
trainPct_df.set_index('Train %')
trainPct_df.style.set_caption('Varying the Train/Test Split (Train Percentage): Effects on MAPE')
print(trainPct_df)



split idx in my code: 61
(122, 11)
(61, 11)
(61, 11)
Y[splitidx:] size= 
61
linear_predict len
61
Epoch 1/200
1/1 - 2s - loss: 0.3619 - val_loss: 0.5863 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 0.3457 - val_loss: 0.5679 - 26ms/epoch - 26ms/step
Epoch 3/200
1/1 - 0s - loss: 0.3295 - val_loss: 0.5495 - 30ms/epoch - 30ms/step
Epoch 4/200
1/1 - 0s - loss: 0.3132 - val_loss: 0.5310 - 34ms/epoch - 34ms/step
Epoch 5/200
1/1 - 0s - loss: 0.2968 - val_loss: 0.5124 - 35ms/epoch - 35ms/step
Epoch 6/200
1/1 - 0s - loss: 0.2804 - val_loss: 0.4936 - 33ms/epoch - 33ms/step
Epoch 7/200
1/1 - 0s - loss: 0.2638 - val_loss: 0.4747 - 37ms/epoch - 37ms/step
Epoch 8/200
1/1 - 0s - loss: 0.2472 - val_loss: 0.4557 - 35ms/epoch - 35ms/step
Epoch 9/200
1/1 - 0s - loss: 0.2304 - val_loss: 0.4365 - 28ms/epoch - 28ms/step
Epoch 10/200
1/1 - 0s - loss: 0.2135 - val_loss: 0.4170 - 35ms/epoch - 35ms/step
Epoch 11/200
1/1 - 0s - loss: 0.1970 - val_loss: 0.3975 - 36ms/epoch - 36ms/step
Epoch 12/200
1/1 - 0s - 

In [5]:
trainPct_df

Unnamed: 0,Train %,Linear Regression,LSTM,Random Forest
0,0.5,0.539043,0.751602,0.751602
1,0.6,0.402675,0.454209,0.454209
2,0.7,0.33621,0.461212,0.461212
3,0.8,0.311729,0.336806,0.336806
4,0.9,0.289912,0.273797,0.273797


In [6]:

# Change the length of the timeframe being considered ############################################################
durations = [1,2,3,4,5,6,7,8,9] # months

# Arbitrary start date
startDate_subset_X = '2020-02-01'
startDate_subset_Y = '2020-02-17'

rmsdata_varyDuration = [] # initialize

train_percentage = 0.75

for duration in durations:
    dt = timedelta(days=30*duration) # timedelta does not work with months, using 30*months for days
    endDate_subset_X = (datetime.strptime(startDate_subset_X, xDateFormat) + dt).strftime(xDateFormat)
    endDate_subset_Y = (datetime.strptime(startDate_subset_Y, xDateFormat) + dt).strftime(yDateFormat)
    X = X_all_dateIndexed[startDate_subset_X:endDate_subset_X]
    Y = Y_all_16delay[startDate_subset_Y:endDate_subset_Y]
    
    split_idx = round(len(Y)*train_percentage)
    # Linear regression
    linear_predict = linear.linear(X, Y, train_percentage)
    linear_rms = postprocessing.MAPE(Y[split_idx:], linear_predict)
    # LSTM
    lstm_predict = lstm.lstm(X, Y, train_percentage)
    lstm_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)
    # RF
    RF_kwargs = {'bootstrap': True,'criterion': 'squared_error','max_depth': None,'min_samples_leaf': 1,'n_estimators': 1000,'min_samples_split': 2,'random_state': 0 }
    RFmodel, RF_predict = RF.randomForest(X[:split_idx], X[split_idx:], Y[:split_idx], RF_kwargs)
    RF_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)

    rmsdata_varyDuration.append([duration, linear_rms, lstm_rms, RF_rms])

duration_df = pd.DataFrame(rmsdata_varyDuration, columns=['Duration', 'Linear Regression', 'LSTM', 'Random Forest'])
duration_df.set_index('Duration')
duration_df.style.set_caption('Varying the model duration (months of data): Effects on MAPE')
print(duration_df)



(31, 11)
(23, 11)
(8, 11)
Epoch 1/200
1/1 - 2s - loss: 0.0513 - val_loss: 0.4666 - 2s/epoch - 2s/step
Epoch 2/200
1/1 - 0s - loss: 0.0428 - val_loss: 0.4418 - 27ms/epoch - 27ms/step
Epoch 3/200
1/1 - 0s - loss: 0.0357 - val_loss: 0.4200 - 28ms/epoch - 28ms/step
Epoch 4/200
1/1 - 0s - loss: 0.0346 - val_loss: 0.4037 - 35ms/epoch - 35ms/step
Epoch 5/200
1/1 - 0s - loss: 0.0347 - val_loss: 0.3920 - 40ms/epoch - 40ms/step
Epoch 6/200
1/1 - 0s - loss: 0.0347 - val_loss: 0.3839 - 43ms/epoch - 43ms/step
Epoch 7/200
1/1 - 0s - loss: 0.0344 - val_loss: 0.3787 - 34ms/epoch - 34ms/step
Epoch 8/200
1/1 - 0s - loss: 0.0336 - val_loss: 0.3756 - 38ms/epoch - 38ms/step
Epoch 9/200
1/1 - 0s - loss: 0.0326 - val_loss: 0.3741 - 43ms/epoch - 43ms/step
Epoch 10/200
1/1 - 0s - loss: 0.0313 - val_loss: 0.3745 - 34ms/epoch - 34ms/step
Epoch 11/200
1/1 - 0s - loss: 0.0299 - val_loss: 0.3763 - 39ms/epoch - 39ms/step
Epoch 12/200
1/1 - 0s - loss: 0.0282 - val_loss: 0.3795 - 35ms/epoch - 35ms/step
Epoch 13/200
1/

In [7]:
duration_df

Unnamed: 0,Duration,Linear Regression,LSTM,Random Forest
0,1,0.581079,0.351074,0.351074
1,2,5.167295,0.416824,0.416824
2,3,0.37538,0.43945,0.43945
3,4,0.313941,0.430156,0.430156
4,5,0.510405,0.527744,0.527744
5,6,0.246124,0.42746,0.42746
6,7,0.28284,0.231652,0.231652
7,8,0.375436,0.634258,0.634258
8,9,0.35142,0.290509,0.290509


In [8]:

# Change the start time ############################################################
startDatesX = ['2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01', '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01']
dt = timedelta(days=16)
duration = timedelta(days = 120)
rmsdata_varyStartDate = []
for sdateX in startDatesX:
    sdateY = (datetime.strptime(sdateX, xDateFormat) + dt).strftime(yDateFormat)
    edateX = (datetime.strptime(sdateX, xDateFormat) + duration).strftime(xDateFormat)
    edateY = (datetime.strptime(sdateX, xDateFormat) + dt + duration ).strftime(yDateFormat)

    X = X_all_dateIndexed[sdateX:edateX]
    Y = Y_all_16delay[sdateY:edateY]
    
    split_idx = round(len(Y)*train_percentage)
    # Linear regression
    linear_predict = linear.linear(X, Y, train_percentage)
    linear_rms = postprocessing.MAPE(Y[split_idx:], linear_predict)
    # LSTM
    lstm_predict = lstm.lstm(X, Y, train_percentage)
    lstm_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)
    # RF
    RF_kwargs = {'bootstrap': True,'criterion': 'squared_error','max_depth': None,'min_samples_leaf': 1,'n_estimators': 1000,'min_samples_split': 2,'random_state': 0 }
    RFmodel, RF_predict = RF.randomForest(X[:split_idx], X[split_idx:], Y[:split_idx], RF_kwargs)
    RF_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)

    rmsdata_varyStartDate.append([(sdateX+ ' : ' + edateX), linear_rms, lstm_rms, RF_rms])

startDate_df = pd.DataFrame(rmsdata_varyStartDate, columns=['Timeframe', 'Linear Regression', 'LSTM', 'Random Forest'])
startDate_df.set_index('Timeframe')
startDate_df.style.set_caption('Adjusting the timeframe period: Effects on MAPE')
print(startDate_df)




(121, 11)
(91, 11)
(30, 11)
Epoch 1/200
2/2 - 2s - loss: 0.3179 - val_loss: 0.6865 - 2s/epoch - 1s/step
Epoch 2/200
2/2 - 0s - loss: 0.2820 - val_loss: 0.6626 - 28ms/epoch - 14ms/step
Epoch 3/200
2/2 - 0s - loss: 0.2486 - val_loss: 0.6389 - 31ms/epoch - 15ms/step
Epoch 4/200
2/2 - 0s - loss: 0.2154 - val_loss: 0.6151 - 51ms/epoch - 25ms/step
Epoch 5/200
2/2 - 0s - loss: 0.1822 - val_loss: 0.5913 - 48ms/epoch - 24ms/step
Epoch 6/200
2/2 - 0s - loss: 0.1521 - val_loss: 0.5676 - 44ms/epoch - 22ms/step
Epoch 7/200
2/2 - 0s - loss: 0.1366 - val_loss: 0.5446 - 44ms/epoch - 22ms/step
Epoch 8/200
2/2 - 0s - loss: 0.1275 - val_loss: 0.5227 - 40ms/epoch - 20ms/step
Epoch 9/200
2/2 - 0s - loss: 0.1225 - val_loss: 0.5020 - 41ms/epoch - 20ms/step
Epoch 10/200
2/2 - 0s - loss: 0.1210 - val_loss: 0.4826 - 65ms/epoch - 32ms/step
Epoch 11/200
2/2 - 0s - loss: 0.1217 - val_loss: 0.4648 - 62ms/epoch - 31ms/step
Epoch 12/200
2/2 - 0s - loss: 0.1232 - val_loss: 0.4486 - 52ms/epoch - 26ms/step
Epoch 13/200


In [9]:
startDate_df

Unnamed: 0,Timeframe,Linear Regression,LSTM,Random Forest
0,2020-03-01 : 2020-06-29,0.514421,0.597092,0.597092
1,2020-04-01 : 2020-07-30,0.314911,0.247219,0.247219
2,2020-05-01 : 2020-08-29,0.761793,0.501307,0.501307
3,2020-06-01 : 2020-09-29,0.182064,0.24172,0.24172
4,2020-07-01 : 2020-10-29,0.507028,0.350969,0.350969
5,2020-08-01 : 2020-11-29,0.48384,0.522466,0.522466
6,2020-09-01 : 2020-12-30,0.706955,0.903548,0.903548
7,2020-10-01 : 2021-01-29,0.753958,1.021113,1.021113
8,2020-11-01 : 2021-03-01,0.539867,0.427772,0.427772


In [10]:

# Change the amount of time delay ############################################################

delays = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
rmsdata_varydelay = []
# Arbitrary start and end dates
sdateX = '2020-02-01'
edateX = '2020-06-01'
train_percentage = 0.75

for delay in delays:
    dt = timedelta(days=delay)

    sdateY = (datetime.strptime(sdateX, xDateFormat) + dt).strftime(yDateFormat)
    edateY = (datetime.strptime(edateX, xDateFormat) + dt).strftime(yDateFormat)

    X = X_all_dateIndexed[sdateX:edateX]
    Y = Y_all[sdateY:edateY]
    
    split_idx = round(len(Y)*train_percentage)
    # Linear regression
    linear_predict = linear.linear(X, Y, train_percentage)
    linear_rms = postprocessing.MAPE(Y[split_idx:], linear_predict)
    # LSTM
    lstm_predict = lstm.lstm(X, Y, train_percentage)
    lstm_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)
    # RF
    RF_kwargs = {'bootstrap': True,'criterion': 'squared_error','max_depth': None,'min_samples_leaf': 1,'n_estimators': 1000,'min_samples_split': 2,'random_state': 0 }
    RFmodel, RF_predict = RF.randomForest(X[:split_idx], X[split_idx:], Y[:split_idx], RF_kwargs)
    RF_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)

    rmsdata_varydelay.append([delay, linear_rms, lstm_rms, RF_rms])

delay_df = pd.DataFrame(rmsdata_varydelay, columns=['Delay, days', 'Linear Regression', 'LSTM', 'Random Forest'])
delay_df.set_index('Delay, days')
delay_df.style.set_caption('Adjusting the delay between searches and cases: Effects on MAPE')
print(delay_df)


(122, 11)
(92, 11)
(30, 11)
Epoch 1/200
2/2 - 2s - loss: 0.5050 - val_loss: 0.7386 - 2s/epoch - 1s/step
Epoch 2/200
2/2 - 0s - loss: 0.4683 - val_loss: 0.7079 - 29ms/epoch - 14ms/step
Epoch 3/200
2/2 - 0s - loss: 0.4327 - val_loss: 0.6772 - 29ms/epoch - 14ms/step
Epoch 4/200
2/2 - 0s - loss: 0.3982 - val_loss: 0.6464 - 148ms/epoch - 74ms/step
Epoch 5/200
2/2 - 0s - loss: 0.3636 - val_loss: 0.6154 - 75ms/epoch - 37ms/step
Epoch 6/200
2/2 - 0s - loss: 0.3287 - val_loss: 0.5839 - 44ms/epoch - 22ms/step
Epoch 7/200
2/2 - 0s - loss: 0.2933 - val_loss: 0.5520 - 36ms/epoch - 18ms/step
Epoch 8/200
2/2 - 0s - loss: 0.2612 - val_loss: 0.5199 - 39ms/epoch - 19ms/step
Epoch 9/200
2/2 - 0s - loss: 0.2405 - val_loss: 0.4880 - 36ms/epoch - 18ms/step
Epoch 10/200
2/2 - 0s - loss: 0.2229 - val_loss: 0.4560 - 34ms/epoch - 17ms/step
Epoch 11/200
2/2 - 0s - loss: 0.2080 - val_loss: 0.4242 - 36ms/epoch - 18ms/step
Epoch 12/200
2/2 - 0s - loss: 0.1968 - val_loss: 0.3924 - 37ms/epoch - 18ms/step
Epoch 13/200

In [11]:
delay_df

Unnamed: 0,"Delay, days",Linear Regression,LSTM,Random Forest
0,0,0.275964,0.317172,0.317172
1,2,0.259195,0.394456,0.394456
2,4,0.23941,0.261039,0.261039
3,6,0.303361,0.250846,0.250846
4,8,0.29789,0.301668,0.301668
5,10,0.326743,0.397384,0.397384
6,12,0.300948,0.296262,0.296262
7,14,0.27155,0.305663,0.305663
8,16,0.334751,0.456348,0.456348
9,18,0.324615,0.25145,0.25145


In [12]:

# Change the state being considered ############################################################
# This will require re-parsing the data from gtrends

other_geos = ['US-MA', 'US-WA', 'US-TX', 'US-IL']
other_states = ['Massachusetts', 'Washington', 'Texas', 'Illinois']

# Setting the timeframe to an arbitrary 4-month period
startDateX = '2020-02-01'
endDateX = '2020-06-01'
delay = timedelta(16)
startDateY = (datetime.strptime(startDateX, xDateFormat) + delay).strftime(xDateFormat) # THESE SHOULD BE IN XDATE FORMAT
endDateY = (datetime.strptime(endDateX, xDateFormat) + delay).strftime(xDateFormat) #  ... because these are passed onto getDataNew()

train_percentage = 0.75

rmsdata_varyState = []

for i in range(len(other_states)):
    geo = other_geos[i]
    state = other_states[i]

    X, Y, Y_noDelay = prep.getDataNew(startDateX, endDateX, startDateY, endDateY, geo, state)

    split_idx = round(len(Y)*train_percentage)
    # Linear regression
    linear_predict = linear.linear(X, Y, train_percentage)
    linear_rms = postprocessing.MAPE(Y[split_idx:], linear_predict)
    # LSTM
    lstm_predict = lstm.lstm(X, Y, train_percentage)
    lstm_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)
    # RF
    RF_kwargs = {'bootstrap': True,'criterion': 'squared_error','max_depth': None,'min_samples_leaf': 1,'n_estimators': 1000,'min_samples_split': 2,'random_state': 0 }
    RFmodel, RF_predict = RF.randomForest(X[:split_idx], X[split_idx:], Y[:split_idx], RF_kwargs)
    RF_rms = postprocessing.MAPE(Y[split_idx:], lstm_predict)

    rmsdata_varyState.append([state, linear_rms, lstm_rms, RF_rms])

state_df = pd.DataFrame(rmsdata_varyState, columns=['State', 'Linear Regression', 'LSTM', 'Random Forest'])
state_df = state_df.set_index('State')
state_df = state_df.style.set_caption('Comparing the MAPE when training on different states for a given time period')
print(state_df)





Fetching 'covid' for period:2019-09-06 2020-06-01
Fetching 'coronavirus' for period:2019-09-06 2020-06-01
Fetching 'covid-19' for period:2019-09-06 2020-06-01
Fetching 'covid cases' for period:2019-09-06 2020-06-01
Fetching 'coronavirus cases' for period:2019-09-06 2020-06-01
Fetching 'covid symptoms' for period:2019-09-06 2020-06-01
Fetching 'coronavirus symptoms' for period:2019-09-06 2020-06-01
Fetching 'cough' for period:2019-09-06 2020-06-01
Fetching 'virus' for period:2019-09-06 2020-06-01
Fetching 'vaccine' for period:2019-09-06 2020-06-01
Fetching 'covid vaccine' for period:2019-09-06 2020-06-01
(122, 11)
(92, 11)
(30, 11)
Epoch 1/200
2/2 - 2s - loss: 0.3701 - val_loss: 0.1545 - 2s/epoch - 1s/step
Epoch 2/200
2/2 - 0s - loss: 0.3400 - val_loss: 0.1275 - 32ms/epoch - 16ms/step
Epoch 3/200
2/2 - 0s - loss: 0.3143 - val_loss: 0.1017 - 36ms/epoch - 18ms/step
Epoch 4/200
2/2 - 0s - loss: 0.2905 - val_loss: 0.0814 - 49ms/epoch - 24ms/step
Epoch 5/200
2/2 - 0s - loss: 0.2669 - val_los

In [13]:
state_df

Unnamed: 0_level_0,Linear Regression,LSTM,Random Forest
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Massachusetts,2.868492,1.745976,1.745976
Washington,1.5984386523443772e+16,2.469384669734721e+16,2.469384669734721e+16
Texas,0.392996,0.388287,0.388287
Illinois,0.956853,0.909461,0.909461
