<a href="https://colab.research.google.com/github/danlingzhou16/stat390/blob/GraceZhu/Prophet_Throw_Everything.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [66]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import prophet
import time
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [67]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [68]:
train = pd.read_csv('/content/gdrive/MyDrive/complete covid dataset/train_final.csv', parse_dates = ['date'])
test = pd.read_csv('/content/gdrive/MyDrive/complete covid dataset/test_final.csv', parse_dates = ['date'])

In [69]:
print(train.country_code.unique())


['US' 'LU' 'IE' 'NO' 'CH' 'SG' 'QA' 'IS' 'DK' 'AU']


In [70]:
from prophet.make_holidays import make_holidays_df

us_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='US')
us_holidays['country_code'] ='US'
lu_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='LU')
lu_holidays['country_code'] ='LU'
ie_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='IE')
ie_holidays['country_code'] ='IE'
no_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='NO')
no_holidays['country_code'] = 'NO'
ch_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='CH')
ch_holidays['country_code'] = 'CH'
sg_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='SG')
sg_holidays['country_code'] = 'SG'
# qa_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='QA') Qatar is not supported
is_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='IS')
is_holidays['country_code'] = 'IS'
dk_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='DK')
dk_holidays['country_code'] = 'DK'
au_holidays = make_holidays_df(year_list=[2020 + i for i in range(3)], country='AU')
au_holidays['country_code'] = 'AU'

In [71]:
holidays = pd.concat([us_holidays, lu_holidays, ie_holidays, no_holidays, ch_holidays, sg_holidays, is_holidays, dk_holidays, au_holidays])

In [72]:
holidays.holiday = 1
holidays.tail()

Unnamed: 0,ds,holiday,country_code
23,2022-01-03,1,AU
24,2022-04-15,1,AU
25,2022-04-18,1,AU
26,2022-12-25,1,AU
27,2022-12-27,1,AU


In [73]:
train_with_holiday = train.merge(holidays, how = 'left', left_on = ['date', 'country_code'], right_on = ['ds', 'country_code'])
train_with_holiday['holiday'].fillna(0, inplace = True)
test_with_holiday = test.merge(holidays, how = 'left', left_on = ['date', 'country_code'], right_on = ['ds', 'country_code'])
test_with_holiday['holiday'].fillna(0, inplace = True)

In [74]:
# drop useless columns
train_multi_prophet = train_with_holiday.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0', 'day_name', 'new_confirmed_mean1', 'new_confirmed_max1', 'new_confirmed_min1', 'ds', 'new_confirmed_std1'])
test_multi_prophet = test_with_holiday.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0', 'day_name', 'new_confirmed_mean1', 'new_confirmed_max1', 'new_confirmed_min1', 'ds', 'new_confirmed_std1'])

In [75]:
train_multi_prophet.head()

Unnamed: 0,date,location_key_x,country_code,new_deceased,cumulative_deceased,population,population_male,population_female,latitude,longitude,...,new_confirmed_min7,day_of_week,quarter,month,year,dayofyear,dayofmonth,weekofyear,season,holiday
0,2020-01-22,US_AK,US,0.0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,...,,2,1,1,2020,22,22,4,Winter,0.0
1,2020-01-23,US_AK,US,0.0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,...,,3,1,1,2020,23,23,4,Winter,0.0
2,2020-01-24,US_AK,US,0.0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,...,,4,1,1,2020,24,24,4,Winter,0.0
3,2020-01-25,US_AK,US,0.0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,...,,5,1,1,2020,25,25,4,Winter,0.0
4,2020-01-26,US_AK,US,0.0,0.0,733391.0,424916.0,391925.0,64.0,-150.0,...,,6,1,1,2020,26,26,4,Winter,0.0


In [76]:
# label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder2 = LabelEncoder() # for season

# day of the week -- turns out there is a day_of_week column that has already been encoded
# however, Mon-Sun is from 0-6 and I want it to be 1-7
train_multi_prophet['day_of_week']= train_multi_prophet['day_of_week'] + 1
test_multi_prophet['day_of_week']= test_multi_prophet['day_of_week'] + 1
# season
train_multi_prophet['season'] = label_encoder2.fit_transform(train_multi_prophet['season'])
test_multi_prophet['season']= label_encoder2.transform(test_multi_prophet['season'])

In [77]:
# put zeros to missing lagging features
train_multi_prophet.fillna(0, inplace = True)
test_multi_prophet.fillna(0, inplace = True)

In [80]:
model2 = prophet.Prophet()
for x in train_multi_prophet.columns:
  if x not in ['date', 'new_confirmed', 'location_key_x', 'country_code']:
    model2.add_regressor(x)
model2=model2.fit(train_multi_prophet.reset_index().rename(columns={'date':'ds', 'new_confirmed':'y'}))

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmphsssw6jr/90y46gci.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmphsssw6jr/dvo33pyy.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=51957', 'data', 'file=/tmp/tmphsssw6jr/90y46gci.json', 'init=/tmp/tmphsssw6jr/dvo33pyy.json', 'output', 'file=/tmp/tmphsssw6jr/prophet_modelz_b88sbm/prophet_model-20231112213838.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
21:38:38 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
21:42:31 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [81]:
train_pred_multi=model2.predict(train_multi_prophet.reset_index().rename(columns={'date':'ds', 'new_confirmed':'y'})).set_index('ds')
train_pred_multi.head()

  df['trend'] = self.predict_trend(df)


Unnamed: 0_level_0,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,AG.LND.AGRI.K2,AG.LND.AGRI.K2_lower,AG.LND.AGRI.K2_upper,AG.LND.AGRI.ZS,AG.LND.AGRI.ZS_lower,...,year,year_lower,year_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-01-01,0.023025,-1391.991382,669.322488,0.023025,0.023025,0.251258,0.251258,0.251258,1.281745,1.281745,...,-16.970351,-16.970351,-16.970351,12.539071,12.539071,12.539071,0.0,0.0,0.0,-360.631368
2020-01-01,0.023025,-1296.897605,823.037639,0.023025,0.023025,0.248669,0.248669,0.248669,1.972763,1.972763,...,-16.970351,-16.970351,-16.970351,12.539071,12.539071,12.539071,0.0,0.0,0.0,-285.312132
2020-01-01,0.023025,-1256.003889,799.939644,0.023025,0.023025,0.251324,0.251324,0.251324,-2.708239,-2.708239,...,-16.970351,-16.970351,-16.970351,12.539071,12.539071,12.539071,0.0,0.0,0.0,-239.211003
2020-01-01,0.023025,-1425.87742,786.3652,0.023025,0.023025,0.251393,0.251393,0.251393,-3.111056,-3.111056,...,-16.970351,-16.970351,-16.970351,12.539071,12.539071,12.539071,0.0,0.0,0.0,-352.178325
2020-01-01,0.023025,-1411.919915,771.417199,0.023025,0.023025,0.246741,0.246741,0.246741,2.182737,2.182737,...,-16.970351,-16.970351,-16.970351,12.539071,12.539071,12.539071,0.0,0.0,0.0,-300.144883


In [90]:
test_pred_multi=model2.predict(test_multi_prophet.reset_index().rename(columns={'date':'ds', 'new_confirmed':'y'})).set_index('ds')

  df['trend'] = self.predict_trend(df)


In [91]:
print("the rmse on the training data is ", mean_squared_error(train_multi_prophet.new_confirmed, train_pred_multi.yhat, squared = False))
print("the rmse on the testing data is ", mean_squared_error(test_multi_prophet.new_confirmed, test_pred_multi.yhat, squared = False))

the rmse on the training data is  3498.5264515913764
the rmse on the testing data is  9671.22141184692


In [92]:
train_simp = train[['date', 'new_confirmed']]
test_simp = test[['date', 'new_confirmed']]

In [94]:
train_simp = train_simp.reset_index().rename(columns={'date':'ds', 'new_confirmed':'y'})
test_simp = test_simp.reset_index().rename(columns={'date':'ds', 'new_confirmed':'y'})

In [95]:
model1 = prophet.Prophet().fit(train_simp)

INFO:prophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
DEBUG:cmdstanpy:input tempfile: /tmp/tmphsssw6jr/i4e83pj6.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmphsssw6jr/2h61vqy7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.10/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=47940', 'data', 'file=/tmp/tmphsssw6jr/i4e83pj6.json', 'init=/tmp/tmphsssw6jr/2h61vqy7.json', 'output', 'file=/tmp/tmphsssw6jr/prophet_model9gtf6x4p/prophet_model-20231112215828.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
21:58:28 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
21:59:05 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


In [97]:
train_predict = model1.predict(train_simp)

In [102]:
train_predict.head()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,yearly,yearly_lower,yearly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2020-01-01,456.955859,-3741.236132,2555.68267,456.955859,456.955859,-1052.24957,-1052.24957,-1052.24957,83.894805,83.894805,83.894805,-1136.144375,-1136.144375,-1136.144375,0.0,0.0,0.0,-595.293711
1,2020-01-01,456.955859,-3633.168966,2268.703268,456.955859,456.955859,-1052.24957,-1052.24957,-1052.24957,83.894805,83.894805,83.894805,-1136.144375,-1136.144375,-1136.144375,0.0,0.0,0.0,-595.293711
2,2020-01-01,456.955859,-3544.814342,2476.944273,456.955859,456.955859,-1052.24957,-1052.24957,-1052.24957,83.894805,83.894805,83.894805,-1136.144375,-1136.144375,-1136.144375,0.0,0.0,0.0,-595.293711
3,2020-01-01,456.955859,-3468.553663,2524.579456,456.955859,456.955859,-1052.24957,-1052.24957,-1052.24957,83.894805,83.894805,83.894805,-1136.144375,-1136.144375,-1136.144375,0.0,0.0,0.0,-595.293711
4,2020-01-01,456.955859,-3817.250452,2479.391282,456.955859,456.955859,-1052.24957,-1052.24957,-1052.24957,83.894805,83.894805,83.894805,-1136.144375,-1136.144375,-1136.144375,0.0,0.0,0.0,-595.293711


In [99]:
# make the date the index column for y_train and y_train_predict
train_predict_y = train_predict[['ds','yhat']].set_index('ds')
train_date_index = train_simp.set_index('ds')

In [100]:
# predict the testing data
test_predict = model1.predict(test_simp)
test_predict_y = test_predict[['ds','yhat']].set_index('ds')
test_date_index = test_simp.set_index('ds')

In [104]:
print('The RMSE of the training data is ', mean_squared_error(train_simp.y, train_predict.yhat, squared=False))
print('The RMSE of the testing data is ', mean_squared_error(test_simp.y, test_predict.yhat, squared=False))

The RMSE of the training data is  2643.0029299775597
The RMSE of the testing data is  12620.692051886057
