### This notebook evaluates the accuracy of the MSPH projections for COVID daily new infections (from May 3rd through May 10th)
> The focus is on the period from May 3rd to May 10th in order to evaluate the model over the same period that we evaluate the IHME model. This is the most recent week of historical data before the start date for projected data.  <br><br>
> The results are...

In [1]:
# import libraries

import numpy as np
import pandas as pd
import us
import datetime

from sklearn import metrics 

import matplotlib.pyplot as plt

### Get the predictions

In [2]:
# Load the May 3rd MSPH projection file into a DataFrame

projection_csv = './Raw_Data/MSPH_5_3_Projection_80contact.csv'
msph = pd.read_csv(projection_csv)

# Report the frame's dimensions, then preview its first rows
print(msph.shape)
msph.head()

(131964, 13)


Unnamed: 0,county,fips,Date,report_2.5,report_25,report_50,report_75,report_97.5,total_2.5,total_25,total_50,total_75,total_97.5
0,Autauga County AL,1001,05/03/20,0,1,2,3,5,3,8,12,17,28
1,Baldwin County AL,1003,05/03/20,2,4,5,8,12,4,9,14,21,40
2,Barbour County AL,1005,05/03/20,0,0,0,0,1,0,0,0,1,5
3,Bibb County AL,1007,05/03/20,0,0,0,0,1,0,0,0,0,2
4,Blount County AL,1009,05/03/20,0,0,1,2,4,1,4,6,9,22


In [3]:
# Extract the rows for California

# Keep only counties whose label ends with the 'CA' state suffix.
# na=False treats a missing county label as "not California", so the boolean
# mask can never contain NaN (which .loc would refuse to index with).
is_california = msph['county'].str.endswith('CA', na=False)

# reset_index(drop=True) renumbers the surviving rows from 0 in a single step,
# instead of materialising the old index as an 'index' column and then
# dropping it again.
msph = msph.loc[is_california].reset_index(drop=True)

# Display the results
print(msph.shape)
msph.head()

(2436, 13)


Unnamed: 0,county,fips,Date,report_2.5,report_25,report_50,report_75,report_97.5,total_2.5,total_25,total_50,total_75,total_97.5
0,Alameda County CA,6001,05/03/20,27,37,43,49,58,117,157,189,213,268
1,Alpine County CA,6003,05/03/20,0,0,0,0,0,0,0,0,0,0
2,Amador County CA,6005,05/03/20,0,0,0,0,0,0,0,0,0,0
3,Butte County CA,6007,05/03/20,0,0,0,0,3,0,0,0,0,2
4,Calaveras County CA,6009,05/03/20,0,0,0,0,2,0,0,0,0,3


In [4]:
# Strip the ' County CA' text from every county label,
# leaving just the bare county name (e.g. 'Alameda').

county_names = msph['county'].str.replace(' County CA', '')
msph = msph.assign(county=county_names)

# Show the frame's dimensions and a preview of the cleaned labels
print(msph.shape)
msph.head()

(2436, 13)


Unnamed: 0,county,fips,Date,report_2.5,report_25,report_50,report_75,report_97.5,total_2.5,total_25,total_50,total_75,total_97.5
0,Alameda,6001,05/03/20,27,37,43,49,58,117,157,189,213,268
1,Alpine,6003,05/03/20,0,0,0,0,0,0,0,0,0,0
2,Amador,6005,05/03/20,0,0,0,0,0,0,0,0,0,0
3,Butte,6007,05/03/20,0,0,0,0,3,0,0,0,0,2
4,Calaveras,6009,05/03/20,0,0,0,0,2,0,0,0,0,3


In [5]:
# Select the target column:
# the middle projection for the total number of new daily infections,
# which includes both reported and unreported cases.

# Rename first, then select by the new names. rename() ignores labels that
# are absent, so re-running this cell after the columns have already been
# renamed is a harmless no-op rather than a KeyError on 'Date'/'total_50'.
# Chaining also avoids an in-place rename on a column subset of the frame.
msph = (
    msph
    .rename(columns={'Date': 'date', 'total_50': 'est_new_infections'})
    .loc[:, ['county', 'date', 'est_new_infections']]
)

# Display the results
print(msph.shape)
msph.head()

(2436, 3)


Unnamed: 0,county,date,est_new_infections
0,Alameda,05/03/20,189
1,Alpine,05/03/20,0
2,Amador,05/03/20,0
3,Butte,05/03/20,0
4,Calaveras,05/03/20,0


In [6]:
# Convert the date column from string to date-time dtype.

# The raw strings are month/day/two-digit-year (e.g. '05/03/20'). Stating the
# format explicitly removes any month-vs-day ambiguity and makes a malformed
# value raise an error instead of being parsed by inference.
msph['date'] = pd.to_datetime(msph['date'], format='%m/%d/%y')

# Display the results
print(msph.shape)
msph.head()

(2436, 3)


Unnamed: 0,county,date,est_new_infections
0,Alameda,2020-05-03,189
1,Alpine,2020-05-03,0
2,Amador,2020-05-03,0
3,Butte,2020-05-03,0
4,Calaveras,2020-05-03,0


In [7]:
# Keep only the evaluation window: May 3rd through May 10th, both ends included

in_window = msph['date'].between('2020-05-03', '2020-05-10')
msph = msph.loc[in_window]

# Show how many rows remain and preview them
print(msph.shape)
msph.head()

(464, 3)


Unnamed: 0,county,date,est_new_infections
0,Alameda,2020-05-03,189
1,Alpine,2020-05-03,0
2,Amador,2020-05-03,0
3,Butte,2020-05-03,0
4,Calaveras,2020-05-03,0


In [8]:
# Order the rows by county, and by date within each county,
# then renumber the index from 0 to match the new row order.

msph = msph.sort_values(['county', 'date']).reset_index(drop=True)

# Show the frame's dimensions and the first rows of the sorted result
print(msph.shape)
msph.head()

(464, 3)


Unnamed: 0,county,date,est_new_infections
0,Alameda,2020-05-03,189
1,Alameda,2020-05-04,180
2,Alameda,2020-05-05,178
3,Alameda,2020-05-06,173
4,Alameda,2020-05-07,177


### Get the actual values