In [6]:
import pandas as pd 
import csv  

# Import CSV with cases and deaths by state and county

In [7]:
# Read the Kaggle file of total cases and deaths by US state and county.
# Downloaded from:
# https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
file = "Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(filepath_or_buffer=file)
# Preview the first five rows as a quick check that the load worked.
kaggle_df.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [8]:
# Keep only the rows whose iso2 code is "US". The territory rows seen in the
# preview above (American Samoa, Guam, Northern Mariana Islands, Puerto Rico,
# Virgin Islands) carry their own iso2 codes and are therefore dropped.
kaggle_us = kaggle_df[kaggle_df["iso2"].eq("US")]

In [9]:
# Select the columns to carry forward. Everything not listed here is left
# behind: UID, iso2, iso3, code3 and Country_Region.
kaggle_rem_cols = kaggle_us.loc[
    :,
    [
        "FIPS",
        "Admin2",
        "Province_State",
        "Lat",
        "Long_",
        "Combined_Key",
        "Date",
        "Confirmed",
        "Deaths",
    ],
]

In [10]:
# Give three columns clearer names:
#   Admin2 -> County, Province_State -> State, Long_ -> Lng
kaggle_renamed = kaggle_rem_cols.rename(
    columns={
        "Admin2": "County",
        "Province_State": "State",
        "Long_": "Lng",
    }
)
# Preview the first ten rows of the renamed frame.
kaggle_renamed.head(n=10)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
5,1001.0,Autauga,Alabama,32.539527,-86.644082,"Autauga, Alabama, US",1/22/20,0,0
6,1003.0,Baldwin,Alabama,30.72775,-87.722071,"Baldwin, Alabama, US",1/22/20,0,0
7,1005.0,Barbour,Alabama,31.868263,-85.387129,"Barbour, Alabama, US",1/22/20,0,0
8,1007.0,Bibb,Alabama,32.996421,-87.125115,"Bibb, Alabama, US",1/22/20,0,0
9,1009.0,Blount,Alabama,33.982109,-86.567906,"Blount, Alabama, US",1/22/20,0,0
10,1011.0,Bullock,Alabama,32.100305,-85.712655,"Bullock, Alabama, US",1/22/20,0,0
11,1013.0,Butler,Alabama,31.753001,-86.680575,"Butler, Alabama, US",1/22/20,0,0
12,1015.0,Calhoun,Alabama,33.774837,-85.826304,"Calhoun, Alabama, US",1/22/20,0,0
13,1017.0,Chambers,Alabama,32.913601,-85.390727,"Chambers, Alabama, US",1/22/20,0,0
14,1019.0,Cherokee,Alabama,34.17806,-85.60639,"Cherokee, Alabama, US",1/22/20,0,0


In [11]:
# Write the cleaned cases/deaths table to a CSV file (UTF-8, without the
# DataFrame index) so it can be imported into Postgres.
kaggle_renamed.to_csv(
    path_or_buf="totals_cases_deaths.csv",
    index=False,
    encoding="utf-8",
)

# Import CSV with COVID-19 death forecasts by state

In [13]:
# Read the CSV of forecast potential deaths by state.
# Downloaded from:
# https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
file2 = "Data/forecast_data_0413.csv"
forecast_raw = pd.read_csv(filepath_or_buffer=file2)
# Look at the last five rows of the raw file.
forecast_raw.tail(n=5)

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1325,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Washington,939,,
1326,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wisconsin,199,,
1327,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,West Virginia,13,,
1328,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wyoming,15,,
1329,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,US,46548,,


In [15]:
# Drop the national aggregate: keep every row whose location_name is not "US",
# leaving only the per-state forecasts.
forecast_states = forecast_raw[forecast_raw["location_name"].ne("US")]
forecast_states.head(n=5)

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
4,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Alabama,152,102.0,325.0
5,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Alabama,243,116.0,676.0
6,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Alabama,339,131.0,1151.0
7,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Alabama,428,142.0,1836.0
8,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Delaware,52,36.0,146.0


In [17]:
# Discard every row that has a missing value in any column. Rows like the
# MOBS ones in the tail preview above, which have no quantile values, are
# removed by this step.
forecast_states = forecast_states.dropna(axis=0, how="any")
# Non-null count per column; equal counts confirm no gaps remain.
forecast_states.count()

model                   1220
forecast_date           1220
target                  1220
target_week_end_date    1220
location_name           1220
point                   1220
quantile_0.025          1220
quantile_0.975          1220
dtype: int64

In [20]:
# Rename columns for the exported table:
#   target_week_end_date -> target_end_date, location_name -> state, point -> actual
# NOTE(review): the `target` values shown in the previews are all
# "N wk ahead cum death" forecasts, so `point` looks like a predicted value
# rather than an observed one -- confirm that `actual` is the intended name.
forecast_renamed = forecast_states.rename(
    columns={
        "target_week_end_date": "target_end_date",
        "location_name": "state",
        "point": "actual",
    }
)
# Fixed seed so this random preview shows the same ten rows on every
# Restart & Run All instead of changing from run to run.
forecast_renamed.sample(10, random_state=42)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1018,IHME,4/13/2020,3 wk ahead cum death,5/2/2020,Colorado,444,334.0,670.0
1237,LANL,4/13/2020,4 wk ahead cum death,5/9/2020,West Virginia,17,6.0,179.0
400,CU 40% contact reduction,4/13/2020,4 wk ahead cum death,5/9/2020,Missouri,451,185.0,1146.0
586,CU 30% contact reduction,4/13/2020,4 wk ahead cum death,5/9/2020,Iowa,284,43.0,1768.0
105,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,New Jersey,4717,3001.0,9820.0
1051,LANL,4/13/2020,1 wk ahead cum death,4/18/2020,Minnesota,108,77.0,226.0
67,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Maryland,1015,369.0,4527.0
641,CU 20% contact reduction,4/13/2020,1 wk ahead cum death,4/18/2020,Montana,32,6.0,77.0
465,CU 30% contact reduction,4/13/2020,2 wk ahead cum death,4/25/2020,Arizona,541,172.0,1098.0
418,CU 30% contact reduction,4/13/2020,1 wk ahead cum death,4/18/2020,Connecticut,602,314.0,1139.0


In [21]:
# Write the cleaned forecast table to a CSV file (UTF-8, without the
# DataFrame index) so it can be imported into Postgres.
forecast_renamed.to_csv(
    path_or_buf="forecast.csv",
    index=False,
    encoding="utf-8",
)