In [6]:
import pandas as pd 
import csv  

# Import Kaggle CSV With Cases & Deaths By State #

In [7]:
# Load the Kaggle dataset of cumulative confirmed cases and deaths per US state/county.
# Source: https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
file = "Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(filepath_or_buffer=file)

# Preview the first rows to confirm the file was parsed as expected.
kaggle_df.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [8]:
# Keep only the rows whose iso2 code is "US"; this drops the territory rows
# (American Samoa, Guam, Northern Mariana Islands, Puerto Rico, Virgin Islands).
is_us_row = kaggle_df["iso2"].eq("US")
kaggle_us = kaggle_df[is_us_row]

In [9]:
# Keep only the columns needed downstream. Everything not listed here is dropped:
# UID, iso2, iso3, code3, Country_Region and the unnamed index column from the CSV.
kaggle_rem_cols = kaggle_us.loc[
    :,
    [
        "FIPS",
        "Admin2",
        "Province_State",
        "Lat",
        "Long_",
        "Combined_Key",
        "Date",
        "Confirmed",
        "Deaths",
    ],
]

In [33]:
# Give the columns friendlier names: Admin2 -> County, Province_State -> State, Long_ -> Lng.
kaggle_renamed = kaggle_rem_cols.rename(
    {
        "Admin2": "County",
        "Province_State": "State",
        "Long_": "Lng",
    },
    axis="columns",
)

# Show a random sample of rows to spot-check the renamed columns.
kaggle_renamed.sample(n=15)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
184097,35031.0,McKinley,New Mexico,35.580077,-108.262245,"McKinley, New Mexico, US",3/18/20,0,0
217659,51009.0,Amherst,Virginia,37.603083,-79.145487,"Amherst, Virginia, US",3/28/20,2,0
43613,26125.0,Oakland,Michigan,42.660901,-83.385954,"Oakland, Michigan, US",2/4/20,0,0
71200,51041.0,Chesterfield,Virginia,37.373732,-77.586801,"Chesterfield, Virginia, US",2/12/20,0,0
193424,27121.0,Pope,Minnesota,45.586067,-95.444512,"Pope, Minnesota, US",3/21/20,0,0
198847,8085.0,Montrose,Colorado,38.402141,-108.263902,"Montrose, Colorado, US",3/23/20,1,0
27646,30007.0,Broadwater,Montana,46.332776,-111.495581,"Broadwater, Montana, US",1/30/20,0,0
91456,9007.0,Middlesex,Connecticut,41.459497,-72.537149,"Middlesex, Connecticut, US",2/19/20,0,0
61940,2230.0,Skagway,Alaska,59.5615,-135.333775,"Skagway, Alaska, US",2/10/20,0,0
208330,1011.0,Bullock,Alabama,32.100305,-85.712655,"Bullock, Alabama, US",3/26/20,2,0


In [34]:
# Inspect the column dtypes to see whether Date was read as strings (object) or as datetimes
kaggle_renamed.dtypes

FIPS            float64
County           object
State            object
Lat             float64
Lng             float64
Combined_Key     object
Date             object
Confirmed         int64
Deaths            int64
dtype: object

In [37]:
# Parse the Date strings (month/day/two-digit year, e.g. "3/18/20") into datetime values.
# .assign returns a new frame rather than writing a column into the existing object,
# so the cell does not mutate a DataFrame that earlier cells built and displayed.
kaggle_renamed = kaggle_renamed.assign(
    Date=pd.to_datetime(kaggle_renamed["Date"], format="%m/%d/%y")
)

In [38]:
# Confirm that the Date column's dtype changed from object to datetime
kaggle_renamed.dtypes

FIPS                   float64
County                  object
State                   object
Lat                    float64
Lng                    float64
Combined_Key            object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
dtype: object

In [39]:
# Write the cleaned Kaggle frame to CSV (without the index) so it can be loaded into Postgres.
kaggle_renamed.to_csv(
    path_or_buf="totals_cases_deaths.csv",
    index=False,
    encoding="utf-8",
)

# Import CDC CSV With COVID Forecasts By State #

In [13]:
# Load the CDC forecast file of projected cumulative deaths by location.
# Source: https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
file2 = "Data/forecast_data_0413.csv"
forecast_raw = pd.read_csv(filepath_or_buffer=file2)

# Look at the last rows; the file ends with the nationwide "US" entry.
forecast_raw.tail()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1325,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Washington,939,,
1326,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wisconsin,199,,
1327,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,West Virginia,13,,
1328,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wyoming,15,,
1329,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,US,46548,,


In [15]:
# Drop the nationwide rows (location_name == "US") so only per-state forecasts remain.
is_state_row = forecast_raw["location_name"].ne("US")
forecast_states = forecast_raw[is_state_row]
forecast_states.head()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
4,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Alabama,152,102.0,325.0
5,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Alabama,243,116.0,676.0
6,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Alabama,339,131.0,1151.0
7,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Alabama,428,142.0,1836.0
8,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Delaware,52,36.0,146.0


In [17]:
# Discard every row that has a missing value in any column
# (e.g. forecasts published without quantile bounds).
forecast_states = forecast_states.dropna(axis="index", how="any")

# Non-null counts per column; they should all match after the drop.
forecast_states.count()

model                   1220
forecast_date           1220
target                  1220
target_week_end_date    1220
location_name           1220
point                   1220
quantile_0.025          1220
quantile_0.975          1220
dtype: int64

In [32]:
# Shorten column names: target_week_end_date -> target_end_date,
# location_name -> state, point -> actual.
# NOTE(review): the target values read "N wk ahead cum death", so "point" looks like
# a forecast point estimate rather than an observed value; confirm that "actual"
# is the name the downstream schema expects.
forecast_renamed = forecast_states.rename(
    {
        "target_week_end_date": "target_end_date",
        "location_name": "state",
        "point": "actual",
    },
    axis="columns",
)
forecast_renamed.head(n=20)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
4,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Alabama,152,102.0,325.0
5,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Alabama,243,116.0,676.0
6,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Alabama,339,131.0,1151.0
7,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Alabama,428,142.0,1836.0
8,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Delaware,52,36.0,146.0
9,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Delaware,71,36.0,295.0
10,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Delaware,90,37.0,496.0
11,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Delaware,108,38.0,828.0
12,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,District of Columbia,75,54.0,154.0
13,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,District of Columbia,103,59.0,272.0


In [24]:
# Check whether the date columns are stored as datetimes or as strings (object dtype)
forecast_renamed.dtypes

model               object
forecast_date       object
target              object
target_end_date     object
state               object
actual               int64
quantile_0.025     float64
quantile_0.975     float64
dtype: object

In [28]:
# Parse both date columns (month/day/four-digit year, e.g. "4/13/2020") into datetimes.
# .assign returns a new frame rather than writing columns into the existing object,
# so the cell does not mutate a DataFrame that earlier cells built and displayed.
forecast_renamed = forecast_renamed.assign(
    forecast_date=pd.to_datetime(forecast_renamed["forecast_date"], format="%m/%d/%Y"),
    target_end_date=pd.to_datetime(forecast_renamed["target_end_date"], format="%m/%d/%Y"),
)

In [29]:
# Verify that forecast_date and target_end_date now have a datetime dtype
forecast_renamed.dtypes

model                      object
forecast_date      datetime64[ns]
target                     object
target_end_date    datetime64[ns]
state                      object
actual                      int64
quantile_0.025            float64
quantile_0.975            float64
dtype: object

In [30]:
# Write the cleaned forecast frame to CSV (without the index) so it can be loaded into Postgres.
forecast_renamed.to_csv(
    path_or_buf="forecast.csv",
    index=False,
    encoding="utf-8",
)