In [1]:
import pandas as pd 
import csv  

# Import Kaggle CSV With Cases & Deaths By State #

In [2]:
# Load the Kaggle file holding total cases and deaths by US state and county.
# Source: https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
file = "Raw_Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(filepath_or_buffer=file)
kaggle_df.head(n=5)

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [3]:
# Keep only the rows whose iso2 code is "US"; territories such as Guam and
# the Virgin Islands carry their own iso2 codes and are filtered out here.
is_us_row = kaggle_df["iso2"] == "US"
kaggle_us = kaggle_df[is_us_row]

In [4]:
# Keep only the columns needed downstream. This drops UID, iso2, iso3, code3
# and Country_Region.
county_columns = [
    "FIPS",
    "Admin2",
    "Province_State",
    "Lat",
    "Long_",
    "Combined_Key",
    "Date",
    "Confirmed",
    "Deaths",
]
kaggle_rem_cols = kaggle_us.loc[:, county_columns]

In [5]:
# Give the columns friendlier names:
#   Admin2 -> County, Province_State -> State, Long_ -> Lng
kaggle_renamed = kaggle_rem_cols.rename(
    columns={"Admin2": "County", "Province_State": "State", "Long_": "Lng"}
)
# Spot-check ten random rows. random_state pins the draw so the displayed
# sample is identical on every Restart & Run All.
kaggle_renamed.sample(10, random_state=0)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
248895,29055.0,Crawford,Missouri,37.976637,-91.305281,"Crawford, Missouri, US",4/7/20,3,0
122285,36035.0,Fulton,New York,43.113639,-74.417988,"Fulton, New York, US",2/28/20,0,0
91753,17027.0,Clinton,Illinois,38.607049,-89.423853,"Clinton, Illinois, US",2/19/20,0,0
96809,46097.0,Miner,South Dakota,44.021979,-97.609922,"Miner, South Dakota, US",2/20/20,0,0
71914,8109.0,Saguache,Colorado,38.080546,-106.282466,"Saguache, Colorado, US",2/13/20,0,0
103168,42029.0,Chester,Pennsylvania,39.972918,-75.747684,"Chester, Pennsylvania, US",2/22/20,0,0
9793,1047.0,Dallas,Alabama,32.326881,-87.108667,"Dallas, Alabama, US",1/25/20,0,0
119756,48097.0,Cooke,Texas,33.639796,-97.212409,"Cooke, Texas, US",2/27/20,0,0
86375,31173.0,Thurston,Nebraska,42.157254,-96.541571,"Thurston, Nebraska, US",2/17/20,0,0
126581,51139.0,Page,Virginia,38.618075,-78.487972,"Page, Virginia, US",2/29/20,0,0


In [6]:
# Check for missing values: count() reports the number of non-null entries
# per column, so any column with a lower count than the others has gaps.
kaggle_renamed.count()

FIPS            285648
County          285824
State           286000
Lat             286000
Lng             286000
Combined_Key    286000
Date            286000
Confirmed       286000
Deaths          286000
dtype: int64

In [7]:
# Discard every row that has at least one missing value, then confirm that
# all columns now report the same non-null count.
kaggle_drop = kaggle_renamed.dropna(axis="index", how="any")
kaggle_drop.count()

FIPS            285472
County          285472
State           285472
Lat             285472
Lng             285472
Combined_Key    285472
Date            285472
Confirmed       285472
Deaths          285472
dtype: int64

In [8]:
# Inspect the column dtypes to see whether the Date column holds strings
# (object dtype) or datetime values.
kaggle_drop.dtypes

FIPS            float64
County           object
State            object
Lat             float64
Lng             float64
Combined_Key     object
Date             object
Confirmed         int64
Deaths            int64
dtype: object

In [9]:
# Parse the Date strings (month/day/two-digit year, e.g. "1/22/20") into
# datetime values.
# kaggle_drop was produced by filtering another frame, so assigning a column
# onto it directly can raise SettingWithCopyWarning; assign() builds a fresh
# frame with the converted column instead.
kaggle_drop = kaggle_drop.assign(
    Date=pd.to_datetime(kaggle_drop["Date"], format="%m/%d/%y")
)

In [10]:
# Confirm that the Date column is now a datetime dtype rather than object
kaggle_drop.dtypes

FIPS                   float64
County                  object
State                   object
Lat                    float64
Lng                    float64
Combined_Key            object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
dtype: object

In [11]:
# Remove rows that do not refer to a real county: first those whose County is
# exactly "Unassigned", then those whose County contains the text "Out of".
is_assigned = kaggle_drop["County"] != "Unassigned"
kaggle_final = kaggle_drop[is_assigned]

has_out_of = kaggle_final["County"].str.contains("Out of")
kaggle_final2 = kaggle_final[~has_out_of]
kaggle_final2.tail(10)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
286322,56027.0,Niobrara,Wyoming,43.056077,-104.47589,"Niobrara, Wyoming, US",2020-04-18,1,0
286323,56029.0,Park,Wyoming,44.521575,-109.585282,"Park, Wyoming, US",2020-04-18,1,0
286324,56031.0,Platte,Wyoming,42.132991,-104.966331,"Platte, Wyoming, US",2020-04-18,0,0
286325,56033.0,Sheridan,Wyoming,44.790489,-106.886239,"Sheridan, Wyoming, US",2020-04-18,12,0
286326,56035.0,Sublette,Wyoming,42.765583,-109.913092,"Sublette, Wyoming, US",2020-04-18,1,0
286327,56037.0,Sweetwater,Wyoming,41.659439,-108.882788,"Sweetwater, Wyoming, US",2020-04-18,10,0
286328,56039.0,Teton,Wyoming,43.935225,-110.58908,"Teton, Wyoming, US",2020-04-18,62,0
286329,56041.0,Uinta,Wyoming,41.287818,-110.547578,"Uinta, Wyoming, US",2020-04-18,6,0
286330,56043.0,Washakie,Wyoming,43.904516,-107.680187,"Washakie, Wyoming, US",2020-04-18,5,0
286331,56045.0,Weston,Wyoming,43.839612,-104.567488,"Weston, Wyoming, US",2020-04-18,0,0


In [23]:
# Renumber the rows from 0 (discarding the old, now gappy index) so the index
# can serve as the primary key of the county table.
kaggle_final3 = (
    kaggle_final2
    .reset_index(drop=True)
)
kaggle_final3.head(n=5)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
0,1001.0,Autauga,Alabama,32.539527,-86.644082,"Autauga, Alabama, US",2020-01-22,0,0
1,1003.0,Baldwin,Alabama,30.72775,-87.722071,"Baldwin, Alabama, US",2020-01-22,0,0
2,1005.0,Barbour,Alabama,31.868263,-85.387129,"Barbour, Alabama, US",2020-01-22,0,0
3,1007.0,Bibb,Alabama,32.996421,-87.125115,"Bibb, Alabama, US",2020-01-22,0,0
4,1009.0,Blount,Alabama,33.982109,-86.567906,"Blount, Alabama, US",2020-01-22,0,0


In [25]:
# Write the cleaned county data to CSV for import into Postgres. The index is
# written as the first column so it can act as the primary key.
# NOTE(review): the index has no name, so its header cell is written empty --
# confirm the Postgres import handles that.
kaggle_final3.to_csv(
    "county_data.csv",
    index=True,
    encoding="utf-8",
)

# Import CDC CSV With COVID Forecasts By State #

In [26]:
# Load the CDC file with forecasts of potential deaths by state.
# Source: https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
file2 = "Raw_Data/forecast_data_0420.csv"
forecast_raw = pd.read_csv(filepath_or_buffer=file2)
forecast_raw.tail(n=5)

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1640,YYG,4/20/2020,4 wk ahead cum death,5/16/2020,West Virginia,75,54.0,110.0
1641,YYG,4/20/2020,1 wk ahead cum death,4/25/2020,Wyoming,4,4.0,4.0
1642,YYG,4/20/2020,2 wk ahead cum death,5/2/2020,Wyoming,8,7.0,9.0
1643,YYG,4/20/2020,3 wk ahead cum death,5/9/2020,Wyoming,11,10.0,14.0
1644,YYG,4/20/2020,4 wk ahead cum death,5/16/2020,Wyoming,15,13.0,19.0


In [27]:
# Keep the per-state forecasts only; rows whose location_name is "US" are the
# national totals and are excluded.
is_state_row = forecast_raw["location_name"] != "US"
forecast_states = forecast_raw[is_state_row]
forecast_states.head(n=5)

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alabama,184,157.0,226.0
2,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alaska,15,15.0,18.0
3,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arizona,265,222.0,325.0
4,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arkansas,50,42.0,62.0
5,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,California,2012,1792.0,2520.0


In [28]:
# Discard every forecast row that has at least one missing value, then
# confirm that all columns report the same non-null count.
forecast_states = forecast_states.dropna(axis="index", how="any")
forecast_states.count()

model                   1620
forecast_date           1620
target                  1620
target_week_end_date    1620
location_name           1620
point                   1620
quantile_0.025          1620
quantile_0.975          1620
dtype: int64

In [30]:
# Shorten the column names:
#   target_week_end_date -> target_end_date
#   location_name        -> state
#   point                -> actual
# NOTE(review): "point" sits beside the quantile columns and looks like the
# model's point forecast rather than an observed value -- confirm that
# "actual" is the intended name.
forecast_column_names = {
    "target_week_end_date": "target_end_date",
    "location_name": "state",
    "point": "actual",
}
forecast_renamed = forecast_states.rename(columns=forecast_column_names)
forecast_renamed.head(10)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alabama,184,157.0,226.0
2,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Alaska,15,15.0,18.0
3,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arizona,265,222.0,325.0
4,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Arkansas,50,42.0,62.0
5,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,California,2012,1792.0,2520.0
6,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Colorado,585,525.0,650.0
7,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Connecticut,2153,1748.0,2483.0
8,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Delaware,232,175.0,306.0
9,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Florida,1419,1162.0,1715.0
10,CU 40% contact reduction,4/20/2020,1 wk ahead cum death,4/25/2020,Georgia,927,843.0,1026.0


In [31]:
# Inspect the column dtypes to see whether forecast_date and target_end_date
# hold strings (object dtype) or datetime values.
forecast_renamed.dtypes

model               object
forecast_date       object
target              object
target_end_date     object
state               object
actual               int64
quantile_0.025     float64
quantile_0.975     float64
dtype: object

In [32]:
# Parse both date columns from month/day/four-digit-year strings
# (e.g. "4/20/2020") into datetime values.
for date_col in ("forecast_date", "target_end_date"):
    forecast_renamed[date_col] = pd.to_datetime(
        forecast_renamed[date_col], format="%m/%d/%Y"
    )
forecast_renamed.tail(n=5)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1640,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,West Virginia,75,54.0,110.0
1641,YYG,2020-04-20,1 wk ahead cum death,2020-04-25,Wyoming,4,4.0,4.0
1642,YYG,2020-04-20,2 wk ahead cum death,2020-05-02,Wyoming,8,7.0,9.0
1643,YYG,2020-04-20,3 wk ahead cum death,2020-05-09,Wyoming,11,10.0,14.0
1644,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,Wyoming,15,13.0,19.0


In [33]:
# Verify that forecast_date and target_end_date are now datetime dtypes
forecast_renamed.dtypes

model                      object
forecast_date      datetime64[ns]
target                     object
target_end_date    datetime64[ns]
state                      object
actual                      int64
quantile_0.025            float64
quantile_0.975            float64
dtype: object

In [34]:
# Renumber the rows from 0 (discarding the old, now gappy index) so the index
# can serve as the primary key of the forecast table.
forecast_final = (
    forecast_renamed
    .reset_index(drop=True)
)
forecast_final.tail(n=5)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
1615,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,West Virginia,75,54.0,110.0
1616,YYG,2020-04-20,1 wk ahead cum death,2020-04-25,Wyoming,4,4.0,4.0
1617,YYG,2020-04-20,2 wk ahead cum death,2020-05-02,Wyoming,8,7.0,9.0
1618,YYG,2020-04-20,3 wk ahead cum death,2020-05-09,Wyoming,11,10.0,14.0
1619,YYG,2020-04-20,4 wk ahead cum death,2020-05-16,Wyoming,15,13.0,19.0


In [35]:
# Write the cleaned forecast data to CSV for import into Postgres. The index
# is written as the first column so it can act as the primary key.
# NOTE(review): the index has no name, so its header cell is written empty --
# confirm the Postgres import handles that.
forecast_final.to_csv(
    "forecast_cdc.csv",
    index=True,
    encoding="utf-8",
)