In [2]:
import pandas as pd 
import csv  

# Import Kaggle CSV With Cases & Deaths By State #

In [3]:
# Import Kaggle csv file with total cases and deaths by US state and county
# Data was downloaded from https://www.kaggle.com/imdevskp/corona-virus-report#usa_county_wise.csv
file = "Raw_Data/usa_county_wise.csv"
kaggle_df = pd.read_csv(file)
kaggle_df.head()

Unnamed: 0,UID,iso2,iso3,code3,FIPS,Admin2,Province_State,Country_Region,Lat,Long_,Combined_Key,Date,Confirmed,Deaths
0,16,AS,ASM,16,60.0,,American Samoa,US,-14.271,-170.132,"American Samoa, US",1/22/20,0,0
1,316,GU,GUM,316,66.0,,Guam,US,13.4443,144.7937,"Guam, US",1/22/20,0,0
2,580,MP,MNP,580,69.0,,Northern Mariana Islands,US,15.0979,145.6739,"Northern Mariana Islands, US",1/22/20,0,0
3,630,PR,PRI,630,72.0,,Puerto Rico,US,18.2208,-66.5901,"Puerto Rico, US",1/22/20,0,0
4,850,VI,VIR,850,78.0,,Virgin Islands,US,18.3358,-64.8963,"Virgin Islands, US",1/22/20,0,0


In [4]:
# Only keep rows for US states (i.e. remove Guam, Virgin Islands, etc.)
kaggle_us = kaggle_df.loc[kaggle_df["iso2"] == "US",:]

In [5]:
# Remove columns not needed (UID, iso2(country/territory), iso3(country/territory), & code3)
kaggle_rem_cols = kaggle_us[["FIPS", "Admin2","Province_State", "Lat", "Long_", "Combined_Key", "Date", "Confirmed", "Deaths"]]

In [6]:
# Rename columns (Admin 2 to County, Province_State to State, Long_ to Lng)
kaggle_renamed = kaggle_rem_cols.rename(columns={"Admin2":"County", "Province_State":"State", "Long_":"Lng"})
kaggle_renamed.sample(10)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
121105,17141.0,Ogle,Illinois,42.041652,-89.321847,"Ogle, Illinois, US",2/28/20,0,0
95507,21229.0,Washington,Kentucky,37.755995,-85.174077,"Washington, Kentucky, US",2/20/20,0,0
44728,46095.0,Mellette,South Dakota,43.580788,-100.756319,"Mellette, South Dakota, US",2/4/20,0,0
212525,20115.0,Marion,Kansas,38.358554,-97.098525,"Marion, Kansas, US",3/27/20,0,0
156975,18067.0,Howard,Indiana,40.485277,-86.113519,"Howard, Indiana, US",3/10/20,0,0
193637,29211.0,Sullivan,Missouri,40.20991,-93.111501,"Sullivan, Missouri, US",3/21/20,0,0
164989,41053.0,Polk,Oregon,44.903228,-123.412889,"Polk, Oregon, US",3/12/20,1,0
145272,39009.0,Athens,Ohio,39.334256,-82.042786,"Athens, Ohio, US",3/6/20,0,0
276615,80051.0,Out of VA,Virginia,0.0,0.0,"Out of VA, Virginia, US",4/15/20,0,0
262373,37157.0,Rockingham,North Carolina,36.395931,-79.775048,"Rockingham, North Carolina, US",4/11/20,14,2


In [7]:
# Check for missing values
kaggle_renamed.count()

FIPS            285648
County          285824
State           286000
Lat             286000
Lng             286000
Combined_Key    286000
Date            286000
Confirmed       286000
Deaths          286000
dtype: int64

In [8]:
# Drop rows with missing data
kaggle_drop = kaggle_renamed.dropna()
kaggle_drop.count()

FIPS            285472
County          285472
State           285472
Lat             285472
Lng             285472
Combined_Key    285472
Date            285472
Confirmed       285472
Deaths          285472
dtype: int64

In [9]:
# Check if date column date are strings or date objects
kaggle_drop.dtypes

FIPS            float64
County           object
State            object
Lat             float64
Lng             float64
Combined_Key     object
Date             object
Confirmed         int64
Deaths            int64
dtype: object

In [10]:
# Change dates from strings to datetime objects
kaggle_drop['Date'] = pd.to_datetime(kaggle_drop['Date'],format='%m/%d/%y')

In [11]:
# Check that column type changed
kaggle_drop.dtypes

FIPS                   float64
County                  object
State                   object
Lat                    float64
Lng                    float64
Combined_Key            object
Date            datetime64[ns]
Confirmed                int64
Deaths                   int64
dtype: object

In [12]:
kaggle_drop.reset_index(drop=True)
kaggle_drop.tail()

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
286432,90051.0,Unassigned,Virginia,0.0,0.0,"Unassigned, Virginia, US",2020-04-18,0,94
286433,90053.0,Unassigned,Washington,0.0,0.0,"Unassigned, Washington, US",2020-04-18,444,0
286434,90054.0,Unassigned,West Virginia,0.0,0.0,"Unassigned, West Virginia, US",2020-04-18,0,9
286435,90055.0,Unassigned,Wisconsin,0.0,0.0,"Unassigned, Wisconsin, US",2020-04-18,0,0
286436,90056.0,Unassigned,Wyoming,0.0,0.0,"Unassigned, Wyoming, US",2020-04-18,0,1


In [23]:
# Remove rows without a county name
kaggle_final = kaggle_drop.loc[kaggle_drop["County"] != "Unassigned",:]
kaggle_final2 = kaggle_final[~kaggle_final["County"].str.contains("Out of")]
kaggle_final2.tail(15)

Unnamed: 0,FIPS,County,State,Lat,Lng,Combined_Key,Date,Confirmed,Deaths
286317,56017.0,Hot Springs,Wyoming,43.719307,-108.442317,"Hot Springs, Wyoming, US",2020-04-18,1,0
286318,56019.0,Johnson,Wyoming,44.040572,-106.584517,"Johnson, Wyoming, US",2020-04-18,11,1
286319,56021.0,Laramie,Wyoming,41.307025,-104.68875,"Laramie, Wyoming, US",2020-04-18,70,0
286320,56023.0,Lincoln,Wyoming,42.263764,-110.6564,"Lincoln, Wyoming, US",2020-04-18,5,0
286321,56025.0,Natrona,Wyoming,42.961801,-106.797885,"Natrona, Wyoming, US",2020-04-18,38,0
286322,56027.0,Niobrara,Wyoming,43.056077,-104.47589,"Niobrara, Wyoming, US",2020-04-18,1,0
286323,56029.0,Park,Wyoming,44.521575,-109.585282,"Park, Wyoming, US",2020-04-18,1,0
286324,56031.0,Platte,Wyoming,42.132991,-104.966331,"Platte, Wyoming, US",2020-04-18,0,0
286325,56033.0,Sheridan,Wyoming,44.790489,-106.886239,"Sheridan, Wyoming, US",2020-04-18,12,0
286326,56035.0,Sublette,Wyoming,42.765583,-109.913092,"Sublette, Wyoming, US",2020-04-18,1,0


In [25]:
# Export kaggle df as a csv so it can be imported to postgres
kaggle_final2.to_csv("county_data.csv", encoding="utf-8", index=True)

# Import CDC CSV With COVID Forecasts By State #

In [14]:
# Import csv file with forecast of potential deaths by state
# Data was downloaded from https://www.cdc.gov/coronavirus/2019-ncov/covid-data/forecasting-us.html
file2 = "Raw_Data/forecast_data_0413.csv"
forecast_raw = pd.read_csv(file2)
forecast_raw.tail()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
1325,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Washington,939,,
1326,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wisconsin,199,,
1327,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,West Virginia,13,,
1328,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,Wyoming,15,,
1329,MOBS,4/13/2020,2 wk ahead cum death,4/25/2020,US,46548,,


In [15]:
# Only keep rows for states, not whole of US
forecast_states = forecast_raw.loc[forecast_raw["location_name"] != "US",:]
forecast_states.head()

Unnamed: 0,model,forecast_date,target,target_week_end_date,location_name,point,quantile_0.025,quantile_0.975
4,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Alabama,152,102.0,325.0
5,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Alabama,243,116.0,676.0
6,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Alabama,339,131.0,1151.0
7,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Alabama,428,142.0,1836.0
8,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Delaware,52,36.0,146.0


In [16]:
# Drop rows with missing data
forecast_states = forecast_states.dropna()
forecast_states.count()

model                   1220
forecast_date           1220
target                  1220
target_week_end_date    1220
location_name           1220
point                   1220
quantile_0.025          1220
quantile_0.975          1220
dtype: int64

In [17]:
# Rename columns (target week end date to target end date, location name to state, point to actual)
forecast_renamed = forecast_states.rename(columns={"target_week_end_date":"target_end_date", "location_name":"state", "point":"actual"})
forecast_renamed.head(20)

Unnamed: 0,model,forecast_date,target,target_end_date,state,actual,quantile_0.025,quantile_0.975
4,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Alabama,152,102.0,325.0
5,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Alabama,243,116.0,676.0
6,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Alabama,339,131.0,1151.0
7,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Alabama,428,142.0,1836.0
8,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,Delaware,52,36.0,146.0
9,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,Delaware,71,36.0,295.0
10,ensemble forecast,4/13/2020,3 wk ahead cum death,5/2/2020,Delaware,90,37.0,496.0
11,ensemble forecast,4/13/2020,4 wk ahead cum death,5/9/2020,Delaware,108,38.0,828.0
12,ensemble forecast,4/13/2020,1 wk ahead cum death,4/18/2020,District of Columbia,75,54.0,154.0
13,ensemble forecast,4/13/2020,2 wk ahead cum death,4/25/2020,District of Columbia,103,59.0,272.0


In [18]:
# Check if date column types are datetime or string 
forecast_renamed.dtypes

model               object
forecast_date       object
target              object
target_end_date     object
state               object
actual               int64
quantile_0.025     float64
quantile_0.975     float64
dtype: object

In [19]:
# Change dates from strings to datetime objects
forecast_renamed['forecast_date'] = pd.to_datetime(forecast_renamed['forecast_date'],format='%m/%d/%Y')
forecast_renamed['target_end_date'] = pd.to_datetime(forecast_renamed['target_end_date'],format='%m/%d/%Y')

In [20]:
# Verify columns changed to datetime
forecast_renamed.dtypes

model                      object
forecast_date      datetime64[ns]
target                     object
target_end_date    datetime64[ns]
state                      object
actual                      int64
quantile_0.025            float64
quantile_0.975            float64
dtype: object

In [21]:
# Export forecast df as a csv so it can be imported to postgres
forecast_renamed.to_csv("forecast.csv", encoding="utf-8", index=False)