## Set up

In [2]:
import pandas as pd
from sqlalchemy import create_engine
import json
import requests

## Get NYT county COVID data

In [2]:
# -----------------------
# This is a NYTimes report on national COVID-19 cases and deaths, sorted by county
# From: https://github.com/nytimes/covid-19-data (us-counties.csv)
# -----------------------

# Pull in the live data
counties_path = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"

# Save to a dataframe
covid_all_df = pd.read_csv(counties_path)

#### Build all values dataframe

In [3]:
# Drop rows with null values (we want complete data only)
covid_all_df = covid_all_df.dropna()

# Convert fips to int
covid_all_df = covid_all_df.astype({'fips': 'int'})

In [4]:
# Change the null values (no data) to 0
covid_all_df = covid_all_df.fillna(0)

In [5]:
# Display the dataframe
#covid_all_df

#### Build latest values dataframe

In [6]:
# Get the most up-to-date data
latest_date = covid_all_df['date'].iloc[-1]

# Make a dataframe holding that most recent data
covid_latest_df = covid_all_df.loc[covid_all_df['date']==latest_date]

In [7]:
# Change the null values (no data) to 0
covid_latest_df = covid_latest_df.fillna(0)

In [8]:
# Display the dataframe
#covid_latest_df

## Get mask use data

In [9]:
# Import the mask CSV data
mask_path = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv"
masks_df = pd.read_csv(mask_path)

In [10]:
# Clean up column names
masks_df.columns = ['fips','never','rarely','sometimes','frequently','always']

In [11]:
# Change the null values (no data) to 0
masks_df = masks_df.fillna(0)

In [49]:
# Display the dataframe
masks_df

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.120,0.201,0.491
3,1007,0.020,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.180,0.194,0.459
...,...,...,...,...,...,...
3137,56037,0.061,0.295,0.230,0.146,0.268
3138,56039,0.095,0.157,0.160,0.247,0.340
3139,56041,0.098,0.278,0.154,0.207,0.264
3140,56043,0.204,0.155,0.069,0.285,0.287


## Get CDC data
#### Source: https://data.cdc.gov/NCHS/Weekly-Counts-of-Deaths-by-State-and-Select-Causes/muzy-jte6

In [13]:
cdc_url = 'https://data.cdc.gov/resource/muzy-jte6.json'
response = requests.get(cdc_url)
cdc_json = response.json()

# Convert to a dataframe
cdc_df = pd.DataFrame.from_dict(cdc_json)

In [14]:
# Rename columns
cdc_df.columns = ['state',
                  'year',
                  'week',
                  'week_ending_date',
                  'all_causes',
                  'natural_causes',
                  'septicemia',
                  'malignant_neoplasms',
                  'diabetes',
                  'alzheimers',
                  'influenza_and_pneumonia',
                  'chronic_lower_respiratory',
                  'other_diseases_of_respiratory',
                  'nephritis_nephrotic_syndrome',
                  'symptoms_signs_and_abnormal',
                  'diseases_of_heart',
                  'cerebrovascular_diseases',
                  'covid_19_multiple_causes',
                  'covid_19_underlying_cause',
                  'flag_otherresp',
                  'flag_otherunk',
                  'flag_nephr',
                  'flag_inflpn',
                  'flag_cov19mcod',
                  'flag_cov19ucod',
                  'flag_sept',
                  'flag_diab',
                  'flag_alz',
                  'flag_clrd',
                  'flag_stroke',
                  'flag_hd',
                  'flag_neopl',
                  'flag_allcause',
                  'flag_natcause']     

In [15]:
# Change the null values (no data) to 0
cdc_df = cdc_df.fillna(0)

In [16]:
# Display the dataframe
#cdc_df

In [17]:
# Build 2019 and 2020 dataframes, just in case
cdc_2019_df = cdc_df.loc[cdc_df['year']=="2019"]
cdc_2020_df = cdc_df.loc[cdc_df['year']=="2020"]

## Get Covid Tracking Project data
Source: https://covidtracking.com

#### Build latest values dataframe

In [45]:
covidtracking_current_url = 'https://api.covidtracking.com/v1/states/current.json'
covidtracking_current_response = requests.get(covidtracking_current_url)
covidtracking_current_json = covidtracking_current_response.json()

# Convert to a dataframe
covidtracking_current_df = pd.DataFrame.from_dict(covidtracking_current_json)

In [47]:
# Change the null values (no data) to 0
covidtracking_current_df = covidtracking_current_df.fillna(0)

In [48]:
# Display the dataframe
covidtracking_current_df

Unnamed: 0,date,state,positive,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,20201001,AK,8912,453411,0.0,462323,42.0,0.0,0.0,0.0,...,462323,1,0,6fce5edde2805fba8ccfabc0232b39bb079a0c0a,0,0,0,0,0,
1,20201001,AL,155744,1000570,0.0,1138732,760.0,17257.0,0.0,1832.0,...,1156314,8,0,d7277459136bf4c2075ceaea9cd64c6a193a9e80,0,0,0,0,0,
2,20201001,AR,84821,961085,0.0,1042616,479.0,5445.0,226.0,0.0,...,1045906,15,91,7c4e2564ec03b53f01b8413cc9f5dfcaa6211a9c,0,0,0,0,0,
3,20201001,AS,0,1571,0.0,1571,0.0,0.0,0.0,0.0,...,1571,0,0,477353724ddacdac0dea1688c06ac98c7c0f2e00,0,0,0,0,0,
4,20201001,AZ,219212,1256760,0.0,1471368,620.0,22226.0,122.0,0.0,...,1475972,24,107,27ad4565e95a54e45eed8d16d78c44e8446df774,0,0,0,0,0,
5,20201001,CA,813687,13958164,0.0,14771851,3205.0,0.0,817.0,0.0,...,14771851,96,0,1b11c154c294a0a1d2bfcf1ae44b2dc18b5233c0,0,0,0,0,0,
6,20201001,CO,70536,845848,0.0,1357159,264.0,7568.0,0.0,0.0,...,916384,8,10,72afec34fd7cf9b6af572a9ad0ee7ac4b2fcb4c0,0,0,0,0,0,
7,20201001,CT,57742,1546638,0.0,1604380,107.0,11699.0,0.0,0.0,...,1604380,3,139,079df2f369d017d641f44f0fe7e92368af9ae8e2,0,0,0,0,0,
8,20201001,DC,15358,374822,0.0,390180,98.0,0.0,28.0,0.0,...,390180,1,0,c8dacefc0e22fc03ffc3437fa14e6dc67eea3a3b,0,0,0,0,0,
9,20201001,DE,20787,267430,0.0,288217,78.0,0.0,17.0,0.0,...,288217,0,0,34dfccfaab56176001a9473d40b2dd8a734724b6,0,0,0,0,0,


#### Build latest values dataframe

In [21]:
covidtracking_all_url = 'https://api.covidtracking.com/v1/us/daily.json'
covidtracking_all_response = requests.get(covidtracking_all_url)
covidtracking_all_json = covidtracking_all_response.json()

# Convert to a dataframe
covidtracking_all_df = pd.DataFrame.from_dict(covidtracking_all_json)

In [22]:
# Change the null values (no data) to 0
covidtracking_all_df = covidtracking_all_df.fillna(0)

In [23]:
# Display the dataframe
#covidtracking_all_df

## Get the county information table

In [24]:
# Scrape the county info table from Wikipedia
county_url = 'https://en.wikipedia.org/wiki/User:Michael_J/County_table'
county_table = pd.read_html(county_url)

In [25]:
# Grab the first table on the page and convert to dataframe
county_table_df = county_table[0]

# Drop columns we don't need
county_table_df = county_table_df.drop(columns=['Land Areakm²','Land Areami²','Water Areakm²','Water Areami²','Total Areakm²','Total Areami²','Sort [1]','Population(2010)'])

# Rename the columns
county_table_df.columns = ['state','fips','county','county_seat','lat','lon']

# Remove the + sign from latitude column
county_table_df['lat'] = county_table_df['lat'].str[1:]
county_table_df['lon'] = county_table_df['lon'].str[1:]

# Remove degree symbol from lat and lon
county_table_df['lat'] = county_table_df['lat'].str[:-1]
county_table_df['lon'] = county_table_df['lon'].str[:-1]

# Set as a float for lat and lon
county_table_df['lat'] = county_table_df['lat'].astype(float)
county_table_df['lon'] = county_table_df['lon'].astype(float)

# Convert the lon to an actual negative value (for all, since North America)
county_table_df['lon'] = -county_table_df['lon']

In [26]:
# Change the null values (no data) to 0
county_table_df = county_table_df.fillna(0)

In [42]:
# Display the dataframe
#county_table_df

## Connect and load to postgressql
### Available dataframes:
<b>NYT:</b> covid_all_df, covid_latest_df <br>
<b>Masks:</b> masks_df <br>
<b>CDC:</b> cdc_df, cdc_2019_df, cdc_2020_df <br>
<b>Covid tracking:</b> covidtracking_current_df, covidtracking_all_df<br>
<b>County info:</b> county_table_df

In [3]:
# Connect to local database
# Make sure you fill out the user / password for MySQL
rds_connection_string = "covid_db_admin:pass123@localhost:5432/covid_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [4]:
# Check for Tables
engine.table_names()

['covid',
 'masks',
 'cdc',
 'covidtracking_current',
 'covidtracking_all',
 'county',
 'test']

### Load data into database

In [30]:
# Load covid_all_df dataframe into database
covid_all_df.to_sql(name='covid', con=engine, if_exists='append', index=False)

In [31]:
# Load mask_df dataframe into database
masks_df.to_sql(name='masks', con=engine, if_exists='append', index=False)

In [32]:
# Load cdc_df dataframe into database
cdc_df.to_sql(name='cdc', con=engine, if_exists='append', index=False)

In [33]:
# Not yet working
#covidtracking_current_df.to_sql(name='covidtracking_current', con=engine, if_exists='append', index=False)

In [34]:
# Not yet working
#covidtracking_all_df.to_sql(name='covidtracking_all', con=engine, if_exists='append', index=False)

In [35]:
# Load county_table_df dataframe into database
county_table_df.to_sql(name='county', con=engine, if_exists='append', index=False)

### Confirm data load

In [36]:
# Confirm covid data has been added
pd.read_sql_query('select * from covid', con=engine)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
580653,2020-09-30,Sweetwater,Wyoming,56037,339,2
580654,2020-09-30,Teton,Wyoming,56039,577,1
580655,2020-09-30,Uinta,Wyoming,56041,357,2
580656,2020-09-30,Washakie,Wyoming,56043,115,6


In [37]:
# Confirm mask use data has been added
pd.read_sql_query('select * from masks', con=engine)

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.120,0.201,0.491
3,1007,0.020,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.180,0.194,0.459
...,...,...,...,...,...,...
3137,56037,0.061,0.295,0.230,0.146,0.268
3138,56039,0.095,0.157,0.160,0.247,0.340
3139,56041,0.098,0.278,0.154,0.207,0.264
3140,56043,0.204,0.155,0.069,0.285,0.287


In [38]:
# Confirm cdc data has been added
pd.read_sql_query('select * from cdc', con=engine)

Unnamed: 0,state,year,week,week_ending_date,all_causes,natural_causes,septicemia,malignant_neoplasms,diabetes,alzheimers,...,flag_cov19ucod,flag_sept,flag_diab,flag_alz,flag_clrd,flag_stroke,flag_hd,flag_neopl,flag_allcause,flag_natcause
0,Alabama,2019,1,1/5/2019,1077,993,30,198,22,60,...,0,0,0,0,0,0,0,0,0,0
1,Alabama,2019,2,1/12/2019,1090,994,25,187,24,49,...,0,0,0,0,0,0,0,0,0,0
2,Alabama,2019,3,1/19/2019,1114,1042,22,238,18,48,...,0,0,0,0,0,0,0,0,0,0
3,Alabama,2019,4,1/26/2019,1063,994,21,165,22,50,...,0,0,0,0,0,0,0,0,0,0
4,Alabama,2019,5,2/2/2019,1095,1026,18,199,19,52,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Hawaii,2019,6,2/9/2019,220,195,0,48,0,0,...,0,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),0,0,0,0,0,0
996,Hawaii,2019,7,2/16/2019,257,236,0,50,11,10,...,0,Suppressed (counts 1-9),0,0,0,0,0,0,0,0
997,Hawaii,2019,8,2/23/2019,236,211,0,45,0,12,...,0,Suppressed (counts 1-9),Suppressed (counts 1-9),0,Suppressed (counts 1-9),0,0,0,0,0
998,Hawaii,2019,9,3/2/2019,241,224,0,41,0,0,...,0,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),0,0,0,0,0,0


In [39]:
# Confirm county data has been added
pd.read_sql_query('select * from county', con=engine)

Unnamed: 0,state,fips,county,county_seat,lat,lon
0,AL,1001,Autauga,Prattville,32.536382,-86.644490
1,AL,1003,Baldwin,Bay Minette,30.659218,-87.746067
2,AL,1005,Barbour,Clayton,31.870670,-85.405456
3,AL,1007,Bibb,Centreville,33.015893,-87.127148
4,AL,1009,Blount,Oneonta,33.977448,-86.567246
...,...,...,...,...,...,...
3138,WY,56037,Sweetwater,Green River,41.660339,-108.875676
3139,WY,56039,Teton,Jackson,44.049321,-110.588102
3140,WY,56041,Uinta,Evanston,41.284726,-110.558947
3141,WY,56043,Washakie,Worland,43.878831,-107.669052


In [5]:
pd.read_sql_query('select * from test', con=engine)

Unnamed: 0,date,county,state,cases,deaths,fips,never,rarely,sometimes,frequently,always,county_seat,lat,lon
0,2020-01-21,Snohomish,Washington,1,0,53061,0.017,0.014,0.056,0.191,0.721,Everett,48.054913,-121.766412
1,2020-01-22,Snohomish,Washington,1,0,53061,0.017,0.014,0.056,0.191,0.721,Everett,48.054913,-121.766412
2,2020-01-23,Snohomish,Washington,1,0,53061,0.017,0.014,0.056,0.191,0.721,Everett,48.054913,-121.766412
3,2020-01-24,Cook,Illinois,1,0,17031,0.023,0.021,0.072,0.162,0.722,Chicago,41.894294,-87.645455
4,2020-01-24,Snohomish,Washington,1,0,53061,0.017,0.014,0.056,0.191,0.721,Everett,48.054913,-121.766412
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
568286,2020-09-30,Sweetwater,Wyoming,339,2,56037,0.061,0.295,0.230,0.146,0.268,Green River,41.660339,-108.875676
568287,2020-09-30,Teton,Wyoming,577,1,56039,0.095,0.157,0.160,0.247,0.340,Jackson,44.049321,-110.588102
568288,2020-09-30,Uinta,Wyoming,357,2,56041,0.098,0.278,0.154,0.207,0.264,Evanston,41.284726,-110.558947
568289,2020-09-30,Washakie,Wyoming,115,6,56043,0.204,0.155,0.069,0.285,0.287,Worland,43.878831,-107.669052
