## Set up

In [1]:
import pandas as pd
from sqlalchemy import create_engine
import json
import requests

## Get NYT county COVID data

In [2]:
# -----------------------
# This is a NYTimes report on national COVID-19 cases and deaths, sorted by county
# From: https://github.com/nytimes/covid-19-data (us-counties.csv)
# -----------------------

# Pull in the live data
counties_path = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv"

# Save to a dataframe
covid_all_df = pd.read_csv(counties_path)

#### Build all values dataframe

In [3]:
# Drop rows with null values (we want complete data only)
covid_all_df = covid_all_df.dropna()

# Convert fips to int
covid_all_df = covid_all_df.astype({'fips': 'int'})

In [4]:
covid_all_df

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
570091,2020-09-25,Sweetwater,Wyoming,56037,332,2
570092,2020-09-25,Teton,Wyoming,56039,532,1
570093,2020-09-25,Uinta,Wyoming,56041,348,2
570094,2020-09-25,Washakie,Wyoming,56043,113,6


#### Build latest values dataframe

In [5]:
# Get the most up-to-date data
latest_date = covid_all_df['date'].iloc[-1]

# Make a dataframe holding that most recent data
covid_latest_df = covid_all_df.loc[covid_all_df['date']==latest_date]

In [6]:
covid_latest_df

Unnamed: 0,date,county,state,fips,cases,deaths
566855,2020-09-25,Autauga,Alabama,1001,1757,25
566856,2020-09-25,Baldwin,Alabama,1003,5456,50
566857,2020-09-25,Barbour,Alabama,1005,873,7
566858,2020-09-25,Bibb,Alabama,1007,652,10
566859,2020-09-25,Blount,Alabama,1009,1608,15
...,...,...,...,...,...,...
570091,2020-09-25,Sweetwater,Wyoming,56037,332,2
570092,2020-09-25,Teton,Wyoming,56039,532,1
570093,2020-09-25,Uinta,Wyoming,56041,348,2
570094,2020-09-25,Washakie,Wyoming,56043,113,6


## Get mask use data

In [7]:
# Import the mask CSV data
mask_path = "https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv"
masks_df = pd.read_csv(mask_path)

In [8]:
# Clean up column names
masks_df.columns = ['fips','never','rarely','sometimes','frequently','always']

In [9]:
# Display the dataframe
masks_df

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.120,0.201,0.491
3,1007,0.020,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.180,0.194,0.459
...,...,...,...,...,...,...
3137,56037,0.061,0.295,0.230,0.146,0.268
3138,56039,0.095,0.157,0.160,0.247,0.340
3139,56041,0.098,0.278,0.154,0.207,0.264
3140,56043,0.204,0.155,0.069,0.285,0.287


## Get CDC data
#### Source: https://data.cdc.gov/NCHS/Weekly-Counts-of-Deaths-by-State-and-Select-Causes/muzy-jte6

In [23]:
cdc_url = 'https://data.cdc.gov/resource/muzy-jte6.json'
response = requests.get(cdc_url)
cdc_json = response.json()
cdc_df = pd.DataFrame.from_dict(cdc_json)

In [25]:
cdc_df.columns = ['state',
                  'year',
                  'week',
                  'week_ending_date',
                  'all_causes',
                  'natural_causes',
                  'septicemia',
                  'malignant_neoplasms',
                  'diabetes',
                  'alzheimers',
                  'influenza_and_pneumonia',
                  'chronic_lower_respiratory',
                  'other_diseases_of_respiratory',
                  'nephritis_nephrotic_syndrome',
                  'symptoms_signs_and_abnormal',
                  'diseases_of_heart',
                  'cerebrovascular_diseases',
                  'covid_19_multiple_causes',
                  'covid_19_underlying_cause',
                  'flag_otherresp',
                  'flag_otherunk',
                  'flag_nephr',
                  'flag_inflpn',
                  'flag_cov19mcod',
                  'flag_cov19ucod',
                  'flag_sept',
                  'flag_diab',
                  'flag_alz',
                  'flag_clrd',
                  'flag_stroke',
                  'flag_hd',
                  'flag_neopl',
                  'flag_allcause',
                  'flag_natcause']     

In [26]:
cdc_df

Unnamed: 0,state,year,week,week_ending_date,all_causes,natural_causes,septicemia,malignant_neoplasms,diabetes,alzheimers,...,flag_cov19ucod,flag_sept,flag_diab,flag_alz,flag_clrd,flag_stroke,flag_hd,flag_neopl,flag_allcause,flag_natcause
0,Alabama,2019,1,2019-01-05,1077,993,30,198,22,60,...,,,,,,,,,,
1,Alabama,2019,2,2019-01-12,1090,994,25,187,24,49,...,,,,,,,,,,
2,Alabama,2019,3,2019-01-19,1114,1042,22,238,18,48,...,,,,,,,,,,
3,Alabama,2019,4,2019-01-26,1063,994,21,165,22,50,...,,,,,,,,,,
4,Alabama,2019,5,2019-02-02,1095,1026,18,199,19,52,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Hawaii,2019,17,2019-04-27,221,211,,49,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
996,Hawaii,2019,18,2019-05-04,209,190,,37,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
997,Hawaii,2019,19,2019-05-11,246,232,,52,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
998,Hawaii,2019,20,2019-05-18,233,213,,53,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,


In [13]:
# Build 2019 and 2020 dataframes, just in case
cdc_2019_df = cdc_df.loc[cdc_df['mmwryear']=="2019"]
cdc_2020_df = cdc_df.loc[cdc_df['mmwryear']=="2020"]

## Get Covid Tracking Project data
#### Source: https://covidtracking.com

In [16]:
covidtracking_current_url = 'https://api.covidtracking.com/v1/states/current.json'
covidtracking_current_response = requests.get(covidtracking_current_url)
covidtracking_current_json = covidtracking_current_response.json()
covidtracking_current_df = pd.DataFrame.from_dict(covidtracking_current_json)

In [27]:
for col in covidtracking_current_df.columns: 
    print(col)

date
state
positive
negative
pending
totalTestResults
hospitalizedCurrently
hospitalizedCumulative
inIcuCurrently
inIcuCumulative
onVentilatorCurrently
onVentilatorCumulative
recovered
dataQualityGrade
lastUpdateEt
dateModified
checkTimeEt
death
hospitalized
dateChecked
totalTestsViral
positiveTestsViral
negativeTestsViral
positiveCasesViral
deathConfirmed
deathProbable
totalTestEncountersViral
totalTestsPeopleViral
totalTestsAntibody
positiveTestsAntibody
negativeTestsAntibody
totalTestsPeopleAntibody
positiveTestsPeopleAntibody
negativeTestsPeopleAntibody
totalTestsPeopleAntigen
positiveTestsPeopleAntigen
totalTestsAntigen
positiveTestsAntigen
fips
positiveIncrease
negativeIncrease
total
totalTestResultsSource
totalTestResultsIncrease
posNeg
deathIncrease
hospitalizedIncrease
hash
commercialScore
negativeRegularScore
negativeScore
positiveScore
score
grade


In [33]:
# Display the dataframe
covidtracking_current_df

Unnamed: 0,date,state,positive,negative,pending,totalTestResults,hospitalizedCurrently,hospitalizedCumulative,inIcuCurrently,inIcuCumulative,...,posNeg,deathIncrease,hospitalizedIncrease,hash,commercialScore,negativeRegularScore,negativeScore,positiveScore,score,grade
0,20200926,AK,8315,434554,,442869,43.0,,,,...,442869,1,0,bf7518dcc571d23d1d8206a95e1a1c12173bcb0a,0,0,0,0,0,
1,20200926,AL,151591,970048,,1104932,709.0,16852.0,,1791.0,...,1121639,10,0,88370a431af70a13b1499a5cd6afe94a14fc9fb3,0,0,0,0,0,
2,20200926,AR,80755,855635,,933895,447.0,5202.0,213.0,,...,936390,19,0,1177f6d15b81adb770caebd6b66b47189c93f198,0,0,0,0,0,
3,20200926,AS,0,1571,,1571,,,,,...,1571,0,0,b9eccb7369bdc4ee75708c7a892c5437ab915d83,0,0,0,0,0,
4,20200926,AZ,216826,1221851,,1434227,509.0,22005.0,115.0,,...,1438677,35,33,130636fd317b3160d4644679abe3501863463881,0,0,0,0,0,
5,20200926,CA,798237,13384493,,14182730,3203.0,,862.0,,...,14182730,134,0,bc22d39302ce5646a61aef19adbfa503f5de5e76,0,0,0,0,0,
6,20200926,CO,67926,804806,,1277613,248.0,7479.0,,,...,872732,5,25,e66aa724278acdbfea40842a634bf1ac4a32fa77,0,0,0,0,0,
7,20200926,CT,56587,1462564,,1519151,76.0,11560.0,,,...,1519151,0,0,597a6f3bd03d5e8297830e769c3da2cb90e767e7,0,0,0,0,0,
8,20200926,DC,15215,362764,,377979,101.0,,25.0,,...,377979,1,0,5dd3065a746eea8ae0e9a70d3218cd0129db3054,0,0,0,0,0,
9,20200926,DE,20156,260243,,280399,57.0,,10.0,,...,280399,2,0,e3e0fc4116a2a29665fe6bffd51a89a13780977a,0,0,0,0,0,


In [18]:
covidtracking_all_url = 'https://api.covidtracking.com/v1/us/daily.json'
covidtracking_all_response = requests.get(covidtracking_all_url)
covidtracking_all_json = covidtracking_all_response.json()
covidtracking_all_df = pd.DataFrame.from_dict(covidtracking_all_json)

In [52]:
# Display the dataframe
covidtracking_all_df.to_csv("test.csv")

### Connect and load to postgressql

### Available dataframes:
<b>NYT:</b> covid_all_df, covid_latest_df <br>
<b>Masks:</b> masks_df <br>
<b>CDC:</b> cdc_df, cdc_2019_df, cdc_2020_df <br>
<b>Covid tracking:</b> covidtracking_current_df, covidtracking_all_df

In [57]:
# Connect to local database
# Make sure you fill out the user / password for MySQL
rds_connection_string = "covid_db_admin:pass123@localhost:5432/covid_db"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [58]:
# Check for Tables
engine.table_names()

['cdc', 'covid', 'masks']

### Load data into database

In [59]:
# Load covid_all_df dataframe into database
covid_all_df.to_sql(name='covid', con=engine, if_exists='append', index=False)

In [60]:
# Load mask_df dataframe into database
masks_df.to_sql(name='masks', con=engine, if_exists='append', index=False)

In [61]:
# Load cdc_df dataframe into database
cdc_df.to_sql(name='cdc', con=engine, if_exists='append', index=False)

In [62]:
# Not yet working
#covidtracking_current_df.to_sql(name='covidtracking_current', con=engine, if_exists='append', index=False)

In [63]:
# Not yet working
#covidtracking_all_df.to_sql(name='covidtracking_all', con=engine, if_exists='append', index=False)

### Confirm data load

In [64]:
# Confirm covid data has been added
pd.read_sql_query('select * from covid', con=engine)

Unnamed: 0,date,county,state,fips,cases,deaths
0,2020-01-21,Snohomish,Washington,53061,1,0
1,2020-01-22,Snohomish,Washington,53061,1,0
2,2020-01-23,Snohomish,Washington,53061,1,0
3,2020-01-24,Cook,Illinois,17031,1,0
4,2020-01-24,Snohomish,Washington,53061,1,0
...,...,...,...,...,...,...
564603,2020-09-25,Sweetwater,Wyoming,56037,332,2
564604,2020-09-25,Teton,Wyoming,56039,532,1
564605,2020-09-25,Uinta,Wyoming,56041,348,2
564606,2020-09-25,Washakie,Wyoming,56043,113,6


In [65]:
# Confirm mask use data has been added
pd.read_sql_query('select * from masks', con=engine)

Unnamed: 0,fips,never,rarely,sometimes,frequently,always
0,1001,0.053,0.074,0.134,0.295,0.444
1,1003,0.083,0.059,0.098,0.323,0.436
2,1005,0.067,0.121,0.120,0.201,0.491
3,1007,0.020,0.034,0.096,0.278,0.572
4,1009,0.053,0.114,0.180,0.194,0.459
...,...,...,...,...,...,...
3137,56037,0.061,0.295,0.230,0.146,0.268
3138,56039,0.095,0.157,0.160,0.247,0.340
3139,56041,0.098,0.278,0.154,0.207,0.264
3140,56043,0.204,0.155,0.069,0.285,0.287


In [66]:
# Confirm cdc data has been added
pd.read_sql_query('select * from cdc', con=engine)

Unnamed: 0,state,year,week,week_ending_date,all_causes,natural_causes,septicemia,malignant_neoplasms,diabetes,alzheimers,...,flag_cov19ucod,flag_sept,flag_diab,flag_alz,flag_clrd,flag_stroke,flag_hd,flag_neopl,flag_allcause,flag_natcause
0,Alabama,2019,1,2019-01-05,1077.0,993.0,30.0,198.0,22.0,60.0,...,,,,,,,,,,
1,Alabama,2019,2,2019-01-12,1090.0,994.0,25.0,187.0,24.0,49.0,...,,,,,,,,,,
2,Alabama,2019,3,2019-01-19,1114.0,1042.0,22.0,238.0,18.0,48.0,...,,,,,,,,,,
3,Alabama,2019,4,2019-01-26,1063.0,994.0,21.0,165.0,22.0,50.0,...,,,,,,,,,,
4,Alabama,2019,5,2019-02-02,1095.0,1026.0,18.0,199.0,19.0,52.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,Hawaii,2019,17,2019-04-27,221.0,211.0,,49.0,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
996,Hawaii,2019,18,2019-05-04,209.0,190.0,,37.0,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
997,Hawaii,2019,19,2019-05-11,246.0,232.0,,52.0,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
998,Hawaii,2019,20,2019-05-18,233.0,213.0,,53.0,,,...,,Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),Suppressed (counts 1-9),,,,,
