In [12]:
#pip install covidcast

In [1]:
import covidcast
from datetime import date
import pandas as pd
import pathlib

In [11]:
# Show the catalogue of available signals: one row per data source / signal /
# time type / geo type, with its date range, location count and value statistics.
covidcast.metadata()

Unnamed: 0,data_source,signal,time_type,geo_type,min_time,max_time,num_locations,min_value,max_value,mean_value,stdev_value,last_update,max_issue,min_lag,max_lag
0,chng,smoothed_adj_outpatient_cli,day,county,2020-02-01,2021-03-17,3039,0.001303,99.925925,4.592721,7.537250,2021-03-22 03:11:03,20210321,3,386
1,chng,smoothed_adj_outpatient_cli,day,hhs,2020-02-01,2021-03-17,10,0.007255,15.212463,3.517410,2.830343,2021-03-22 03:11:06,20210321,4,386
2,chng,smoothed_adj_outpatient_cli,day,hrr,2020-02-01,2021-03-17,306,0.001360,50.815903,3.444361,3.481861,2021-03-22 03:11:06,20210321,4,386
3,chng,smoothed_adj_outpatient_cli,day,msa,2020-02-01,2021-03-17,392,0.001350,50.943135,3.114366,3.491192,2021-03-22 03:11:06,20210321,4,386
4,chng,smoothed_adj_outpatient_cli,day,nation,2020-02-01,2021-03-17,1,0.016727,10.397712,3.795624,2.489618,2021-03-22 03:11:07,20210321,4,386
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1194,usa-facts,deaths_incidence_prop,day,state,2020-01-25,2021-03-19,51,-17.505844,22.964196,0.365023,0.749548,2021-03-21 18:50:25,20210321,1,312
1195,youtube-survey,raw_cli,day,state,2020-04-21,2020-06-01,19,0.000000,3.083082,0.859854,0.642198,2020-06-02 11:51:34,20200603,2,11
1196,youtube-survey,raw_ili,day,state,2020-04-21,2020-06-01,19,0.000000,3.195694,0.859178,0.649258,2020-06-02 11:51:34,20200603,2,11
1197,youtube-survey,smoothed_cli,day,state,2020-04-21,2020-06-22,42,0.000000,4.532164,0.898019,0.573726,2020-06-24 12:51:35,20200625,2,11


## Data to Use
Covidcast: 
* SafeGraph (social distancing metrics, weekly patterns) - https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/safegraph.html - may want to look at data prior to pandemic as well since the data goes back all the way to January 1st, 2019.
* NCHS (CDC) mortality data - https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/nchs-mortality.html - this data differs from the USAFacts and JHU death data because deaths are recorded by the date they occurred, not the date they were reported. Also see the details under "Missingness": some data was removed when death counts were less than 50% of the expected number.
* USAFacts Cases and Deaths - https://cmu-delphi.github.io/delphi-epidata/api/covidcast-signals/usa-facts.html - I plan to use this primarily for the case count.

IPUMS USA:
Will not use this since there's no 2020 data.

U.S. Bureau of Labor Statistics: https://www.bls.gov/
Use for employment and salary data.

In [2]:
# USAFacts cumulative confirmed case counts for 2020-01-25 through 2020-12-31.
# No geo_type is passed, so the client's default geography level applies.
data = covidcast.signal(
    "usa-facts",
    "confirmed_cumulative_num",
    start_day=date(2020, 1, 25),
    end_day=date(2020, 12, 31),
)

In [3]:
# Preview the first rows (earliest date in the requested range).
data.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
0,1000,confirmed_cumulative_num,2020-01-25,2020-10-17,266,0,,,county,usa-facts
1,1001,confirmed_cumulative_num,2020-01-25,2020-10-17,266,0,,,county,usa-facts
2,1003,confirmed_cumulative_num,2020-01-25,2020-10-17,266,0,,,county,usa-facts
3,1005,confirmed_cumulative_num,2020-01-25,2020-10-17,266,0,,,county,usa-facts
4,1007,confirmed_cumulative_num,2020-01-25,2020-10-17,266,0,,,county,usa-facts


In [4]:
# Preview the last rows (latest date in the requested range).
data.tail()

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
3188,56037,confirmed_cumulative_num,2020-12-31,2021-01-02,2,2966,,,county,usa-facts
3189,56039,confirmed_cumulative_num,2020-12-31,2021-01-02,2,2138,,,county,usa-facts
3190,56041,confirmed_cumulative_num,2020-12-31,2021-01-02,2,1558,,,county,usa-facts
3191,56043,confirmed_cumulative_num,2020-12-31,2021-01-02,2,781,,,county,usa-facts
3192,56045,confirmed_cumulative_num,2020-12-31,2021-01-02,2,476,,,county,usa-facts


In [5]:
#nchs_new_death_data = covidcast.signal("nchs-mortality", "deaths_covid_incidence_num")

In [None]:
#covidcast.signal("jhu-csse", "")

In [6]:
# State-level USAFacts cumulative confirmed cases, population-scaled (`_prop`),
# for 2020-01-25 through 2020-12-31.
usa_facts_data = covidcast.signal(
    "usa-facts",
    "confirmed_cumulative_prop",
    start_day=date(2020, 1, 25),
    end_day=date(2020, 12, 31),
    geo_type="state",
)

In [7]:
# Preview the first state-level rows.
usa_facts_data.head()

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
0,ak,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.0,,,state,usa-facts
1,al,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.0,,,state,usa-facts
2,ar,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.0,,,state,usa-facts
3,az,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.0,,,state,usa-facts
4,ca,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.0,,,state,usa-facts


In [8]:
# Preview the last state-level rows.
usa_facts_data.tail()

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
46,vt,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,1187.841452,,,state,usa-facts
47,wa,confirmed_cumulative_prop,2020-12-31,2021-01-03,3,3240.386963,,,state,usa-facts
48,wi,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,8938.478306,,,state,usa-facts
49,wv,confirmed_cumulative_prop,2020-12-31,2021-02-11,42,3973.72537,,,state,usa-facts
50,wy,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,7673.142016,,,state,usa-facts


In [9]:
# Folder that exported files are written to: the current working directory of
# the running kernel, so the location depends on where the notebook is started.
kerstin_folder_path = pathlib.Path.cwd()

In [36]:
# Export the state-level frame to CSV in the working folder.
# index=False: the row index only numbers the states within each day and repeats
# for every date, so writing it would just add a meaningless unnamed column.
# The export still contains every column of the frame, which is more than is
# ultimately wanted.
usa_facts_data.to_csv(
    kerstin_folder_path / "usafacts_cum_cases_pop_state.csv",
    index=False,
)

Notice how many rows the dataframe below has. Ignore the index: it only numbers the states within each day, so the same index values repeat for every date.

In [13]:
#usafacts_cum_cases_pop_state = usa_facts_data[['geo_value','time_value','value']]
#usafacts_cum_cases_pop_state

In [20]:
# Population-scaled cumulative confirmed cases for Illinois counties only.
# County rows are keyed by FIPS code (e.g. "01001"), so geo_values="il" matches
# no county. Fetch every county instead and keep the FIPS codes that start with
# Illinois's state prefix, "17".
# NOTE(review): the variable name ends in `_num` but the requested signal is
# `_prop` -- confirm which one was intended.
_all_county_cum_prop = covidcast.signal(
    "usa-facts",
    "confirmed_cumulative_prop",
    start_day=date(2020, 1, 25),
    end_day=date(2020, 12, 31),
    geo_type="county",
)
# zfill guards against FIPS codes that lost their leading zero.
_is_illinois = (
    _all_county_cum_prop["geo_value"].astype(str).str.zfill(5).str.startswith("17")
)
usafacts_il_county_cum_num = _all_county_cum_prop[_is_illinois].reset_index(drop=True)

In [21]:
# Show the result of the Illinois county request.
usafacts_il_county_cum_num

In [14]:
# County-level USAFacts cumulative confirmed cases, population-scaled (`_prop`),
# for every county, 2020-01-25 through 2020-12-31.
usafacts_county_cum_prop = covidcast.signal(
    "usa-facts",
    "confirmed_cumulative_prop",
    start_day=date(2020, 1, 25),
    end_day=date(2020, 12, 31),
    geo_type="county",
)



In [15]:
# As seen by the warnings, there is no data for Feb 13-19.
# Displaying the whole frame: pandas shows only its first and last rows.
usafacts_county_cum_prop

Unnamed: 0,geo_value,signal,time_value,issue,lag,value,stderr,sample_size,geo_type,data_source
0,01001,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.000000,,,county,usa-facts
1,01003,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.000000,,,county,usa-facts
2,01005,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.000000,,,county,usa-facts
3,01007,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.000000,,,county,usa-facts
4,01009,confirmed_cumulative_prop,2020-01-25,2020-10-17,266,0.000000,,,county,usa-facts
...,...,...,...,...,...,...,...,...,...,...
3137,56037,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,7004.699714,,,county,usa-facts
3138,56039,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,9111.830890,,,county,usa-facts
3139,56041,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,7702.956591,,,county,usa-facts
3140,56043,confirmed_cumulative_prop,2020-12-31,2021-01-02,2,10006.406150,,,county,usa-facts


In [29]:
# 7-day-average cumulative confirmed cases, population-scaled, at county level.
# The signal name uses the infix "7dav" (7-day average); the previous spelling
# "7day" is not a published signal name, so the request returned nothing.
# NOTE(review): the variable name says "il", but no Illinois filter is applied
# here -- this fetches every county. Confirm whether a filter was intended.
usafacts_il_county_cum_7day_prop = covidcast.signal(
    "usa-facts",
    "confirmed_7dav_cumulative_prop",
    start_day=date(2020, 1, 25),
    end_day=date(2020, 12, 31),
    geo_type="county",
)

In [30]:
# Show the result of the 7-day-average county request.
usafacts_il_county_cum_7day_prop