In [1]:
# packages for data collection and cleaning
import requests
import json
from io import StringIO
from bs4 import BeautifulSoup as bs
from datetime import date, timedelta
import covidcast
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

# packages for mapping
import geopandas as gpd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl

## NYT Mask Wearing Survey Data

- This data comes from a large number of interviews conducted online by the firm Dynata, which asked a question about mask use and obtained 250,000 survey responses between July 2 and July 14, enough data to provide estimates more detailed than the state level.

- This survey was conducted for a single run and there are no plans for a follow-up study. Specifically, each participant was asked the following question: 

<center><strong>How often do you wear a mask in public when you expect to be within six feet of another person?</strong></center>

- Here are the definitions for the column headings:

    - **COUNTYFP**: The county FIPS code.
    - **NEVER**: The estimated share of people in this county who would say never in response to the question: “How often do you wear a mask in public when you expect to be within six feet of another person?”
    - **RARELY**: The estimated share of people in this county who would say rarely
    - **SOMETIMES**: The estimated share of people in this county who would say sometimes
    - **FREQUENTLY**: The estimated share of people in this county who would say frequently
    - **ALWAYS**: The estimated share of people in this county who would say always
    

- In their analysis, they assumed the following relationships to be true:  
    - ALWAYS : 100%
    - FREQUENTLY : 80%
    - SOMETIMES : 50%
    - RARELY : 20%
    - NEVER : 0%



In [2]:
# The NYT publishes this survey only as a CSV file in their GitHub repo (no API).
url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv'
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of silently parsing an error page as CSV.
response.raise_for_status()
s = response.text

# Read the county code as text so leading zeros are never lost, then left-pad
# any short code to the five-character FIPS form used for joins.
nymask = pd.read_csv(StringIO(s), dtype={'COUNTYFP': str})
nymask['COUNTYFP'] = nymask['COUNTYFP'].str.zfill(5)
# Rename by column name (not position) so a reordered file cannot mislabel the shares.
nymask = nymask.rename(columns={'COUNTYFP': 'FIPS'})

In [3]:
# Summary statistics for the five response-share columns; describe() leaves out the string FIPS column.
nymask.describe()

Unnamed: 0,NEVER,RARELY,SOMETIMES,FREQUENTLY,ALWAYS
count,3142.0,3142.0,3142.0,3142.0,3142.0
mean,0.07994,0.082919,0.121318,0.207725,0.508094
std,0.058538,0.055464,0.058011,0.063571,0.152191
min,0.0,0.0,0.001,0.029,0.115
25%,0.034,0.04,0.079,0.164,0.39325
50%,0.068,0.073,0.115,0.204,0.497
75%,0.113,0.115,0.156,0.247,0.61375
max,0.432,0.384,0.422,0.549,0.889


<img src='images/fb_mask_data.png'>

## Carnegie Mellon Mask Wearing Facebook Survey Data

- In conjunction with Facebook and a survey firm, the Carnegie Mellon study collected 1,220,000 valid responses from respondents across the US.

- The survey involves questions about symptoms, mask wearing, testing, and the other important topics in relation to COVID-19 as described above, along with demographic details about the respondent. 
    - These demographics include age, gender, race, occupation, and education, allowing us to understand how different groups have been affected and which groups are currently most vulnerable to COVID-19. 
    
- The two relevant questions in relation to face masks are the following:
    1. In the past 5 days, did you wear a mask most or all of the time in public?
    2. In the past 7 days, when you were in public places where social distancing is not possible, did most or all other people wear masks?

- The four signals available for data collection through Covidcast are:
    1. **`smoothed_wearing_mask`**
    2. **`smoothed_others_masked`**
    3. **`smoothed_wwearing_mask`**
    4. **`smoothed_wothers_masked`**

- The smoothed versions of all the `fb-survey` signals (with `smoothed_` prefix) are calculated using seven day pooling.
- The weighted versions (with the `smoothed_w` prefix) adjust the data to be representative of the US population, adjusting both for:

    1. the differences between the US population and US Facebook users (according to a state-by-age-gender stratification of the US population from the 2018 Census March Supplement)
    2. the propensity of a Facebook user to take our survey in the first place
    
**Dashboard website:** https://delphi.cmu.edu/covidcast/?date=20210102&region=42003


**First question link:** https://delphi.cmu.edu/covidcast/survey-results/?date=20210102&region=42003#self-reported-mask-use


**Second question link:** https://delphi.cmu.edu/covidcast/survey-results/?date=20210102&region=42003#other-people-wearing-masks


- Data for the first question starts on the 8th of September, while data for the second question starts on the 25th of November.

In [5]:
# End the query window two days before the current date.
two_days_ago = date.today() - timedelta(days=2)

# County-level share of respondents who report wearing a mask themselves.
mask_ind = covidcast.signal(
    data_source="fb-survey",
    signal="smoothed_wearing_mask",
    start_day=date(2020, 9, 8),
    end_day=two_days_ago,
    geo_type="county",
)

# County-level share of respondents who report that other people are masked.
mask_oth = covidcast.signal(
    data_source="fb-survey",
    signal="smoothed_others_masked",
    start_day=date(2020, 11, 24),
    end_day=two_days_ago,
    geo_type="county",
)

In [None]:
def _to_county_fips(survey):
    """Return a county-only copy of a survey frame keyed by a 5-character FIPS column.

    Parameters
    ----------
    survey : pandas.DataFrame
        Frame holding the location code in a ``geo_value`` column (or in
        ``FIPS`` if this cell has already been run once).

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # rename() ignores a missing key, so a frame that already has FIPS passes
    # through unchanged and the cell can be re-run safely.
    renamed = survey.rename(columns={'geo_value': 'FIPS'})
    # Codes ending in '000' are the lumped buckets used where responses were
    # too few (FIPS = 2-digit state code + 3-digit county code), not counties.
    is_lumped = renamed['FIPS'].str.endswith('000')
    # Explicit copy so the assignment below does not write into a slice.
    county_only = renamed.loc[~is_lumped].copy()
    # change four digit FIPS code to all five digit FIPS code
    county_only['FIPS'] = county_only['FIPS'].str.zfill(5)
    return county_only


mask_ind = _to_county_fips(mask_ind)
mask_oth = _to_county_fips(mask_oth)

# check number of counties represented
print(mask_ind.FIPS.value_counts().shape)
print(mask_oth.FIPS.value_counts().shape)

In [None]:
# Collapse each survey signal to one average value per county so it can be
# lined up against the NYT dataset.
mask_ind_means = (
    mask_ind.groupby('FIPS')['value']
    .mean()
    .rename('ind_mask')
    .reset_index()
)
mask_oth_means = (
    mask_oth.groupby('FIPS')['value']
    .mean()
    .rename('oth_mask')
    .reset_index()
)

# Outer join keeps counties that appear in only one of the two signals.
mask_means = pd.merge(mask_ind_means, mask_oth_means, how='outer', on='FIPS')

<img src='images/delphi_dec.png'>

<center>Data from Delphi COVIDcast. Obtained via the Delphi Epidata API. <a href="https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html">https://cmu-delphi.github.io/delphi-epidata/api/covidcast.html</a></center>