Coronavirus Coverage by US State
================================

A quick investigation into media coverage of the coronavirus pandemic by state in the US.
By Rahul Bhargava

## Setup

In [80]:
# Install the requirements
# pip is invoked through sys.executable (the interpreter path of the running kernel)
# rather than through whichever `pip` happens to be first on the shell PATH.
import sys
!{sys.executable} -m pip install -r requirements.txt



In [81]:
from IPython.display import JSON
import os, mediacloud.api
import csv
import datetime as dt
from dotenv import load_dotenv
import json
load_dotenv()  # load config from .env file
# API client, built with the key read from the MC_API_KEY environment variable
mc = mediacloud.api.MediaCloud(os.getenv('MC_API_KEY'))
mediacloud.__version__
# Boolean operators must be upper case; a lower-case "or" is not parsed as an
# operator, so every alternative is joined with an explicit OR.
COVID_QUERY = 'coronavirus OR covid OR "covid 19"'
# date clause handed to mc.storyCount as its second argument in the cells below
DATE_RANGE = mc.dates_as_query_clause(dt.date(2020,3,1), dt.date(2020,6,1))
# JSON file the notebook writes the per-state data to and reloads it from
DATA_FILE = os.path.join('data', 'state-media-data.json')

## 1. Generate Static List of US Media by State

In [82]:
# load all US state tags (static file copied from MediaCloud-Web-tools repository)
# a context manager closes the file handle as soon as the JSON has been parsed
with open(os.path.join('data','mc-geo-adm1.json')) as geo_file:
    geo_tag_hierarchy = json.load(geo_file)['byCountry']
us = [country for country in geo_tag_hierarchy if country['country']['alpha3'] == 'USA'][0]
# filter out the "national" and combined "state & local" collections
us_state_collections = [s for s in us['collections'] if not s['label'].startswith('United States')]
# add in a handy state name (the part of the label before the first comma)
for s in us_state_collections:
    s['name'] = s['label'].split(',')[0]
us_state_collections[0]

{'show_on_stories': False,
 'label': 'Alabama, United States - State & Local',
 'tag_sets_id': 15765102,
 'tag_set_description': 'Tags in this set indicate that the media source covers a certain geographic area',
 'tags_id': 38381313,
 'tag_set_label': 'Geographic Collections',
 'is_static': False,
 'tag_set_name': 'geographic_collection',
 'show_on_media': True,
 'tag': 'geo_US-AL',
 'description': 'Media is largely about Alabama, United States - State & Local',
 'name': 'Alabama'}

In [83]:
# now get the media in each collection
def all_media_list(**kwargs):
    """Collect every page of `mc.mediaList` results into a single list.

    All keyword arguments are forwarded unchanged to `mc.mediaList`; paging is
    driven by passing the `media_id` of the last item seen as `last_media_id`
    (None on the first request). Fetching stops at the first empty page.
    (Paging approach copied from the Media Cloud API Tutorial notebooks.)

    Returns:
        list: the concatenation of every non-empty page, in the order received.
    """
    collected = []
    cursor = None
    while True:
        page = mc.mediaList(**kwargs, last_media_id=cursor)
        if len(page) == 0:
            break
        collected.extend(page)
        cursor = page[-1]['media_id']
    return collected

# attach the full media list, its size, and the summed `num_stories_90` values to each state
for state in us_state_collections:
    sources = all_media_list(tags_id=state['tags_id'], rows=500)
    state.update(media=sources, media_count=len(sources))
    state['avg_stories_per_day'] = round(sum(source['num_stories_90'] for source in sources))
    print("{} - {} media sources ({} stories/day)".format(state['name'], state['media_count'], state['avg_stories_per_day']))

Alabama - 146 media sources (1021 stories/day)
Alaska - 81 media sources (1259 stories/day)
Arizona - 184 media sources (3254 stories/day)
Arkansas - 141 media sources (1495 stories/day)
California - 1453 media sources (15427 stories/day)
Colorado - 231 media sources (1928 stories/day)
Connecticut - 126 media sources (741 stories/day)
Delaware - 29 media sources (354 stories/day)
District of Columbia - 49 media sources (1647 stories/day)
Florida - 338 media sources (5802 stories/day)
Georgia - 231 media sources (2001 stories/day)
Hawaii - 46 media sources (567 stories/day)
Idaho - 74 media sources (241 stories/day)
Illinois - 335 media sources (5438 stories/day)
Indiana - 188 media sources (1644 stories/day)
Iowa - 173 media sources (1138 stories/day)
Kansas - 168 media sources (1864 stories/day)
Kentucky - 141 media sources (867 stories/day)
Louisiana - 134 media sources (1043 stories/day)
Maine - 75 media sources (682 stories/day)
Maryland - 105 media sources (1601 stories/day)
Massa

In [84]:
# write the per-state collections (now including their media lists) to DATA_FILE as JSON
with open(DATA_FILE, 'w') as data_out:
    json.dump(us_state_collections, data_out)

## 2. Add in State Coronavirus Attention Data

In [85]:
# load the static file of media sources by state, which now includes media lists for each state
# (opened in a `with` block so the file handle is closed once the JSON is parsed)
with open(DATA_FILE) as infile:
    us_state_collections = json.load(infile)
len(us_state_collections)

51

In [86]:
for s in us_state_collections:
    # clause restricting a query to media tagged with this state's collection
    media_clause = "tags_id_media:{}".format(s['tags_id'])
    # add in the total number of stories published from sources in this state
    s['total_stories'] = mc.storyCount(media_clause, DATE_RANGE, split=True)['counts']
    # add in the total number of stories ABOUT COVID published from sources in this state;
    # COVID_QUERY is an OR-expression, so it is parenthesised to make the media clause
    # apply to every alternative instead of only to the first term after the AND
    s['covid_stories'] = mc.storyCount("{} AND ({})".format(media_clause, COVID_QUERY), DATE_RANGE, split=True)['counts']

In [87]:
# generate CSVs of attention data for each state
csv_dir = os.path.join('data', 'state-attention')
os.makedirs(csv_dir, exist_ok=True)  # the output folder may not exist yet
for s in us_state_collections:
    state_abbr = s['tag'].split('-')[1]  # e.g. 'geo_US-AL' -> 'AL'
    state_csv_file = os.path.join(csv_dir, "{}-stories.csv".format(state_abbr))
    s['state_csv_file'] = state_csv_file
    # index the covid counts by date once instead of rescanning the list for every day
    # (assumes at most one entry per date in the covid results)
    covid_by_date = {d['date']: d['count'] for d in s['covid_stories']}
    data = []
    for day in s['total_stories']:
        data.append({'state': s['name'],
                     'date': day['date'],
                     'total': day['count'],
                     # if there were no stories, day won't have entry in covid results
                     'covid': covid_by_date.get(day['date'], 0)
                    })
    # newline='' lets the csv module control line endings (avoids blank rows on Windows)
    with open(state_csv_file, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['state', 'date', 'total', 'covid'], extrasaction='ignore')
        writer.writeheader()
        writer.writerows(data)
# and add the CSV path to the main file
with open(DATA_FILE, 'w') as outfile:
    json.dump(us_state_collections, outfile)

## 3. Chart Attention Data

In [88]:
# charting dependencies used by the cells in this section
import pandas as pd
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import ColumnDataSource
from bokeh.palettes import Spectral3
from bokeh.layouts import gridplot

# configure bokeh so that show() renders its output inside the notebook
output_notebook()

In [89]:
# load the static file of media sources by state, which now also has attention data
# (opened in a `with` block so the file handle is closed once the JSON is parsed)
with open(DATA_FILE) as infile:
    us_state_collections = json.load(infile)
len(us_state_collections)

51

In [90]:
# build one small line chart per state, kept both in list order and keyed by abbreviation
plots = []
plots_by_abbr = {}
for state in us_state_collections:
    abbr = state['tag'].split('-')[1]
    attention = pd.read_csv(state['state_csv_file'])
    attention['date'] = pd.to_datetime(attention['date'], format='%Y-%m-%d')
    attention['pct'] = attention['covid']/attention['total']
    # 3-day rolling means smooth out weekends, so the line reads as a trend rather than spikes
    for column in ('total', 'covid'):
        attention['rolling_' + column] = attention.rolling(window=3)[column].mean()
    attention['rolling_pct'] = attention['rolling_covid']/attention['rolling_total']
    # bare-bones chart: no grid lines, axes, outline or toolbar, y fixed to the 0-1 range
    chart = figure(plot_width=140, plot_height=140, x_axis_type='datetime', title=state['name'], y_range=(0, 1))
    chart.xgrid.grid_line_color = None
    chart.ygrid.grid_line_color = None
    chart.axis.visible = False
    chart.outline_line_color = None
    chart.toolbar.logo = None
    chart.toolbar_location = None
    chart.line(x='date', y='rolling_pct', line_width=1, source=ColumnDataSource(attention))
    plots.append(chart)
    plots_by_abbr[abbr] = chart

In [91]:
def chunks(lst, n):
    """Yield consecutive slices of `lst`, each `n` items long (the last may be shorter).

    credit: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    """
    starts = range(0, len(lst), n)
    yield from (lst[begin:begin + n] for begin in starts)
# arrange the state charts nine to a row and chart them in a grid
rows = [row for row in chunks(plots, 9)]
grid = gridplot(rows)
show(grid)

In [92]:
# now let's replicate R's geofacet library by hand:
# `order` is a flattened 12-column grid, read row by row, where None marks an empty cell
order = ['AK', None, None, None, None, None, None, None, None, 'VT', 'NH', 'ME',
        None, None, None, None, None, None, None, None, 'NY', 'CT', 'MA', None,
        'WA', 'MT', 'ND', 'SD', 'MN', 'WI', 'MI', None, 'PA', 'NJ', 'RI', None,
        'OR', 'ID', 'WY', 'NE', 'IA', 'IL', 'IN', 'OH', 'VA', 'DC', 'DE', None,
        'NV', 'UT', 'CO', 'KS', 'MO', 'TN', 'KY', 'WV', 'NC', 'MD', None, None,
        'CA', 'AZ', 'NM', 'OK', 'AR', 'MS', 'AL', 'GA', 'SC', None, None, None,
        None, None, None, 'TX', 'LA', None, None, None, 'FL', None, None, None,
        'HI', None, None, None, None, None, None, None, None, None, None, None,]
# swap each abbreviation for its chart, leaving the empty cells as None
geo_aligned_plots = [None if cell is None else plots_by_abbr[cell] for cell in order]
rows = list(chunks(geo_aligned_plots, 12))
grid = gridplot(rows)
show(grid)