# Compile US-level cases, deaths, caseRate, and deathRate for specific days and weeks for waves 1 - 3.

**[Work in progress]**

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph
import time
import unidecode
import difflib
from functools import reduce

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

#### Connect to COVID-19-Net Knowledge Graph

In [3]:
graph = Graph("bolt://132.249.238.185:7687", user="reader", password="demo")

In [4]:
query = """
MATCH (k:Country{iso3: 'USA'})<-[:IN]-(Admin1)<-[:IN]-(a2:Admin2),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 

WITH k, a2, d
OPTIONAL MATCH (c:Cases{source:'JHU'})-[:REPORTED_IN]->(a2)
WHERE c.date = date($date)
      
RETURN k.iso3 AS iso3, 
       sum(d.totalPopulation) as population, 
       sum(c.cases) AS cases, 
       sum(c.deaths) AS deaths,
       sum(c.cases)*100000.0/sum(d.totalPopulation) AS caseRate, 
       sum(c.deaths)*100000.0/sum(d.totalPopulation) AS deathRate
"""

### Data for 2020-02-28

In [5]:
data_20200228 = graph.run(query, date='2020-02-28').to_data_frame() 

In [6]:
data_20200228.rename(columns={'cases': 'cases_2020-02-28', 'deaths': 'death_2020-02-28', 
                              'caseRate': 'caseRate_2020-02-28', 'deathRate': 'deathRate_2020-02-28'}, inplace=True)

In [7]:
data_20200228.head()

Unnamed: 0,iso3,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28
0,USA,322903030,17,0,0.005265,0.0


### Data for 2020-04-27

In [8]:
data_20200427 = graph.run(query, date='2020-04-27').to_data_frame()

In [9]:
data_20200427.rename(columns={'cases': 'cases_2020-04-27', 'deaths': 'death_2020-04-27', 
                              'caseRate': 'caseRate_2020-04-27', 'deathRate': 'deathRate_2020-04-27'}, inplace=True)

In [10]:
data_20200427.head()

Unnamed: 0,iso3,population,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27
0,USA,322903030,985175,52673,305.099336,16.312328


### Data for 2020-08-05

In [11]:
data_20200805 = graph.run(query, date='2020-08-05').to_data_frame() 

In [12]:
data_20200805.rename(columns={'cases': 'cases_2020-08-05', 'deaths': 'death_2020-08-05', 
                              'caseRate': 'caseRate_2020-08-05', 'deathRate': 'deathRate_2020-08-05'}, inplace=True)

In [13]:
data_20200805.head()

Unnamed: 0,iso3,population,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05
0,USA,322903030,4744188,156315,1469.229942,48.40927


### Get 7-day averages

In [14]:
# Wave 1: Feb. 21-27 (pre-launch) versus Jan 24-Jan 30 (comparison baseline)
# Wave 2: Apr 20-26 versus Mar 23-29
# Wave 3: July 29-Aug 4 versus June 30-July 6

In [15]:
days = 7.0

In [16]:
query = """
MATCH (k:Country{iso3: 'USA'})<-[:IN]-(a1:Admin1)<-[:IN]-(a2:Admin2),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 

WITH k, a2, d

OPTIONAL MATCH (c:Cases{source:'JHU'})-[:REPORTED_IN]->(a2)
WHERE c.date >= date($startDate) AND c.date <= date($endDate)
WITH k, d, sum(c.cases) AS cases, sum(c.deaths) AS deaths
      
RETURN k.iso3 AS iso3, 
       sum(d.totalPopulation) AS population, 
       sum(cases)/$days AS cases, sum(deaths)/$days AS deaths,
       sum(cases)*100000.0/$days/sum(d.totalPopulation) as caseRate, 
       sum(deaths)*100000.0/$days/sum(d.totalPopulation) as deathRate
"""

In [17]:
data_20200124_20200130 = graph.run(query, startDate='2020-01-24', endDate='2020-01-30', days=days).to_data_frame()  

In [18]:
data_20200124_20200130.rename(columns={'cases': 'avgcases_2020-01-24_2020-01-30', 'deaths': 'avgdeath_2020-01-24_2020-01-30', 
                              'caseRate': 'avgcaseRate_2020-01-24_2020-01-30', 'deathRate': 'avgdeathRate_2020-01-24_2020-01-30'}, inplace=True)

In [19]:
data_20200124_20200130.head()

Unnamed: 0,iso3,population,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30
0,USA,322903030,4.428571,0.0,0.001371,0.0


In [20]:
data_20200221_20200227 = graph.run(query, startDate='2020-02-21', endDate='2020-02-27', days=days).to_data_frame() 

In [21]:
data_20200221_20200227.rename(columns={'cases': 'avgcases_2020-02-21_2020-02-27', 'deaths': 'avgdeath_2020-02-21_2020-02-27', 
                              'caseRate': 'avgcaseRate_2020-02-21_2020-02-27', 'deathRate': 'avgdeathRate_2020-02-21_2020-02-27'}, inplace=True)

In [22]:
data_20200221_20200227.head()

Unnamed: 0,iso3,population,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27
0,USA,322903030,16.142857,0.0,0.004999,0.0


In [23]:
data_20200323_20200329 = graph.run(query, startDate='2020-03-23', endDate='2020-03-29', days=days).to_data_frame() 

In [24]:
data_20200323_20200329.rename(columns={'cases': 'avgcases_2020-03-23_2020-03-29', 'deaths': 'avgdeath_2020-03-23_2020-03-29', 
                              'caseRate': 'avgcaseRate_2020-03-23_2020-03-29', 'deathRate': 'avgdeathRate_2020-03-23_2020-03-29'}, inplace=True)

In [25]:
data_20200323_20200329.head()

Unnamed: 0,iso3,population,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29
0,USA,322903030,87327.428571,1659.428571,27.044475,0.513909


In [26]:
data_20200420_20200426 = graph.run(query, startDate='2020-04-20', endDate='2020-04-26', days=days).to_data_frame() 

In [27]:
data_20200420_20200426.rename(columns={'cases': 'avgcases_2020-04-20_2020-04-26', 'deaths': 'avgdeath_2020-04-20_2020-04-26', 
                              'caseRate': 'avgcaseRate_2020-04-20_2020-04-26', 'deathRate': 'avgdeathRate_2020-04-20_2020-04-26'}, inplace=True)

In [28]:
data_20200420_20200426.head()

Unnamed: 0,iso3,population,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26
0,USA,322903030,871105.285714,45676.857143,269.773029,14.145689


In [29]:
data_20200630_20200706 = graph.run(query, startDate='2020-06-30', endDate='2020-07-06', days=days).to_data_frame() 

In [30]:
data_20200630_20200706.rename(columns={'cases': 'avgcases_2020-06-30_2020-07-06', 'deaths': 'avgdeath_2020-04-20_2020-07-06', 
                              'caseRate': 'avgcaseRate_2020-06-30_2020-07-06', 'deathRate': 'avgdeathRate_2020-06-30_2020-07-06'}, inplace=True)

In [31]:
data_20200630_20200706.head()

Unnamed: 0,iso3,population,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06
0,USA,322903030,2752229.0,127685.714286,852.339257,39.543052


In [32]:
data_20200729_20200804 = graph.run(query, startDate='2020-07-29', endDate='2020-08-04', days=days).to_data_frame() 

In [33]:
data_20200729_20200804.rename(columns={'cases': 'avgcases_2020-07-29_2020-08-04', 'deaths': 'avgdeath_2020-07-29_2020-08-04', 
                              'caseRate': 'avgcaseRate_2020-07-29_2020-08-04', 'deathRate': 'avgdeathRate_2020-07-29_2020-08-04'}, inplace=True)

In [34]:
data_20200729_20200804.head()

Unnamed: 0,iso3,population,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,USA,322903030,4532361.0,152303.285714,1403.629221,47.16688


#### Consistency check (these numbers should match the daily numbers in dataframe: data_20200805)

In [35]:
data_20200805_20200805 = graph.run(query, startDate='2020-08-05', endDate='2020-08-05', days=1.0).to_data_frame() 

In [36]:
data_20200805_20200805 = data_20200805_20200805.merge(data_20200805, on=['iso3', 'population'])

In [37]:
data_20200805_20200805.head()

Unnamed: 0,iso3,population,cases,deaths,caseRate,deathRate,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05
0,USA,322903030,4744188.0,156315.0,1469.229942,48.40927,4744188,156315,1469.229942,48.40927


### Merge dataframes

In [38]:
data_frames = [data_20200228, data_20200427, data_20200805, data_20200124_20200130, data_20200221_20200227, 
               data_20200323_20200329, data_20200420_20200426, data_20200630_20200706, data_20200729_20200804]

In [39]:
data = reduce(lambda  left,right: pd.merge(left,right,on=['iso3', 'population'], how='outer'), data_frames).fillna(0.0)

In [40]:
data.head()

Unnamed: 0,iso3,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,USA,322903030,17,0,0.005265,0.0,985175,52673,305.099336,16.312328,4744188,156315,1469.229942,48.40927,4.428571,0.0,0.001371,0.0,16.142857,0.0,0.004999,0.0,87327.428571,1659.428571,27.044475,0.513909,871105.285714,45676.857143,269.773029,14.145689,2752229.0,127685.714286,852.339257,39.543052,4532361.0,152303.285714,1403.629221,47.16688


In [41]:
data.to_csv("COVID_waves1to3_US_JHU.csv", index=False)