# Compile state-level cases, deaths, caseRate, and deathRate for specific days and weeks for waves 1-3

**[Work in progress]**

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph
import time
import unidecode
import difflib
from functools import reduce

In [2]:
# Lift pandas' display limits so DataFrames are shown in full (all rows, all columns).
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

#### Connect to COVID-19-Net Knowledge Graph

In [3]:
# Handle to the COVID-19-Net Knowledge Graph at this Bolt URI; every query below runs through it.
# NOTE(review): credentials are hard-coded; "reader"/"demo" look like a public read-only account — confirm before reuse.
graph = Graph("bolt://132.249.238.185:7687", user="reader", password="demo")

In [4]:
# Cypher query: state-level (Admin1) totals for a single reporting date ($date).
# Each Admin2 region is joined to its Admin1 parent and its Admin2-level demographics;
# OPTIONAL MATCH keeps regions that have no JHU case record on that date.
# Returns summed population, cases and deaths per state, plus case/death rates
# per 100,000 population, ordered by state code.
query = """
MATCH (a1:Admin1)<-[:IN]-(a2:Admin2),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 

WITH a1, a2, d
OPTIONAL MATCH (c:Cases{source:'JHU'})-[:REPORTED_IN]->(a2)
WHERE c.date = date($date)
      
RETURN a1.name AS state, a1.code AS code, 
       sum(d.totalPopulation) as population, 
       sum(c.cases) AS cases, sum(c.deaths) AS deaths,
       sum(c.cases)*100000.0/sum(d.totalPopulation) AS caseRate, 
       sum(c.deaths)*100000.0/sum(d.totalPopulation) AS deathRate
ORDER BY code
"""

### Data for 2020-02-28

In [5]:
# State-level JHU values for the single reporting date 2020-02-28.
data_20200228 = graph.run(
    query,
    date="2020-02-28",
).to_data_frame()

In [6]:
# Suffix the metric columns with the reporting date so they stay distinct once
# the per-date frames are merged on state/code/population.
data_20200228.rename(
    columns=dict(
        cases="cases_2020-02-28",
        deaths="death_2020-02-28",
        caseRate="caseRate_2020-02-28",
        deathRate="deathRate_2020-02-28",
    ),
    inplace=True,
)

In [7]:
# Preview the first five rows.
data_20200228.head(n=5)

Unnamed: 0,state,code,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28
0,Alaska,AK,738516,0,0,0.0,0.0
1,Alabama,AL,4864680,0,0,0.0,0.0
2,Arkansas,AR,2990671,0,0,0.0,0.0
3,Arizona,AZ,6946685,1,0,0.014395,0.0
4,California,CA,39148760,11,0,0.028098,0.0


### Data for 2020-04-27

In [8]:
# State-level JHU values for the single reporting date 2020-04-27.
data_20200427 = graph.run(
    query,
    date="2020-04-27",
).to_data_frame()

In [9]:
# Suffix the metric columns with the reporting date so they stay distinct once
# the per-date frames are merged on state/code/population.
data_20200427.rename(
    columns=dict(
        cases="cases_2020-04-27",
        deaths="death_2020-04-27",
        caseRate="caseRate_2020-04-27",
        deathRate="deathRate_2020-04-27",
    ),
    inplace=True,
)

In [10]:
# Preview the first five rows.
data_20200427.head(n=5)

Unnamed: 0,state,code,population,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27
0,Alaska,AK,738516,345,5,46.715305,0.677033
1,Alabama,AL,4864680,6539,228,134.417886,4.686845
2,Arkansas,AR,2990671,2866,51,95.831337,1.705303
3,Arizona,AZ,6946685,6725,275,96.808766,3.958723
4,California,CA,39148760,44972,1777,114.874647,4.539097


### Data for 2020-08-05

In [11]:
# State-level JHU values for the single reporting date 2020-08-05.
data_20200805 = graph.run(
    query,
    date="2020-08-05",
).to_data_frame()

In [12]:
# Suffix the metric columns with the reporting date so they stay distinct once
# the per-date frames are merged on state/code/population.
data_20200805.rename(
    columns=dict(
        cases="cases_2020-08-05",
        deaths="death_2020-08-05",
        caseRate="caseRate_2020-08-05",
        deathRate="deathRate_2020-08-05",
    ),
    inplace=True,
)

In [13]:
# Preview the first five rows.
data_20200805.head(n=5)

Unnamed: 0,state,code,population,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05
0,Alaska,AK,738516,3448,25,466.882234,3.385167
1,Alabama,AL,4864680,91776,1639,1886.578357,33.691836
2,Arkansas,AR,2990671,44607,508,1491.538187,16.986155
3,Arizona,AZ,6946685,182203,3931,2622.876955,56.588142
4,California,CA,39148760,530606,9808,1355.358382,25.053156


### Get 7-day averages

In [14]:
# Wave 1: Feb. 21-27 (pre-launch) versus Jan 24-Jan 30 (comparison baseline)
# Wave 2: Apr 20-26 versus Mar 23-29
# Wave 3: July 29-Aug 4 versus June 30-July 6

In [15]:
# Length of each averaging window in days; passed to the windowed query as $days,
# where it is the divisor for the summed cases, deaths and rates.
days = 7.0

In [16]:
# NOTE: inactive earlier draft of the windowed query, kept for reference only.
# It hard-codes the 7.0 divisor (the active query takes it as $days) and groups the
# intermediate WITH by the population *value* rather than by the Demographics node.
# NOTE(review): two regions of one state with equal population would presumably be
# collapsed into one group by that draft — confirm before reviving it.
# query = """
# MATCH (a2:Admin2)-[:IN]->(a1:Admin1),
#        (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 

# WITH a1, a2, d.totalPopulation AS population

# OPTIONAL MATCH (c:Cases{source:'JHU'})-[:REPORTED_IN]->(a2)
# WHERE c.date >= date($startDate) AND c.date <= date($endDate)
# WITH a1, population, sum(c.cases) AS cases, sum(c.deaths) AS deaths
      
# RETURN a1.name AS state, a1.code AS code, sum(population) AS population, sum(cases)/7.0 AS cases, sum(deaths)/7.0 AS deaths,
#        sum(cases)*100000.0/7.0/sum(population) as caseRate, sum(deaths)*100000.0/7.0/sum(population) as deathRate
# ORDER BY code
# """

In [17]:
# Cypher query: state-level (Admin1) values over the inclusive date window
# [$startDate, $endDate], divided by $days to give a per-day average.
# Cases and deaths are first summed per (a1, d) group in the intermediate WITH,
# so each Demographics node contributes its population once to the state total
# rather than once per matched case record.
# Returns population, averaged cases/deaths and the corresponding rates per
# 100,000 population, ordered by state code.
query = """
MATCH (a2:Admin2)-[:IN]->(a1:Admin1),
       (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 

WITH a1, a2, d

OPTIONAL MATCH (c:Cases{source:'JHU'})-[:REPORTED_IN]->(a2)
WHERE c.date >= date($startDate) AND c.date <= date($endDate)
WITH a1, d, sum(c.cases) AS cases, sum(c.deaths) AS deaths
      
RETURN a1.name AS state, a1.code AS code, 
       sum(d.totalPopulation) AS population, 
       sum(cases)/$days AS cases, sum(deaths)/$days AS deaths,
       sum(cases)*100000.0/$days/sum(d.totalPopulation) as caseRate, 
       sum(deaths)*100000.0/$days/sum(d.totalPopulation) as deathRate
ORDER BY code
"""

In [18]:
# Per-day averages over the window 2020-01-24 .. 2020-01-30 (inclusive).
data_20200124_20200130 = graph.run(
    query,
    startDate="2020-01-24",
    endDate="2020-01-30",
    days=days,
).to_data_frame()

In [19]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
data_20200124_20200130.rename(
    columns=dict(
        cases="avgcases_2020-01-24_2020-01-30",
        deaths="avgdeath_2020-01-24_2020-01-30",
        caseRate="avgcaseRate_2020-01-24_2020-01-30",
        deathRate="avgdeathRate_2020-01-24_2020-01-30",
    ),
    inplace=True,
)

In [20]:
# Preview the first five rows.
data_20200124_20200130.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30
0,Alaska,AK,738516,0.0,0.0,0.0,0.0
1,Alabama,AL,4864680,0.0,0.0,0.0,0.0
2,Arkansas,AR,2990671,0.0,0.0,0.0,0.0
3,Arizona,AZ,6946685,0.714286,0.0,0.010282,0.0
4,California,CA,39148760,1.428571,0.0,0.003649,0.0


In [21]:
# Per-day averages over the window 2020-02-21 .. 2020-02-27 (inclusive).
data_20200221_20200227 = graph.run(
    query,
    startDate="2020-02-21",
    endDate="2020-02-27",
    days=days,
).to_data_frame()

In [22]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
data_20200221_20200227.rename(
    columns=dict(
        cases="avgcases_2020-02-21_2020-02-27",
        deaths="avgdeath_2020-02-21_2020-02-27",
        caseRate="avgcaseRate_2020-02-21_2020-02-27",
        deathRate="avgdeathRate_2020-02-21_2020-02-27",
    ),
    inplace=True,
)

In [23]:
# Preview the first five rows.
data_20200221_20200227.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27
0,Alaska,AK,738516,0.0,0.0,0.0,0.0
1,Alabama,AL,4864680,0.0,0.0,0.0,0.0
2,Arkansas,AR,2990671,0.0,0.0,0.0,0.0
3,Arizona,AZ,6946685,1.0,0.0,0.014395,0.0
4,California,CA,39148760,10.142857,0.0,0.025909,0.0


In [24]:
# Per-day averages over the window 2020-03-23 .. 2020-03-29 (inclusive).
data_20200323_20200329 = graph.run(
    query,
    startDate="2020-03-23",
    endDate="2020-03-29",
    days=days,
).to_data_frame()

In [25]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
data_20200323_20200329.rename(
    columns=dict(
        cases="avgcases_2020-03-23_2020-03-29",
        deaths="avgdeath_2020-03-23_2020-03-29",
        caseRate="avgcaseRate_2020-03-23_2020-03-29",
        deathRate="avgdeathRate_2020-03-23_2020-03-29",
    ),
    inplace=True,
)

In [26]:
# Preview the first five rows.
data_20200323_20200329.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29
0,Alaska,AK,738516,58.0,0.285714,7.853587,0.038688
1,Alabama,AL,4864680,491.714286,2.857143,10.107844,0.058732
2,Arkansas,AR,2990671,297.714286,0.857143,9.954766,0.028661
3,Arizona,AZ,6946685,546.714286,6.571429,7.870146,0.094598
4,California,CA,39148760,3887.714286,80.285714,9.930619,0.205079


In [27]:
# Per-day averages over the window 2020-04-20 .. 2020-04-26 (inclusive).
data_20200420_20200426 = graph.run(
    query,
    startDate="2020-04-20",
    endDate="2020-04-26",
    days=days,
).to_data_frame()

In [28]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
data_20200420_20200426.rename(
    columns=dict(
        cases="avgcases_2020-04-20_2020-04-26",
        deaths="avgdeath_2020-04-20_2020-04-26",
        caseRate="avgcaseRate_2020-04-20_2020-04-26",
        deathRate="avgdeathRate_2020-04-20_2020-04-26",
    ),
    inplace=True,
)

In [29]:
# Preview the first five rows.
data_20200420_20200426.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26
0,Alaska,AK,738516,334.285714,5.0,45.264519,0.677033
1,Alabama,AL,4864680,5756.285714,197.285714,118.328147,4.055472
2,Arkansas,AR,2990671,2378.285714,45.142857,79.523482,1.509456
3,Arizona,AZ,6946685,5776.285714,241.714286,83.151686,3.479563
4,California,CA,39148760,39051.571429,1497.428571,99.751745,3.824971


In [30]:
# Per-day averages over the window 2020-06-30 .. 2020-07-06 (inclusive).
data_20200630_20200706 = graph.run(
    query,
    startDate="2020-06-30",
    endDate="2020-07-06",
    days=days,
).to_data_frame()

In [31]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
# Fix: the deaths column previously carried the start date of a different window
# ("avgdeath_2020-04-20_2020-07-06"); it now uses this frame's own window,
# 2020-06-30 .. 2020-07-06, matching the other three columns.
data_20200630_20200706.rename(
    columns={
        'cases': 'avgcases_2020-06-30_2020-07-06',
        'deaths': 'avgdeath_2020-06-30_2020-07-06',
        'caseRate': 'avgcaseRate_2020-06-30_2020-07-06',
        'deathRate': 'avgdeathRate_2020-06-30_2020-07-06',
    },
    inplace=True,
)

In [32]:
# Preview the first five rows.
data_20200630_20200706.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06
0,Alaska,AK,738516,1055.571429,14.428571,142.931423,1.953725
1,Alabama,AL,4864680,41018.285714,967.0,843.185692,19.877978
2,Arkansas,AR,2990671,21748.714286,281.714286,727.21855,9.419769
3,Arizona,AZ,6946685,90967.714286,1771.571429,1309.512585,25.5024
4,California,CA,39148760,250824.857143,6282.714286,640.696812,16.04831


In [33]:
# Per-day averages over the window 2020-07-29 .. 2020-08-04 (inclusive).
data_20200729_20200804 = graph.run(
    query,
    startDate="2020-07-29",
    endDate="2020-08-04",
    days=days,
).to_data_frame()

In [34]:
# Prefix the metric columns with "avg" and suffix them with the window bounds so
# they stay distinct once the frames are merged on state/code/population.
data_20200729_20200804.rename(
    columns=dict(
        cases="avgcases_2020-07-29_2020-08-04",
        deaths="avgdeath_2020-07-29_2020-08-04",
        caseRate="avgcaseRate_2020-07-29_2020-08-04",
        deathRate="avgdeathRate_2020-07-29_2020-08-04",
    ),
    inplace=True,
)

In [35]:
# Preview the first five rows.
data_20200729_20200804.head(n=5)

Unnamed: 0,state,code,population,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,Alaska,AK,738516,3116.857143,23.714286,422.043279,3.211073
1,Alabama,AL,4864680,86679.0,1550.857143,1781.802709,31.879942
2,Arkansas,AR,2990671,41584.285714,459.428571,1390.466745,15.362057
3,Arizona,AZ,6946685,175507.428571,3700.714286,2526.491824,53.273098
4,California,CA,39148760,505818.142857,9287.714286,1292.041288,23.72416


#### Consistency check (these numbers should match the daily numbers in dataframe: data_20200805)

In [36]:
# One-day window with a divisor of 1.0: the windowed query should then reproduce
# the single-date values for 2020-08-05.
data_20200805_20200805 = graph.run(
    query,
    startDate="2020-08-05",
    endDate="2020-08-05",
    days=1.0,
).to_data_frame()

In [37]:
# Put the windowed and single-date results for 2020-08-05 side by side
# (inner join on the shared state/code/population columns) for comparison.
data_20200805_20200805 = pd.merge(
    data_20200805_20200805,
    data_20200805,
    how="inner",
    on=["state", "code", "population"],
)

In [38]:
# Preview the first five rows.
data_20200805_20200805.head(n=5)

Unnamed: 0,state,code,population,cases,deaths,caseRate,deathRate,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05
0,Alaska,AK,738516,3448.0,25.0,466.882234,3.385167,3448,25,466.882234,3.385167
1,Alabama,AL,4864680,91776.0,1639.0,1886.578357,33.691836,91776,1639,1886.578357,33.691836
2,Arkansas,AR,2990671,44607.0,508.0,1491.538187,16.986155,44607,508,1491.538187,16.986155
3,Arizona,AZ,6946685,182203.0,3931.0,2622.876955,56.588142,182203,3931,2622.876955,56.588142
4,California,CA,39148760,530606.0,9808.0,1355.358382,25.053156,530606,9808,1355.358382,25.053156


### Merge dataframes

In [39]:
# Frames to combine, in output column order.
data_frames = [
    # single-date snapshots
    data_20200228,
    data_20200427,
    data_20200805,
    # windowed per-day averages
    data_20200124_20200130,
    data_20200221_20200227,
    data_20200323_20200329,
    data_20200420_20200426,
    data_20200630_20200706,
    data_20200729_20200804,
]

In [40]:
# Fold all frames into one wide table: successive outer joins on the shared
# state/code/population columns, then replace any missing values with 0.0.
data = reduce(
    lambda merged, frame: merged.merge(
        frame,
        how="outer",
        on=["state", "code", "population"],
    ),
    data_frames,
).fillna(0.0)

In [41]:
# Preview the first five rows of the merged table.
data.head(n=5)

Unnamed: 0,state,code,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,Alaska,AK,738516,0,0,0.0,0.0,345,5,46.715305,0.677033,3448,25,466.882234,3.385167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,58.0,0.285714,7.853587,0.038688,334.285714,5.0,45.264519,0.677033,1055.571429,14.428571,142.931423,1.953725,3116.857143,23.714286,422.043279,3.211073
1,Alabama,AL,4864680,0,0,0.0,0.0,6539,228,134.417886,4.686845,91776,1639,1886.578357,33.691836,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,491.714286,2.857143,10.107844,0.058732,5756.285714,197.285714,118.328147,4.055472,41018.285714,967.0,843.185692,19.877978,86679.0,1550.857143,1781.802709,31.879942
2,Arkansas,AR,2990671,0,0,0.0,0.0,2866,51,95.831337,1.705303,44607,508,1491.538187,16.986155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,297.714286,0.857143,9.954766,0.028661,2378.285714,45.142857,79.523482,1.509456,21748.714286,281.714286,727.21855,9.419769,41584.285714,459.428571,1390.466745,15.362057
3,Arizona,AZ,6946685,1,0,0.014395,0.0,6725,275,96.808766,3.958723,182203,3931,2622.876955,56.588142,0.714286,0.0,0.010282,0.0,1.0,0.0,0.014395,0.0,546.714286,6.571429,7.870146,0.094598,5776.285714,241.714286,83.151686,3.479563,90967.714286,1771.571429,1309.512585,25.5024,175507.428571,3700.714286,2526.491824,53.273098
4,California,CA,39148760,11,0,0.028098,0.0,44972,1777,114.874647,4.539097,530606,9808,1355.358382,25.053156,1.428571,0.0,0.003649,0.0,10.142857,0.0,0.025909,0.0,3887.714286,80.285714,9.930619,0.205079,39051.571429,1497.428571,99.751745,3.824971,250824.857143,6282.714286,640.696812,16.04831,505818.142857,9287.714286,1292.041288,23.72416


In [42]:
# Write the merged table to a CSV file in the working directory, without the index column.
data.to_csv(
    path_or_buf="COVID_waves1to3_states_JHU.csv",
    index=False,
)