# Assign place name and county information to zip-level data. Add county-level cases, deaths, caseRate, and deathRate for specific days and weeks for waves 1 - 3.

**[Work in progress]**

Author: Peter Rose (pwrose@ucsd.edu)

In [1]:
import os
import pandas as pd
from pathlib import Path
from py2neo import Graph
import time
import unidecode
import difflib
from functools import reduce

In [2]:
pd.options.display.max_rows = None  # display all rows
pd.options.display.max_columns = None  # display all columsns

#### Connect to COVID-19-Net Knowledge Graph

In [3]:
graph = Graph("bolt://132.249.238.185:7687", user="reader", password="demo")

In [4]:
df = pd.read_csv("COVID_waves1to3_zipcodefocused.csv", dtype='str')

In [5]:
df.head()

Unnamed: 0,S1_ZipCode,S1_StartDate,S1_EndDate,S2_StartDate,S2_EndDate,S3_StartDate,S3_EndDate
0,92410,2/28/2020 18:16:18,2/28/2020 18:40:40,4/29/2020 16:35:35,4/29/2020 17:01:46,8/5/2020 12:22:15,8/5/2020 12:46:30
1,16823,2/28/2020 18:10:08,2/28/2020 18:31:15,4/27/2020 15:51:59,4/27/2020 16:11:09,8/9/2020 5:07:28,8/9/2020 5:32:32
2,68512,2/28/2020 18:36:48,2/28/2020 18:53:21,4/27/2020 15:38:01,4/27/2020 15:55:20,8/5/2020 12:10:10,8/5/2020 12:26:08
3,32935,2/28/2020 18:53:58,2/28/2020 19:20:14,,,,
4,92301,2/28/2020 18:28:28,2/28/2020 19:23:54,4/27/2020 15:29:31,4/27/2020 15:57:35,,


Zip codes must be 5 digits, pad 4 digit zip code with zeros

In [6]:
df['S1_ZipCode'] = df['S1_ZipCode'].apply(lambda x: x if len(x) != 4 else '0' + x)

In [7]:
df.shape

(2005, 7)

In [8]:
df_zip = df.query("S1_ZipCode != ' '").copy()
df_zip['S1_ZipCode'] = df_zip['S1_ZipCode'].astype(int)

In [9]:
zip_unique = df_zip['S1_ZipCode'].unique()

In [10]:
len(zip_unique)

1757

In [11]:
zip_unique

array([92410, 16823, 68512, ..., 75088, 97402, 92092])

In [12]:
df_zip.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1993 entries, 0 to 2004
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   S1_ZipCode    1993 non-null   int64 
 1   S1_StartDate  1993 non-null   object
 2   S1_EndDate    1993 non-null   object
 3   S2_StartDate  1993 non-null   object
 4   S2_EndDate    1993 non-null   object
 5   S3_StartDate  1993 non-null   object
 6   S3_EndDate    1993 non-null   object
dtypes: int64(1), object(6)
memory usage: 124.6+ KB


In [13]:
df.drop_duplicates(inplace=True)
df.shape

(2005, 7)

In [14]:
query = """
OPTIONAL MATCH (p:PostalCode{name:$zip_code})-[i:IN]->(a2:Admin2)-[:IN]->(a1:Admin1),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 
      
RETURN a1.name AS state, a2.name AS county, i.resRatio AS resRatio, p.placeName AS placeName, a1.code AS code,
       $zip_code AS S1_ZipCode, d.totalPopulation as population
"""

In [15]:
loc = pd.concat((graph.run(query, zip_code=row.S1_ZipCode).to_data_frame() 
                           for row in df.itertuples()))

In [16]:
loc.head(10)

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population
0,California,San Bernardino County,1.0,San Bernardino,CA,92410,2135413
0,Pennsylvania,Centre County,1.0,Bellefonte,PA,16823,161443
0,Nebraska,Lancaster County,1.0,Lincoln,NE,68512,310094
0,Florida,Brevard County,1.0,Melbourne,FL,32935,576808
0,California,San Bernardino County,0.999926,Adelanto,CA,92301,2135413
1,California,Los Angeles County,7.4e-05,Adelanto,CA,92301,10098052
0,California,Los Angeles County,1.0,West Hollywood,CA,90069,10098052
0,Indiana,Monroe County,1.0,Bloomington,IN,47408,145403
0,New Jersey,Hudson County,1.0,West New York,NJ,7093,668631
0,Florida,Brevard County,1.0,Titusville,FL,32780,576808


In [17]:
df_loc = df.merge(loc, on='S1_ZipCode', how='left')

In [18]:
df_loc.shape

(3483, 13)

In [83]:
df_loc.head()

Unnamed: 0,S1_ZipCode,S1_StartDate,S1_EndDate,S2_StartDate,S2_EndDate,S3_StartDate,S3_EndDate,state,county,resRatio,placeName,code,population
0,92410,2/28/2020 18:16:18,2/28/2020 18:40:40,4/29/2020 16:35:35,4/29/2020 17:01:46,8/5/2020 12:22:15,8/5/2020 12:46:30,California,San Bernardino County,1.0,San Bernardino,CA,2135413
1,16823,2/28/2020 18:10:08,2/28/2020 18:31:15,4/27/2020 15:51:59,4/27/2020 16:11:09,8/9/2020 5:07:28,8/9/2020 5:32:32,Pennsylvania,Centre County,1.0,Bellefonte,PA,161443
2,68512,2/28/2020 18:36:48,2/28/2020 18:53:21,4/27/2020 15:38:01,4/27/2020 15:55:20,8/5/2020 12:10:10,8/5/2020 12:26:08,Nebraska,Lancaster County,1.0,Lincoln,NE,310094
3,32935,2/28/2020 18:53:58,2/28/2020 19:20:14,,,,,Florida,Brevard County,1.0,Melbourne,FL,576808
4,32935,2/28/2020 18:53:58,2/28/2020 19:20:14,,,,,Florida,Brevard County,1.0,Melbourne,FL,576808


In [20]:
df_loc.to_csv("COVID_waves1to3_zipcodefocused_Locations.csv", index=False)

In [21]:
query = """
MATCH (p:PostalCode{name:$zip_code})-[i:IN]->(a2:Admin2)-[:IN]->(a1:Admin1),
      (a2)<-[:REPORTED_IN]-(c:Cases{source:'JHU'}),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 
      
WHERE c.date = date($date)
      
RETURN $date AS date, a1.name AS state, a2.name AS county, i.resRatio AS resRatio, p.placeName AS placeName, a1.code AS code,
       $zip_code AS S1_ZipCode,
       d.totalPopulation as population, c.cases AS cases, c.deaths AS deaths,
       c.cases*100000.0/d.totalPopulation AS caseRate, 
       c.deaths*100000.0/d.totalPopulation AS deathRate
"""

In [22]:
zip_code = '32754'

In [23]:
graph.run(query, zip_code=zip_code, date='2020-04-27').to_data_frame()

Unnamed: 0,date,state,county,resRatio,placeName,code,S1_ZipCode,population,cases,deaths,caseRate,deathRate
0,2020-04-27,Florida,Brevard County,0.951933,Mims,FL,32754,576808,266,8,46.115865,1.386943
1,2020-04-27,Florida,Volusia County,0.048067,Mims,FL,32754,527634,440,18,83.391139,3.411456


### Data for 2020-02-28

In [24]:
data_20200228 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, date='2020-02-28').to_data_frame() 
                           for row in df.itertuples()))

In [25]:
data_20200228.rename(columns={'cases': 'cases_2020-02-28', 'deaths': 'death_2020-02-28', 
                              'caseRate': 'caseRate_2020-02-28', 'deathRate': 'deathRate_2020-02-28'}, inplace=True)

In [26]:
data_20200228.drop(['date'], axis=1, inplace=True)

In [27]:
data_20200228.drop_duplicates(inplace=True)

In [28]:
data_20200228.shape

(213, 11)

In [29]:
data_20200228.head()

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28
0,California,Los Angeles County,7.4e-05,Adelanto,CA,92301,10098052,1,0,0.009903,0.0
0,California,Los Angeles County,1.0,West Hollywood,CA,90069,10098052,1,0,0.009903,0.0
0,Arizona,Maricopa County,1.0,Phoenix,AZ,85053,4253913,1,0,0.023508,0.0
0,California,Los Angeles County,1.0,San Gabriel,CA,91776,10098052,1,0,0.009903,0.0
0,California,Orange County,1.0,Fountain Valley,CA,92708,3164182,1,0,0.031604,0.0


### Data for 2020-04-27

In [30]:
data_20200427 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, date='2020-04-27').to_data_frame() 
                           for row in df.itertuples()))

In [31]:
data_20200427.rename(columns={'cases': 'cases_2020-04-27', 'deaths': 'death_2020-04-27', 
                              'caseRate': 'caseRate_2020-04-27', 'deathRate': 'deathRate_2020-04-27'}, inplace=True)

In [32]:
data_20200427.drop(['date'], axis=1, inplace=True)

In [33]:
data_20200427.drop_duplicates(inplace=True)

In [34]:
data_20200427.shape

(2264, 11)

In [84]:
data_20200427.head()

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27
0,California,San Bernardino County,1.0,San Bernardino,CA,92410,2135413,1772,82,82.981606,3.840007
0,Pennsylvania,Centre County,1.0,Bellefonte,PA,16823,161443,87,1,53.888989,0.619414
0,Nebraska,Lancaster County,1.0,Lincoln,NE,68512,310094,157,1,50.629809,0.322483
0,Florida,Brevard County,1.0,Melbourne,FL,32935,576808,266,8,46.115865,1.386943
0,California,San Bernardino County,0.999926,Adelanto,CA,92301,2135413,1772,82,82.981606,3.840007


### Data for 2020-08-05

In [36]:
data_20200805 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, date='2020-08-05').to_data_frame() 
                           for row in df.itertuples()))

In [37]:
data_20200805.rename(columns={'cases': 'cases_2020-08-05', 'deaths': 'death_2020-08-05', 
                              'caseRate': 'caseRate_2020-08-05', 'deathRate': 'deathRate_2020-08-05'}, inplace=True)

In [38]:
data_20200805.drop(['date'], axis=1, inplace=True)

In [39]:
data_20200805.drop_duplicates(inplace=True)

In [40]:
data_20200805.shape

(2285, 11)

In [85]:
data_20200805.head()

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05
0,California,San Bernardino County,1.0,San Bernardino,CA,92410,2135413,34237,487,1603.296412,22.805893
0,Pennsylvania,Centre County,1.0,Bellefonte,PA,16823,161443,361,10,223.608332,6.194137
0,Nebraska,Lancaster County,1.0,Lincoln,NE,68512,310094,3180,17,1025.495495,5.482209
0,Florida,Brevard County,1.0,Melbourne,FL,32935,576808,5712,132,990.277527,22.884565
0,California,San Bernardino County,0.999926,Adelanto,CA,92301,2135413,34237,487,1603.296412,22.805893


### Get 7-day averages

In [42]:
# Wave 1: Feb. 21-27 (pre-launch) versus Jan 24-Jan 30 (comparison baseline)
# Wave 2: Apr 20-26 versus Mar 23-29
# Wave 3: July 29-Aug 4 versus June 30-July 6

In [43]:
query = """
MATCH (p:PostalCode{name:$zip_code})-[i:IN]->(a2:Admin2)-[:IN]->(a1:Admin1),
      (a2)<-[:REPORTED_IN]-(c:Cases{source:'JHU'}),
      (a2)-[:HAS_DEMOGRAPHICS]->(d:Demographics{aggregationLevel: 'Admin2'}) 
      
WHERE c.date >= date($startDate) AND c.date <= date($endDate)
      
RETURN a1.name AS state, a2.name AS county, i.resRatio AS resRatio, p.placeName AS placeName, a1.code AS code,
       $zip_code AS S1_ZipCode,
       d.totalPopulation as population, avg(c.cases) AS cases, avg(c.deaths) AS deaths,
       avg(c.cases)*100000.0/d.totalPopulation AS caseRate, 
       avg(c.deaths)*100000.0/d.totalPopulation AS deathRate
"""

In [44]:
data_20200124_20200130 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-01-24', endDate='2020-01-30').to_data_frame() 
                                   for row in df.itertuples()))

In [45]:
data_20200124_20200130.rename(columns={'cases': 'avgcases_2020-01-24_2020-01-30', 'deaths': 'avgdeath_2020-01-24_2020-01-30', 
                              'caseRate': 'avgcaseRate_2020-01-24_2020-01-30', 'deathRate': 'avgdeathRate_2020-01-24_2020-01-30'}, inplace=True)

In [46]:
data_20200124_20200130.drop_duplicates(inplace=True)

In [47]:
data_20200124_20200130.shape

(164, 11)

In [48]:
data_20200221_20200227 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-02-21', endDate='2020-02-27').to_data_frame() 
                                   for row in df.itertuples()))

In [49]:
data_20200221_20200227.rename(columns={'cases': 'avgcases_2020-02-21_2020-02-27', 'deaths': 'avgdeath_2020-02-21_2020-02-27', 
                              'caseRate': 'avgcaseRate_2020-02-21_2020-02-27', 'deathRate': 'avgdeathRate_2020-02-21_2020-02-27'}, inplace=True)

In [50]:
data_20200221_20200227.drop_duplicates(inplace=True)

In [51]:
data_20200221_20200227.shape

(213, 11)

In [52]:
data_20200323_20200329 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-03-23', endDate='2020-03-29').to_data_frame() 
                                   for row in df.itertuples()))

In [53]:
data_20200323_20200329.rename(columns={'cases': 'avgcases_2020-03-23_2020-03-29', 'deaths': 'avgdeath_2020-03-23_2020-03-29', 
                              'caseRate': 'avgcaseRate_2020-03-23_2020-03-29', 'deathRate': 'avgdeathRate_2020-03-23_2020-03-29'}, inplace=True)

In [54]:
data_20200323_20200329.drop_duplicates(inplace=True)

In [55]:
data_20200323_20200329.shape

(2155, 11)

In [56]:
data_20200420_20200426 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-04-20', endDate='2020-04-26').to_data_frame() 
                                   for row in df.itertuples()))

In [57]:
data_20200420_20200426.rename(columns={'cases': 'avgcases_2020-04-20_2020-04-26', 'deaths': 'avgdeath_2020-04-20_2020-04-26', 
                              'caseRate': 'avgcaseRate_2020-04-20_2020-04-26', 'deathRate': 'avgdeathRate_2020-04-20_2020-04-26'}, inplace=True)

In [58]:
data_20200420_20200426.drop_duplicates(inplace=True)

In [59]:
data_20200420_20200426.shape

(2264, 11)

In [60]:
data_20200630_20200706 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-06-30', endDate='2020-07-06').to_data_frame() 
                                   for row in df.itertuples()))

In [61]:
data_20200630_20200706.rename(columns={'cases': 'avgcases_2020-06-30_2020-07-06', 'deaths': 'avgdeath_2020-04-20_2020-07-06', 
                              'caseRate': 'avgcaseRate_2020-06-30_2020-07-06', 'deathRate': 'avgdeathRate_2020-06-30_2020-07-06'}, inplace=True)

In [62]:
data_20200630_20200706.drop_duplicates(inplace=True)

In [63]:
data_20200630_20200706.shape

(2285, 11)

In [65]:
data_20200729_20200804 = pd.concat((graph.run(query, zip_code=row.S1_ZipCode, 
                                              startDate='2020-07-29', endDate='2020-08-04').to_data_frame() 
                                   for row in df.itertuples()))

In [66]:
data_20200729_20200804.rename(columns={'cases': 'avgcases_2020-07-29_2020-08-04', 'deaths': 'avgdeath_2020-07-29_2020-08-04', 
                              'caseRate': 'avgcaseRate_2020-07-29_2020-08-04', 'deathRate': 'avgdeathRate_2020-07-29_2020-08-04'}, inplace=True)

In [67]:
data_20200729_20200804.drop_duplicates(inplace=True)

In [68]:
data_20200729_20200804.shape

(2285, 11)

In [69]:
data_20200729_20200804.query("county == 'Los Angeles County'").head(20)

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
1,California,Los Angeles County,7.4e-05,Adelanto,CA,92301,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,West Hollywood,CA,90069,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,San Gabriel,CA,91776,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Van Nuys,CA,91401,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Lakewood,CA,90713,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,North Hollywood,CA,91605,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Van Nuys,CA,91406,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Beverly Hills,CA,90210,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Los Angeles,CA,90015,10098052,190107.0,4646.428571,1882.610626,46.013118
0,California,Los Angeles County,1.0,Monrovia,CA,91016,10098052,190107.0,4646.428571,1882.610626,46.013118


### Merge dataframes

In [70]:
data_frames = [data_20200228, data_20200427, data_20200805, data_20200124_20200130, data_20200221_20200227, 
               data_20200323_20200329, data_20200420_20200426, data_20200630_20200706, data_20200729_20200804]

In [71]:
data = reduce(lambda  left,right: pd.merge(left,right,on=['S1_ZipCode','state','county', 'resRatio', 'placeName','code', 'population'], how='outer'), data_frames).fillna(0.0)

In [72]:
data.shape

(2287, 43)

In [86]:
data.head()

Unnamed: 0,state,county,resRatio,placeName,code,S1_ZipCode,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,California,Los Angeles County,7.4e-05,Adelanto,CA,92301,10098052,1.0,0.0,0.009903,0.0,20423.0,944.0,202.246928,9.348338,198165.0,4827.0,1962.408195,47.801299,1.0,0.0,0.009903,0.0,1.0,0.0,0.009903,0.0,1142.571429,19.428571,11.314771,0.192399,17172.142857,781.714286,170.054015,7.741238,109219.0,3452.571429,1081.584844,34.19047,190107.0,4646.428571,1882.610626,46.013118
1,California,Los Angeles County,1.0,West Hollywood,CA,90069,10098052,1.0,0.0,0.009903,0.0,20423.0,944.0,202.246928,9.348338,198165.0,4827.0,1962.408195,47.801299,1.0,0.0,0.009903,0.0,1.0,0.0,0.009903,0.0,1142.571429,19.428571,11.314771,0.192399,17172.142857,781.714286,170.054015,7.741238,109219.0,3452.571429,1081.584844,34.19047,190107.0,4646.428571,1882.610626,46.013118
2,Arizona,Maricopa County,1.0,Phoenix,AZ,85053,4253913,1.0,0.0,0.023508,0.0,3457.0,122.0,81.266354,2.867948,123082.0,2204.0,2893.383104,51.811121,1.0,0.0,0.023508,0.0,1.0,0.0,0.023508,0.0,326.571429,3.285714,7.676965,0.07724,2985.571429,104.714286,70.184121,2.461599,57226.285714,848.571429,1345.262249,19.94802,118230.857143,2061.571429,2779.343563,48.462943
3,California,Los Angeles County,1.0,San Gabriel,CA,91776,10098052,1.0,0.0,0.009903,0.0,20423.0,944.0,202.246928,9.348338,198165.0,4827.0,1962.408195,47.801299,1.0,0.0,0.009903,0.0,1.0,0.0,0.009903,0.0,1142.571429,19.428571,11.314771,0.192399,17172.142857,781.714286,170.054015,7.741238,109219.0,3452.571429,1081.584844,34.19047,190107.0,4646.428571,1882.610626,46.013118
4,California,Orange County,1.0,Fountain Valley,CA,92708,3164182,1.0,0.0,0.031604,0.0,2126.0,39.0,67.189561,1.232546,38131.0,665.0,1205.082388,21.01649,1.0,0.0,0.031604,0.0,1.0,0.0,0.031604,0.0,267.857143,1.857143,8.465289,0.058693,1833.571429,35.571429,57.947723,1.12419,15718.0,356.285714,496.747659,11.259963,36764.142857,630.428571,1161.884584,19.923904


### Merge

In [74]:
df_loc_cases = df_loc.merge(data, on=['S1_ZipCode', 'state', 'county', 'resRatio', 'placeName', 'resRatio', 'code', 'population'], how='left')

In [75]:
df_loc_cases.shape

(3483, 49)

In [87]:
df_loc_cases.head()

Unnamed: 0,S1_ZipCode,S1_StartDate,S1_EndDate,S2_StartDate,S2_EndDate,S3_StartDate,S3_EndDate,state,county,resRatio,placeName,code,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
0,92410,2/28/2020 18:16:18,2/28/2020 18:40:40,4/29/2020 16:35:35,4/29/2020 17:01:46,8/5/2020 12:22:15,8/5/2020 12:46:30,California,San Bernardino County,1.0,San Bernardino,CA,2135413,0.0,0.0,0.0,0.0,1772.0,82.0,82.981606,3.840007,34237.0,487.0,1603.296412,22.805893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.428571,1.714286,2.314708,0.080279,1579.428571,73.285714,73.963611,3.431922,13723.857143,264.571429,642.679292,12.389708,32764.571429,425.142857,1534.343541,19.909163
1,16823,2/28/2020 18:10:08,2/28/2020 18:31:15,4/27/2020 15:51:59,4/27/2020 16:11:09,8/9/2020 5:07:28,8/9/2020 5:32:32,Pennsylvania,Centre County,1.0,Bellefonte,PA,161443,0.0,0.0,0.0,0.0,87.0,1.0,53.888989,0.619414,361.0,10.0,223.608332,6.194137,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.714286,0.0,6.636575,0.0,77.428571,1.571429,47.960315,0.973364,216.285714,7.285714,133.970327,4.512871,358.285714,10.0,221.927067,6.194137
2,68512,2/28/2020 18:36:48,2/28/2020 18:53:21,4/27/2020 15:38:01,4/27/2020 15:55:20,8/5/2020 12:10:10,8/5/2020 12:26:08,Nebraska,Lancaster County,1.0,Lincoln,NE,310094,0.0,0.0,0.0,0.0,157.0,1.0,50.629809,0.322483,3180.0,17.0,1025.495495,5.482209,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.967449,0.0,107.714286,1.0,34.736011,0.322483,1794.285714,12.0,578.626389,3.869794,3052.714286,14.714286,984.448034,4.745105
3,32935,2/28/2020 18:53:58,2/28/2020 19:20:14,,,,,Florida,Brevard County,1.0,Melbourne,FL,576808,0.0,0.0,0.0,0.0,266.0,8.0,46.115865,1.386943,5712.0,132.0,990.277527,22.884565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.714286,0.0,2.724353,0.0,235.857143,7.571429,40.890061,1.312643,2266.571429,18.0,392.950762,3.120622,5486.428571,112.714286,951.170679,19.541041
5,92301,2/28/2020 18:28:28,2/28/2020 19:23:54,4/27/2020 15:29:31,4/27/2020 15:57:35,,,California,San Bernardino County,0.999926,Adelanto,CA,2135413,0.0,0.0,0.0,0.0,1772.0,82.0,82.981606,3.840007,34237.0,487.0,1603.296412,22.805893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.428571,1.714286,2.314708,0.080279,1579.428571,73.285714,73.963611,3.431922,13723.857143,264.571429,642.679292,12.389708,32764.571429,425.142857,1534.343541,19.909163


In [77]:
df_loc_cases.shape

(3483, 49)

In [78]:
df_loc_cases.drop_duplicates(inplace=True)

In [79]:
df_loc_cases.shape

(2625, 49)

In [80]:
df_loc_cases.to_csv("COVID_waves1to3_zipcodefocused_JHU_20201003.csv", index=False)

In [82]:
df_loc_cases.query("S1_ZipCode == '92301'")

Unnamed: 0,S1_ZipCode,S1_StartDate,S1_EndDate,S2_StartDate,S2_EndDate,S3_StartDate,S3_EndDate,state,county,resRatio,placeName,code,population,cases_2020-02-28,death_2020-02-28,caseRate_2020-02-28,deathRate_2020-02-28,cases_2020-04-27,death_2020-04-27,caseRate_2020-04-27,deathRate_2020-04-27,cases_2020-08-05,death_2020-08-05,caseRate_2020-08-05,deathRate_2020-08-05,avgcases_2020-01-24_2020-01-30,avgdeath_2020-01-24_2020-01-30,avgcaseRate_2020-01-24_2020-01-30,avgdeathRate_2020-01-24_2020-01-30,avgcases_2020-02-21_2020-02-27,avgdeath_2020-02-21_2020-02-27,avgcaseRate_2020-02-21_2020-02-27,avgdeathRate_2020-02-21_2020-02-27,avgcases_2020-03-23_2020-03-29,avgdeath_2020-03-23_2020-03-29,avgcaseRate_2020-03-23_2020-03-29,avgdeathRate_2020-03-23_2020-03-29,avgcases_2020-04-20_2020-04-26,avgdeath_2020-04-20_2020-04-26,avgcaseRate_2020-04-20_2020-04-26,avgdeathRate_2020-04-20_2020-04-26,avgcases_2020-06-30_2020-07-06,avgdeath_2020-04-20_2020-07-06,avgcaseRate_2020-06-30_2020-07-06,avgdeathRate_2020-06-30_2020-07-06,avgcases_2020-07-29_2020-08-04,avgdeath_2020-07-29_2020-08-04,avgcaseRate_2020-07-29_2020-08-04,avgdeathRate_2020-07-29_2020-08-04
5,92301,2/28/2020 18:28:28,2/28/2020 19:23:54,4/27/2020 15:29:31,4/27/2020 15:57:35,,,California,San Bernardino County,0.999926,Adelanto,CA,2135413,0.0,0.0,0.0,0.0,1772.0,82.0,82.981606,3.840007,34237.0,487.0,1603.296412,22.805893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49.428571,1.714286,2.314708,0.080279,1579.428571,73.285714,73.963611,3.431922,13723.857143,264.571429,642.679292,12.389708,32764.571429,425.142857,1534.343541,19.909163
6,92301,2/28/2020 18:28:28,2/28/2020 19:23:54,4/27/2020 15:29:31,4/27/2020 15:57:35,,,California,Los Angeles County,7.4e-05,Adelanto,CA,10098052,1.0,0.0,0.009903,0.0,20423.0,944.0,202.246928,9.348338,198165.0,4827.0,1962.408195,47.801299,1.0,0.0,0.009903,0.0,1.0,0.0,0.009903,0.0,1142.571429,19.428571,11.314771,0.192399,17172.142857,781.714286,170.054015,7.741238,109219.0,3452.571429,1081.584844,34.19047,190107.0,4646.428571,1882.610626,46.013118
