# Covid-19 Data Aggregation

This notebook accompanies ongoing research on Covid-19 and human mobility. It preprocesses Covid-19 data: it takes the cumulative county-level counts published by the New York Times and converts them to counts at the Metropolitan Statistical Area (MSA) level for further study.

## Notebook Outline
- cumulative to daily
- transpose
- aggregate

In [1]:
import sys
import argparse
import os
import pandas as pd
import datetime as dt
from datetime import timedelta
import numpy as np
import json
import geopandas as gpd

In [7]:
# Study window: edit these two dates to choose the first and the last day
# that should be kept.
beginDate = dt.datetime(year=2020, month=2, day=14)
endDate = dt.datetime(year=2021, month=2, day=7)

# Input tables: the cumulative county-level counts and the MSA-to-county
# reference table.
inputCovid = './data/Covid-19 cumulative.csv'
inputMetro = './data/metro_county.csv'

# Length of one aggregation interval in days
# (1 gives daily cases, 7 gives weekly cases).
interval = 1

In [9]:
# input cumulative data
data = pd.read_csv(inputCovid)

# Order the rows by county, state and then chronologically. The cumulative-to-
# daily step below subtracts each row from the row directly above it, so every
# county's rows must be consecutive AND in date order. Sorting by county/state
# alone only gives that when the input file already happens to be date-ordered.
# The dates are strings such as "3/19/2020", so they are parsed into a
# temporary column for sorting and the original 'date' column is left as-is.
data = (
    data.assign(_sort_date=pd.to_datetime(data['date']))
    .sort_values(by=['county', 'state', '_sort_date'])
    .drop(columns='_sort_date')
    .reset_index(drop=True)  # row labels 0..n-1, needed for positional look-back
)

# `copy` keeps the untouched cumulative values while `data` is converted to
# daily counts. It is a real copy, not a second name for the same frame.
copy = data.copy()

# The two snapshot files are still written so that anything reading them keeps
# working; the frames themselves no longer need to be re-read from disk.
data.to_csv("./data/daily count.csv", index = False)
copy.to_csv("./data/daily count_copy.csv", index = False)

data.head()

Unnamed: 0,date,county,state,fips,cases,deaths
5000,3/19/2020,Abbeville,South Carolina,45001.0,1,0.0
5873,3/20/2020,Abbeville,South Carolina,45001.0,1,0.0
6877,3/21/2020,Abbeville,South Carolina,45001.0,1,0.0
7997,3/22/2020,Abbeville,South Carolina,45001.0,1,0.0
9208,3/23/2020,Abbeville,South Carolina,45001.0,1,0.0


In [None]:
# Read the MSA-to-county reference table. `metro_initial` is a second name for
# the table exactly as loaded; `metro` is re-bound to merged versions later on.
metro_initial = metro = pd.read_csv(inputMetro)
metro.head()

In [None]:
# input US state abbreviations reference table
# (its 'Abbreviation' and 'State' columns are used in the aggregate section to
# replace the state abbreviations of the MSA table with full state names)
state_abbr = pd.read_csv('./data/us states abbreviations.csv')

### cumulative to daily
This section converts cumulative counts to daily counts. The daily count for a given day is the cumulative count on that day minus the cumulative count on the previous day for the same county; if the cumulative count ever decreases, the daily count is set to 0.

In [None]:
# Convert cumulative counts to daily counts, vectorized instead of visiting
# every row with scalar `.at` look-ups.
# Both `data` and `copy` are expected to carry the same 0..n-1 row labels and
# to be ordered so that each county's rows are consecutive.
data['fips'] = data['fips'].fillna(-1)

# True where the row directly above belongs to the same county and state.
# The first row compares against a missing value and is therefore False.
same_place = (
    (data['county'] == data['county'].shift())
    & (data['state'] == data['state'].shift())
)

for column in ('cases', 'deaths'):
    # Cumulative total of the row above, taken from the untouched copy.
    previous_total = copy[column].shift()
    # A drop in the cumulative total would give a negative daily count;
    # those are reported as 0.
    daily = (data[column] - previous_total).clip(lower=0)
    # The first row of every county keeps its cumulative value as its daily
    # value; the cast restores the column's original dtype after the shift.
    data[column] = data[column].where(~same_place, daily).astype(data[column].dtype)

data = data.astype({'fips': int})
data["date"] = pd.to_datetime(data["date"])
data = data.sort_values(by = ['date'])


### aggregate
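This section aggregates the daily county-level counts to the MSA level: counties are matched to their MSA, rows outside the chosen date range are dropped, and cases and deaths are summed per MSA for every interval.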

In [None]:
# Join the MSA table with the state table on the state abbreviation, then
# overwrite the abbreviation column with the full state name.
metro = pd.merge(
    metro,
    state_abbr,
    how='inner',
    left_on='states_msa',
    right_on='Abbreviation',
)
metro = metro.assign(states_msa=metro['State'])


In [None]:
# merge again with the covid data
# `data` is the frame of daily county-level counts built in the
# "cumulative to daily" section. This cell used to reference `covid`, a name
# that is never assigned in this notebook.
merged = metro.merge(data, how='inner',
                     left_on=["states_msa", "name10_county"],
                     right_on=["state", "county"])

# Keep only the columns needed downstream. `.copy()` makes the result an
# independent frame so that later column assignments do not act on a slice.
merged = merged[['date', 'county', 'cases',
                 'deaths', 'name_msa', 'states_msa_code', 'states_msa', 'states_msa_full',
                 'geoid_msa']].copy()


In [None]:
# filter out any rows with dates not in the range
# The bounds are the `beginDate` / `endDate` parameters defined at the top of
# the notebook (both inclusive). This cell used to reference `startdate` and
# `enddate`, which are never assigned.
merged = merged.assign(date=pd.to_datetime(merged["date"]))
merged = merged[(merged["date"] >= beginDate) & (merged["date"] <= endDate)]


In [None]:

# Sum cases and deaths per MSA for every interval of `interval` days between
# `beginDate` and `endDate`, and write one row per (MSA, interval) to
# ./data/output.csv.
# This cell used to reference `startdate` / `enddate`, which are never
# assigned; the parameters are called `beginDate` / `endDate`.
if interval < 1:
    # A zero or negative step would never advance the window below.
    raise ValueError("interval must be a positive number of days")

iterate_start = beginDate

interval_data = merged
interval_frames = []

while iterate_start <= endDate:
    # Half-open window [iterate_start, iterate_end)
    iterate_end = iterate_start + timedelta(days=interval)

    in_window = (interval_data['date'] >= iterate_start) & (interval_data['date'] < iterate_end)
    eachInterval = interval_data[in_window]

    # Column selection on a groupby needs a list; the bare tuple form
    # ['cases', 'deaths'] is rejected by current pandas.
    eachInterval = eachInterval.groupby('name_msa')[['cases', 'deaths']].sum().reset_index()

    # Re-attach the MSA attributes from the reference table as loaded.
    eachInterval = eachInterval.merge(metro_initial, on='name_msa')[['states_msa_code', 'states_msa',
                                'states_msa_full', "geoid_msa",
                               'name_msa', 'cases', 'deaths']].sort_values(by = 'states_msa_code')
    eachInterval['interval_start'] = iterate_start
    # The reference table has one row per county, so the merge repeats every
    # MSA once per county; keep a single row per MSA.
    eachInterval = eachInterval.drop_duplicates(subset=['name_msa'])

    interval_frames.append(eachInterval)
    iterate_start = iterate_end

# DataFrame.append no longer exists in current pandas; concatenate once instead.
output = pd.concat(interval_frames) if interval_frames else pd.DataFrame()

output.to_csv("./data/output.csv", index = False)


In [None]:
"""

This function is for transposing the output.csv file to output the data with each
date being a column

Args:
    input_covid: the output from aggregate function in csv format
                
Returns:
    the reformatted cases and deaths data at MSA level in the US

"""



def transform_MSA(input_covid, output_dir="./data"):
    """Reshape the long MSA table into one wide table each for cases and deaths.

    The input has one row per (MSA, interval). Each output has one row per MSA
    and one column per interval start date, preceded by 'geoid' and 'name'.

    Values are placed under the date they belong to. An MSA with no row for a
    date gets 0 for that date, wherever the gap is. Earlier versions padded
    zeros at the front only, which shifted values under the wrong dates when a
    gap was not at the start. If an (MSA, date) pair occurs more than once,
    the first occurrence is used.

    Args:
        input_covid: path of a CSV file with the columns 'name_msa',
            'geoid_msa', 'interval_start', 'cases' and 'deaths'.
        output_dir: directory that receives 'output_cases.csv' and
            'output_deaths.csv'. Defaults to './data'.

    Returns:
        (output_cases, output_deaths): the two wide DataFrames. MSAs and dates
        appear in the order in which they first occur in the input.

    Raises:
        ValueError: if a 'deaths' value cannot be converted to int
            (for example a missing value).
    """
    input_df = pd.read_csv(input_covid)

    # dict.fromkeys removes repeats while keeping first-appearance order.
    msa_names = list(dict.fromkeys(input_df['name_msa']))
    dates = list(dict.fromkeys(input_df['interval_start']))

    geoid_of = {}
    cases_at = {}
    deaths_at = {}
    records = zip(input_df['name_msa'], input_df['geoid_msa'],
                  input_df['interval_start'], input_df['cases'],
                  input_df['deaths'])
    for name, geoid, date, cases, deaths in records:
        geoid_of.setdefault(name, geoid)
        cases_at.setdefault((name, date), cases)
        # Deaths may be stored as floats (e.g. 3.0); report whole numbers.
        deaths_at.setdefault((name, date), int(deaths))

    def to_wide(value_at):
        # One row per MSA, one column per date; 0 where no value was recorded.
        rows = [[value_at.get((name, date), 0) for date in dates]
                for name in msa_names]
        wide = pd.DataFrame(rows, columns=dates)
        wide.insert(0, 'geoid', [geoid_of[name] for name in msa_names])
        wide.insert(1, 'name', msa_names)
        return wide

    output_cases = to_wide(cases_at)
    output_deaths = to_wide(deaths_at)

    output_cases.to_csv(os.path.join(output_dir, "output_cases.csv"), index=False)
    output_deaths.to_csv(os.path.join(output_dir, "output_deaths.csv"), index=False)

    return output_cases, output_deaths



In [None]:
"""

This function is for transposing the global covid-19 data to output the data
with each date being a column

Args:
    input_world: path of the global covid-19 data, e.g. ./data/covid_world.csv
    begindate: first date (inclusive) to keep
    enddate: last date (inclusive) to keep
                
Returns:
    the reformatted cases and deaths data at country level

"""


def transform_world(input_world, begindate, enddate, geoid_path='./data/geoid.csv'):
    """Reshape the long country-level table into wide cases and deaths tables.

    Each output has one row per country (identified by 'iso_code') and one
    column per date, preceded by 'geoid' and 'name'.

    Changes from the earlier implementation:
      * values are placed under the date they belong to, and a country with no
        row for a date gets NaN there (before, values were written left to
        right regardless of date);
      * the date filter is a single mask instead of dropping rows one by one;
      * a leftover loop that split every date on '/' and discarded the result
        is gone (it failed on dates without slashes);
      * the geoid table path is a parameter.
    If an (iso_code, date) pair occurs more than once, the first occurrence is
    used.

    Args:
        input_world: path of a CSV file with the columns 'iso_code', 'date',
            'continent', 'location', 'new_cases' and 'new_deaths'.
        begindate: first date (inclusive) to keep.
        enddate: last date (inclusive) to keep.
        geoid_path: path of a CSV file with the columns
            'Location (Short Name)' and
            'Geographical location identifier (decimal)'.
            Defaults to './data/geoid.csv'.

    Returns:
        (output_cases, output_deaths): the two wide DataFrames. Countries and
        dates appear in the order in which they first occur; the date column
        labels are pandas Timestamps.
    """
    geoid_column = 'Geographical location identifier (decimal)'

    df = pd.read_csv(input_world)
    geoid_table = pd.read_csv(geoid_path)

    # Only locations that appear in the geoid table are kept (inner join).
    df = df.merge(geoid_table, how='inner',
                  left_on=['location'],
                  right_on=['Location (Short Name)'])
    df = df[[geoid_column, 'iso_code', 'date', 'continent', 'location',
             'new_cases', 'new_deaths']]
    df = df.assign(date=pd.to_datetime(df['date']))

    # Same condition as before: a row is removed when its date lies outside
    # [begindate, enddate].
    outside = (df['date'] < begindate) | (df['date'] > enddate)
    df = df[~outside]

    # First row of every country supplies its geoid and display name.
    first_rows = df.drop_duplicates(subset='iso_code')
    iso_codes = list(first_rows['iso_code'])
    geoids = list(first_rows[geoid_column])
    names = list(first_rows['location'])
    dates = list(dict.fromkeys(df['date']))

    cases_at = {}
    deaths_at = {}
    records = zip(df['iso_code'], df['date'], df['new_cases'], df['new_deaths'])
    for iso, date, new_cases, new_deaths in records:
        cases_at.setdefault((iso, date), new_cases)
        deaths_at.setdefault((iso, date), new_deaths)

    missing = float('nan')

    def to_wide(value_at):
        # One row per country, one column per date; NaN where nothing was reported.
        rows = [[value_at.get((iso, date), missing) for date in dates]
                for iso in iso_codes]
        wide = pd.DataFrame(rows, columns=dates)
        wide.insert(0, 'geoid', geoids)
        wide.insert(1, 'name', names)
        return wide

    output_cases = to_wide(cases_at)
    output_deaths = to_wide(deaths_at)

    return output_cases, output_deaths
    
    #output_cases.to_csv("./data/output_cases_world.csv", index = False)
    #output_deaths.to_csv("./data/output_deaths_world.csv", index = False)
    
    

In [None]:
"""

This function is for merging the MSA level data in the US with the country level data by calling both the transform function
for the MSA level data and for the country level data

Args:
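    begindate: first date (inclusive) of the country-level data to keep
    enddate: last date (inclusive) of the country-level data to keep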
                
Returns:
    None. The output will be written to './data/output_cases.csv' and './data/output_deaths.csv'

"""
def merge(begindate, enddate):
    """Stack the country-level tables under the MSA-level tables and save them.

    Calls transform_MSA on './data/output.csv' and transform_world on
    './data/covid_world.csv', then adds every country row to the end of the
    matching MSA table. Rows are copied by position as plain lists, so the
    world tables must have as many columns as the MSA tables.

    Args:
        begindate: passed through to transform_world.
        enddate: passed through to transform_world.

    Returns:
        None. The combined tables are written to './data/output_cases.csv'
        and './data/output_deaths.csv'.
    """
    msa_cases, msa_deaths = transform_MSA('./data/output.csv')
    world_cases, world_deaths = transform_world('./data/covid_world.csv', begindate, enddate)

    table_pairs = ((msa_cases, world_cases), (msa_deaths, world_deaths))
    for msa_table, world_table in table_pairs:
        for row_number in range(world_table.shape[0]):
            # The row count is re-read on every pass, so it already includes
            # the rows added so far.
            new_label = msa_table.shape[0] + row_number
            msa_table.loc[new_label] = list(world_table.loc[row_number])

    msa_cases.to_csv("./data/output_cases.csv", index = False)
    msa_deaths.to_csv("./data/output_deaths.csv", index = False)

In [None]:
"""

This function is for converting the output in csv format to js format 

"""

def convert_to_js(param):
    """Write a date-filtered CSV table to './data/test.js' as a JavaScript array.

    The output file assigns an array to ``GEO_VARIABLES``: the first entry is
    the list of column headers, followed by one entry per table row.

    Args:
        param: dict with the keys
            'Disease'    - path of the CSV file to convert. Its first two
                           columns are always kept; every other column header
                           is parsed as a date with str_to_date.
            'begin_date' - first date (inclusive) to keep, in a format
                           accepted by str_to_date.
            'end_date'   - last date (inclusive) to keep, same format.
            'shapefile'  - path of a file that is opened and parsed as JSON.

    Returns:
        None.

    Raises:
        KeyError: if one of the keys above is missing from `param`.
    """
    disease = param['Disease']
    begin_text = param['begin_date']
    end_text = param['end_date']
    shapefile = param['shapefile']

    # NOTE(review): the parsed content is not used anywhere below. The load is
    # kept so that a missing or malformed file still stops the conversion as
    # before - confirm whether it can be removed.
    with open(shapefile, errors='replace') as f:
        json.load(f)

    first_day = str_to_date(begin_text)
    last_day = str_to_date(end_text)

    df = pd.read_csv(disease)

    # Everything after the first two (identifier) columns is a date column.
    date_columns = list(df.columns)[2:]
    outside_range = [
        column for column in date_columns
        if not (first_day <= str_to_date(column) <= last_day)
    ]
    # `columns=` instead of the positional axis argument (`df.drop(column, 1)`),
    # which current pandas no longer accepts.
    df = df.drop(columns=outside_range)

    heading = list(df.columns)
    # The context manager closes the file even if serialising a row fails.
    with open('./data/test.js', 'w') as ofile:
        ofile.write('var GEO_VARIABLES =\n')
        ofile.write('[\n')
        ofile.write('  ' + json.dumps(heading) + ',\n')
        for _, row in df.iterrows():
            ofile.write('  ' + json.dumps(list(row)) + ',\n')
        ofile.write(']\n')
    
    
    

In [None]:
"""

Helper function for converting a string to datetime. The string must be in the format "year-month-day", e.g. "2020-2-14".


"""

def str_to_date(input_string):
    """Convert a dash-separated date string to a datetime at midnight.

    Two layouts are accepted:
      * "year-month-day", e.g. "2020-2-14" or "2020-02-14"
      * "month-day-year", e.g. "2-14-2020"
    The second layout is recognised by its third component being larger than
    31, which can never be a day of the month. Such strings used to fail, so
    every string that was accepted before gives the same result as before.

    Args:
        input_string: the date text.

    Returns:
        The corresponding dt.datetime.

    Raises:
        ValueError: if a component is not an integer or the date is invalid.
        IndexError: if there are fewer than three components.
    """
    parts = [int(piece) for piece in input_string.split('-')]
    year, month, day = parts[0], parts[1], parts[2]

    if day > 31:
        # The third component is a year, so the layout is month-day-year.
        year, month, day = parts[2], parts[0], parts[1]

    return dt.datetime(year, month, day)

In [None]:


    
# Driver cell. The live call used to be indented although it is not inside any
# block, which is an IndentationError; it now starts at column 0.

# Earlier pipeline steps, kept for reference (function-based version):
# dailyCount = cumulativeToDaily(inputFile)
# dailyCount.to_csv("./data/daily count.csv", index = False)
# inputCovid = pd.read_csv('./data/daily count.csv')
# aggregate(inputCovid, inputMetro, beginDate, endDate, interval)

# Reshape the aggregated MSA table into one column per date.
transform_MSA('./data/output.csv')

# Optional follow-up steps:
# merge(beginDate, endDate)
#
# param = {
#     'Disease': './data/output_deaths.csv',
#     'begin_date': "2020-02-22",
#     'end_date': "2020-06-10",
#     'shapefile': "./shp/world_region.shp",
# }
#
# convert_to_js(param)

