In [303]:
import sys
import argparse
import os
import pandas as pd
import datetime as dt
from datetime import timedelta
import numpy as np

In [304]:
"""

This function converts the cumulative counts to daily counts; the calling script
then writes the returned daily counts to the data folder

Args:
    inputFile: path to the county-level cumulative counts collected from the New York Times,
                e.g. './data/Covid-19 cumulative.csv'
                
Returns:
    the daily counts as a DataFrame

"""


def cumulativeToDaily(inputFile):
    """Convert county-level cumulative counts into per-day counts.

    Rows are grouped by (county, state) and ordered by date; within each
    group the 'cases' and 'deaths' values are replaced by the increase over
    the previous row. A decrease in the cumulative value (a downward
    revision in the source data) is reported as 0 rather than as a negative
    count. The first row of every group keeps its cumulative value.

    This function does not write any file; the caller decides where the
    result is stored.

    Args:
        inputFile: path to a CSV file with at least the columns 'date',
            'county', 'state', 'fips', 'cases' and 'deaths'.

    Returns:
        A DataFrame with the same columns, 'date' parsed to datetime,
        missing 'fips' values replaced by -1 (and the column cast to int),
        sorted by date.
    """
    data = pd.read_csv(inputFile)
    data["date"] = pd.to_datetime(data["date"])

    # Order rows so that each county's history is contiguous and
    # chronological; reset the index so positional neighbours are adjacent.
    data = data.sort_values(by=['county', 'state', 'date'])
    data = data.reset_index(drop=True)
    data['fips'] = data['fips'].fillna(-1)

    # True where the previous row belongs to the same county and state.
    # The first row (and any row with a missing key) compares as False.
    same_county = (data['county'].eq(data['county'].shift())
                   & data['state'].eq(data['state'].shift()))

    for column in ('cases', 'deaths'):
        cumulative = data[column]
        # Clamp at zero so downward revisions do not become negative counts.
        increase = (cumulative - cumulative.shift()).clip(lower=0)
        daily = cumulative.where(~same_county, increase)
        # shift() promotes integers to float; restore the original dtype.
        data[column] = daily.astype(cumulative.dtype)

    data = data.astype({'fips': int})
    # mergesort is stable, so rows of the same date keep county order.
    return data.sort_values(by=['date'], kind='mergesort')

In [305]:
"""

This function aggregates Covid-19 counts from the county level to the MSA level

Args:
    covid: the daily counts as a DataFrame
    metro: path to a reference table in csv format listing all the MSAs and the counties included in them
    startdate: first date (datetime) to include in the output
    enddate: last date (datetime) to include in the output
    interval: number of days in each output interval, e.g. 1 gives the daily count
                and 7 gives a weekly count. Default value is 1
                
Returns:
    None. The output is written to ./data/output.csv

"""


def aggregate(covid, metro, startdate, enddate, interval = 1):
    """Aggregate county-level daily counts to MSA level per time interval.

    Counties are matched to MSAs through the metro reference table (state
    abbreviations are expanded to full names via
    './data/us states abbreviations.csv' so they match the 'state' column
    of ``covid``). Cases and deaths are then summed per MSA over
    consecutive, non-overlapping windows of ``interval`` days starting at
    ``startdate``. Each window covers ``[window_start, window_start +
    interval)``, so a date is counted in exactly one window.

    Args:
        covid: DataFrame of daily counts with the columns 'date', 'county',
            'state', 'cases' and 'deaths'.
        metro: path to the MSA/county reference CSV with the columns
            'states_msa', 'name10_county', 'name_msa', 'states_msa_code',
            'states_msa_full' and 'geoid_msa'.
        startdate: first date (datetime) to include.
        enddate: last date (datetime) to include.
        interval: window length in days; 1 gives daily totals, 7 weekly
            totals.

    Returns:
        None. The result is written to './data/output.csv' with one row per
        MSA and window; 'interval_start' holds the first day of the window.

    Raises:
        ValueError: if ``interval`` is not positive (the window would never
            advance).
    """
    if interval <= 0:
        raise ValueError("interval must be a positive number of days")

    metro_initial = pd.read_csv(metro)
    state_abbr = pd.read_csv('./data/us states abbreviations.csv')

    # Replace the state abbreviation with the full state name for matching.
    metro_named = metro_initial.merge(state_abbr, how='inner',
                                      left_on='states_msa',
                                      right_on='Abbreviation')
    metro_named['states_msa'] = metro_named['State']

    merged = metro_named.merge(covid, how='inner',
                               left_on=["states_msa", "name10_county"],
                               right_on=["state", "county"])
    # copy() so the date conversion below does not write into a slice.
    merged = merged[['date', 'county', 'cases',
                     'deaths', 'name_msa', 'states_msa_code', 'states_msa',
                     'states_msa_full', 'geoid_msa']].copy()
    merged["date"] = pd.to_datetime(merged["date"])
    merged = merged[(merged['date'] >= startdate) & (merged['date'] <= enddate)]

    # One descriptive row per MSA, taken from the unmodified reference
    # table (so 'states_msa' keeps the abbreviation). For an MSA listed
    # with several state codes the lowest code wins.
    msa_columns = ['states_msa_code', 'states_msa', 'states_msa_full',
                   'geoid_msa', 'name_msa']
    msa_info = (metro_initial[msa_columns]
                .sort_values(by='states_msa_code', kind='mergesort')
                .drop_duplicates(subset=['name_msa']))
    output_columns = msa_columns + ['cases', 'deaths']

    step = timedelta(days=interval)
    frames = []
    window_start = startdate
    while window_start <= enddate:
        window_end = window_start + step
        # Half-open window: the end day belongs to the next window.
        in_window = merged[(merged['date'] >= window_start)
                           & (merged['date'] < window_end)]
        if not in_window.empty:
            totals = in_window.groupby('name_msa', as_index=False)[['cases', 'deaths']].sum()
            totals = totals.merge(msa_info, on='name_msa')[output_columns]
            totals = totals.sort_values(by='states_msa_code', kind='mergesort')
            totals['interval_start'] = window_start
            frames.append(totals)
        window_start = window_end

    if frames:
        output = pd.concat(frames, ignore_index=True)
    else:
        # Nothing matched: still write a header-only file.
        output = pd.DataFrame(columns=output_columns + ['interval_start'])

    output.to_csv("./data/output.csv", index = False)


In [306]:
"""

This function reshapes the output.csv file so that each MSA is a row and each
interval start date is a column

Args:
    input_covid: path to the csv file written by the aggregate function
                
Returns:
    None. The output is written to './data/output_cases.csv' and './data/output_deaths.csv'

"""



def _wide_by_interval(rows, value_column, msa_order, date_order, geoids):
    """Return one row per MSA and one column per interval start for value_column."""
    totals = rows.groupby(['name_msa', 'interval_start'], sort=False)[value_column].sum()
    # fill_value=0: an MSA with no row for an interval reports 0 for exactly
    # that interval instead of shifting its other values.
    wide = totals.unstack('interval_start', fill_value=0)
    # unstack sorts both axes; restore the order of first appearance.
    wide = wide.reindex(index=msa_order, columns=date_order)
    wide.index.name = 'name_msa'
    wide.columns.name = None
    wide = wide.reset_index()
    wide.insert(1, 'geoid_msa', geoids)
    return wide


def transform(input_covid):
    """Reshape the long per-interval MSA table into wide cases/deaths tables.

    The input has one row per (MSA, interval). Two files are written, each
    with one row per MSA, the columns 'name_msa' and 'geoid_msa', and then
    one column per distinct 'interval_start' value. MSAs and dates appear
    in the order in which they first occur in the input. An MSA without a
    row for some interval gets 0 in that interval's column. Each MSA's
    'geoid_msa' is taken from its first row. Rows with a missing
    'name_msa' or 'interval_start' are ignored.

    Args:
        input_covid: path to a CSV file with the columns 'name_msa',
            'geoid_msa', 'interval_start', 'cases' and 'deaths'.

    Returns:
        None. The output is written to './data/output_cases.csv' and
        './data/output_deaths.csv'.
    """
    input_df = pd.read_csv(input_covid)
    rows = input_df.dropna(subset=['name_msa', 'interval_start'])

    first_rows = rows.drop_duplicates(subset=['name_msa'])
    msa_order = first_rows['name_msa'].tolist()
    geoids = first_rows['geoid_msa'].tolist()
    date_order = rows['interval_start'].drop_duplicates().tolist()

    output_cases = _wide_by_interval(rows, 'cases', msa_order, date_order, geoids)
    output_deaths = _wide_by_interval(rows, 'deaths', msa_order, date_order, geoids)

    output_cases.to_csv("./data/output_cases.csv", index = False)
    output_deaths.to_csv("./data/output_deaths.csv", index = False)

In [307]:
if __name__ == "__main__":

    # Input locations: the cumulative county counts and the MSA/county
    # reference table.
    inputFile = './data/Covid-19 cumulative.csv'
    inputMetro = './data/metro_county.csv'

    # Reporting window and the number of days summed into each output row.
    # Edit these three values to change what is produced.
    beginDate = dt.datetime(year=2020, month=2, day=14)
    endDate = dt.datetime(year=2020, month=6, day=11)
    interval = 1

    # Step 1: cumulative -> daily counts, stored on disk and read back so
    # the aggregation step works on the stored file's contents.
    dailyCount = cumulativeToDaily(inputFile)
    dailyCount.to_csv("./data/daily count.csv", index=False)
    inputCovid = pd.read_csv("./data/daily count.csv")

    # Step 2: county -> MSA totals per interval (writes ./data/output.csv).
    aggregate(
        covid=inputCovid,
        metro=inputMetro,
        startdate=beginDate,
        enddate=endDate,
        interval=interval,
    )

    # Step 3: reshape the aggregated file into one column per date.
    transform(input_covid='./data/output.csv')


