In [1]:
import os
import glob
import json
import yaml
import pandas as pd
import requests

import time
import datetime

In [2]:
from tqdm.auto import tqdm
# Enable tqdm's pandas integration. NOTE(review): the cells visible here only
# call tqdm(...) directly, so this registration may be unnecessary -- confirm.
tqdm.pandas()

In [3]:
# Read notebook settings from config.yml (relative to the working directory).
# The parsed mapping is kept in `cfg`; the GitHub access token is read from it below.
with open("config.yml", 'r') as ymlfile:
    cfg = yaml.safe_load(ymlfile)

In [4]:
# Build the Authorization header that is sent with the GitHub API requests.
# The token comes from config.yml, so it is not stored in the notebook itself.
token = cfg['access_token']
headers = {'Authorization': 'token ' + token}

In [5]:
# Maps each target name to the time-series CSV file to fetch for it.
# The keys ('deaths', 'cases') are also embedded in the saved file names.
file_dict = {'deaths': 'time_series_covid19_deaths_US.csv',
             'cases': 'time_series_covid19_confirmed_US.csv'}

In [6]:
# Location lookup table; process_file() merges its `location` and
# `location_name` columns onto the JHU data.
fips = pd.read_csv('data/locations.csv')
# Keep only rows whose location code has at most two characters
# (presumably national/state-level codes rather than counties --
# TODO confirm against data/locations.csv).
fips = fips[fips.location.str.len() <= 2]

In [7]:
RAW_DIR = 'data/JHU/raw/'
COMMITS_URL = 'https://api.github.com/repos/CSSEGISandData/COVID-19/commits'
SERIES_DIR = 'csse_covid_19_data/csse_covid_19_time_series/'


def _fetch_all_commits(file_name):
    """Return every commit that modified `file_name` in the JHU repository.

    Pages through the GitHub "list commits" endpoint until an empty page is
    returned.

    Parameters
    ----------
    file_name : str
        Name of the CSV inside the time-series directory of the repository.

    Returns
    -------
    list of dict
        The commit objects in the order the API returned them.

    Raises
    ------
    requests.HTTPError
        If any page request fails (e.g. bad token or rate limit). Previously a
        failed request silently ended pagination, which was indistinguishable
        from "no more commits".
    """
    commits = []
    page = 1
    while True:
        r = requests.get(
            COMMITS_URL,
            params={
                'path': SERIES_DIR + file_name,
                'page': str(page),
                # largest page size the API allows; fewer round-trips
                'per_page': '100'
            },
            headers=headers,
            # never hang the notebook on a stalled connection
            timeout=30
        )
        r.raise_for_status()

        page_commits = r.json()
        if not page_commits:
            break

        commits += page_commits
        page += 1
    return commits


def _raw_csv_path(commit_date, target):
    """Return the path under which the raw CSV for `commit_date`/`target` is stored."""
    return RAW_DIR + commit_date + '_JHU_raw_' + target + '.csv'


# make sure the output directory exists before anything is written to it
os.makedirs(RAW_DIR, exist_ok=True)

# loop-invariant: reference date for the "last seven days" rule
today = datetime.datetime.today().date()

for target in file_dict:
    # retrieve information about all commits that modified the file we want
    all_commits = _fetch_all_commits(file_dict[target])

    # sha for the last commit made each day
    # NOTE(review): this relies on the API listing commits newest-first, so the
    # first sha seen for a date is that day's last commit -- confirm.
    commit_shas_to_get = {}
    for commit in all_commits:
        # date of the commit (YYYY-MM-DD prefix of the author timestamp)
        commit_date = commit['commit']['author']['date'][0:10]

        # location in which to save file
        result_path = _raw_csv_path(commit_date, target)

        # delete file if it was downloaded on the commit date since it may not
        # be the last commit that day
        if os.path.isfile(result_path):
            creation_date = time.strftime(
                "%Y-%m-%d",
                time.gmtime(os.path.getctime(result_path))
            )
            if creation_date == commit_date:
                os.remove(result_path)

        # record as a sha to download if applicable: not yet recorded for this
        # date, not already on disk, and either committed on a Monday (0) or
        # Sunday (6) or committed within the last seven days
        commit_date_as_date = datetime.datetime.strptime(
            commit_date, '%Y-%m-%d').date()
        is_sunday_or_monday = commit_date_as_date.weekday() in (0, 6)
        is_recent = today - commit_date_as_date < datetime.timedelta(7)
        if (commit_date not in commit_shas_to_get
                and not os.path.isfile(result_path)
                and (is_sunday_or_monday or is_recent)):
            commit_shas_to_get[commit_date] = commit['sha']

    # download and save the csvs
    for commit_date, commit_sha in commit_shas_to_get.items():
        df = pd.read_csv(
            'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/' +
            commit_sha +
            '/' + SERIES_DIR +
            file_dict[target])

        df.to_csv(_raw_csv_path(commit_date, target), index=False)

In [8]:
def process_file(filename, saturdays_only=True):
    """Convert one raw JHU time-series CSV into a long per-state table.

    The file is read, the metadata columns listed below are dropped when
    present, the remaining columns are summed per `Province_State`, and the
    result is melted into one row per (state, date). Location codes are
    attached by a left merge with the module-level `fips` table on the shared
    `location_name` column, so states missing from `fips` keep a NaN
    `location`.

    Parameters
    ----------
    filename : str
        Path of the raw CSV to read.
    saturdays_only : bool, default True
        If True, keep only rows whose date falls on a Saturday and reset the
        index; otherwise return all dates with the index left as sorted.

    Returns
    -------
    pandas.DataFrame
        Columns `date`, `location`, `location_name`, `value`, sorted by
        `date` and then `location`.
    """
    metadata_columns = ['UID', 'iso2', 'iso3', 'code3', 'FIPS', 'Admin2', 'Country_Region',
                        'Lat', 'Long_', 'Combined_Key', 'Population']

    # errors='ignore' because not every file carries every metadata column
    wide = pd.read_csv(filename).drop(columns=metadata_columns, errors='ignore')

    # aggregate to one row per state, then reshape to long format
    per_state = wide.groupby('Province_State').sum().reset_index()
    long_df = per_state.melt(id_vars=['Province_State'])
    long_df.columns = ['location_name', 'date', 'value']
    long_df['date'] = pd.to_datetime(long_df['date'])

    # no explicit key: the only shared column is 'location_name'
    with_codes = long_df.merge(fips[['location', 'location_name']], how='left')
    ordered = with_codes[['date', 'location', 'location_name', 'value']].sort_values(
        ['date', 'location'])

    if not saturdays_only:
        return ordered

    is_saturday = ordered.date.dt.day_name() == 'Saturday'
    return ordered[is_saturday].reset_index(drop=True)

In [9]:
list_of_files = glob.glob('data/JHU/raw/*')

for target in ['deaths', 'cases']:
    # only keep files with the respective target (matched on the file name,
    # not on the directory part of the path)
    files = [f for f in list_of_files if target in os.path.basename(f)]

    # only consider data from Monday
    # The date is the first ten characters of the file name. os.path.basename
    # is used instead of splitting on a backslash, which only worked with
    # Windows-style paths and raised IndexError elsewhere.
    file_df = pd.DataFrame({'filename': files})
    file_df['date'] = pd.to_datetime(
        file_df.filename.transform(lambda x: os.path.basename(x)[:10]))
    file_df = file_df[file_df.date.dt.day_name() == 'Monday'].reset_index(drop=True)

    # make sure both output directories exist before writing into them
    cumulative_dir = 'data/JHU/cumulative_' + target
    incident_dir = 'data/JHU/incident_' + target
    os.makedirs(cumulative_dir, exist_ok=True)
    os.makedirs(incident_dir, exist_ok=True)

    print('Processing ' + target + ':')
    for _, row in tqdm(file_df.iterrows(), total=file_df.shape[0]):
        date_str = str(row['date'].date())

        states = process_file(row['filename'], saturdays_only=True).drop(
            columns=['location_name'])

        # national total is computed before rows without a location code are
        # dropped: Diamond Princess and Grand Princess are counted for US
        us = states.groupby('date')['value'].sum().reset_index()
        us['location'] = 'US'

        # drop Diamond Princess and Grand Princess (rows without a location code)
        states = states.dropna()
        cumulative = (
            pd.concat([states, us])
            .sort_values(['date', 'location'])
            .reset_index(drop=True)
        )

        cumulative.to_csv(cumulative_dir + '/truth_jhu_cumulative_' + target + '_' +
                          date_str + '.csv', index=False)

        # compute incidence: difference between consecutive dates per
        # location; the first date of each location has no predecessor and is
        # dropped
        incident = (
            cumulative
            .assign(value=cumulative.groupby(['location'])['value'].diff())
            .dropna()
            .astype({'value': int})
        )

        incident.to_csv(incident_dir + '/truth_jhu_incident_' + target + '_' +
                        date_str + '.csv', index=False)

Processing deaths:


  0%|          | 0/54 [00:00<?, ?it/s]

Processing cases:


  0%|          | 0/54 [00:00<?, ?it/s]