### prepare daily dataframe  

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

import os
from pathlib import Path

In [None]:
# Raise both display limits so large frames are printed without truncation.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 1000)

# Project top directory: one level above the current working directory.
base_dir = Path.cwd().parent
# Input folders, all located below the project top directory.
csse_data = base_dir / 'data' / 'external' / 'csse_data' / 'csse_covid_19_data'
daily_data = csse_data / 'csse_covid_19_daily_reports'
ts_data = csse_data / 'csse_covid_19_time_series'

In [None]:
# Columns read from the daily report; everything else in the CSV is skipped.
COLUMNS_DATA = ['Country_Region', 'Confirmed', 'Deaths', 'Recovered', 'Active']

# The last entry returned by get_daily_csv() is used as the most recent report.
# NOTE(review): get_daily_csv is not defined in this part of the notebook --
# confirm it is available before running this cell.
filepath = daily_data / get_daily_csv()[-1]

# Read the report and add up the figures of all rows sharing a country.
df = (
    pd.read_csv(filepath, usecols=COLUMNS_DATA)
    .groupby('Country_Region')
    .sum()
)

# Georeference stored on disk.
gdf = gpd.read_file(base_dir / 'data/processed/georeference.json')

# Left join: every georeference row is kept, with or without report data.
merged = pd.merge(left=gdf, right=df, how='left', on='Country_Region')

merged.head()

In [None]:
# Folder that receives the processed per-day CSV files.
processed_daily_dir = base_dir.joinpath('data', 'processed', 'daily_report')
processed_daily_dir

def get_outputpath(input_file=None, suffix='.csv', output_dir=None):
    """Build the destination path for a processed daily report.

    The result keeps the stem of ``input_file`` (its file name without the
    last extension), appends ``suffix`` and places the file in
    ``output_dir``.

    Parameters
    ----------
    input_file : str or os.PathLike
        Path of the raw input file. Required; the ``None`` default is kept
        only so the signature stays backward compatible.
    suffix : str, default '.csv'
        Text appended to the input stem, including the leading dot.
    output_dir : str or os.PathLike, optional
        Target directory. When omitted, the module-level
        ``processed_daily_dir`` is used (looked up at call time).

    Returns
    -------
    pathlib.Path
        ``output_dir / (stem of input_file + suffix)``.

    Raises
    ------
    TypeError
        If ``input_file`` is ``None``.
    """
    if input_file is None:
        # Path(None) would raise a TypeError anyway; say what is wrong instead.
        raise TypeError("get_outputpath() requires an input_file, got None")

    target_dir = processed_daily_dir if output_dir is None else Path(output_dir)
    return target_dir / f"{Path(input_file).stem}{suffix}"

def check_folder(path):
    """Make sure the directory that will contain ``path`` exists.

    Parameters
    ----------
    path : str or os.PathLike
        Path of a file that is about to be written. Only its parent
        directory is created; the file itself is not touched.

    Returns
    -------
    None
    """
    # Path(...) lets plain strings through, and .parent (unlike parents[0])
    # does not raise IndexError for a path without a parent component.
    folder = Path(path).parent
    # parents=True creates missing intermediate directories; exist_ok=True
    # makes repeated calls for the same folder harmless.
    folder.mkdir(parents=True, exist_ok=True)
In [None]:
# Harmonised column name -> every raw header spelling that maps onto it.
_COLUMN_ALIASES = {
    'province': ['Province/State', 'Province_State'],
    'country': ['Country/Region', 'Country_Region'],
    'last_update': ['Last Update', 'Last_Update'],
    'confirmed': ['Confirmed'],
    'deaths': ['Deaths'],
    'recovered': ['Recovered'],
    'active': ['Active'],
    'lat': ['Latitude', 'Lat'],
    'long': ['Longitude', 'Long_'],
    'fips': ['FIPS'],
    'admin2': ['Admin2'],
    'combined_key': ['Combined_Key'],
}

# Inverted view (raw header -> harmonised name), suitable for
# DataFrame.rename(columns=...).
column_schema = {
    raw_name: clean_name
    for clean_name, raw_names in _COLUMN_ALIASES.items()
    for raw_name in raw_names
}

In [None]:
# Value columns, in the order they are written out.
column_list = ['confirmed', 'deaths', 'recovered', 'active', 'last_update']

# Harmonised location columns.
index_list = ['country', 'province']

# Aggregation per value column: the timestamp keeps its first entry,
# every other column is summed.
aggregation_dict = {
    name: ('first' if name == 'last_update' else 'sum')
    for name in column_list
}

In [56]:
# Canonical country name -> alternative spellings that are folded into it.
_COUNTRY_ALIASES = {
    'Azerbaijan': [' Azerbaijan'],  # the leading space is part of the alias
    'Hong Kong': ['Hong Kong SAR'],
    'Iran': ['Iran (Islamic Republic of)'],
    'Bahamas': ['Bahamas, The'],
    'USA': ['US'],
    'United Kingdom': ['UK', 'North Ireland', 'Channel Islands'],
    'Vietnam': ['Viet Nam'],
    'Taiwan': ['Taipei and environs', 'Taiwan*'],
    'Macau': ['Macao SAR', 'Macao'],
    'Vatican City': ['Holy See'],
    'Ivory Coast': ["Cote d'Ivoire"],
    'Ireland': ['Republic of Ireland'],
    'South Korea': ['Republic of Korea', 'Korea, South'],
    'Russia': ['Russian Federation'],
    'Palestine': ['occupied Palestinian territory'],
    'Others': ['Cruise Ship', 'Diamond Princess', 'MS Zaandam'],
    'France': ['Reunion'],
    'Czech Republic': ['Czechia'],
    'China': ['Mainland China'],
    'The Gambia': ['Gambia, The'],
    'Cabo Verde': ['Cape Verde'],
    'East Timor': ['Timor-Leste'],
}

# Inverted view (alias -> canonical name), suitable for Series.replace().
country_rename_dict = {
    alias: canonical
    for canonical, aliases in _COUNTRY_ALIASES.items()
    for alias in aliases
}

In [None]:
# Convert every raw daily report into one harmonised per-country CSV.
# sorted() consumes the glob generator directly, in path order.
path_list = sorted(daily_data.glob('*.csv'))

for path in path_list:

    # read data
    df = pd.read_csv(path)
    # rename columns according to the schema
    df = df.rename(columns=column_schema)
    # replace country names with their canonical spelling
    df['country'] = df['country'].replace(country_rename_dict)

    # Reports without an 'active' column get a placeholder so that the
    # aggregation and the column selection below work for every file.
    has_active = 'active' in df.columns
    if not has_active:
        df['active'] = np.nan

    # aggregate to one row per country (provinces are summed up)
    df = df.groupby('country').agg(aggregation_dict)
    # sort by the country index
    df = df.sort_index()

    if not has_active:
        # Summing an all-NaN column yields 0.0, which would be written out as
        # "zero active cases"; restore NaN so the value stays missing.
        df['active'] = np.nan

    # change order of columns
    df = df[column_list]

    output_file = get_outputpath(path)
    check_folder(output_file)

    df.to_csv(output_file)

In [None]:
# https://python-forum.io/Thread-How-does-pathlib-Path-rename-work?pid=54129#pid54129