In [146]:
import os
import re
import pickle
import pandas as pd
from datetime import datetime
from glob import glob

# 1. Read files

In [147]:
# Collect the paths of all CSSE daily-report CSV files.
daily_report_pattern = '../data/CSSE-data/csse_covid_19_data/csse_covid_19_daily_reports/*.csv'
all_csv_paths = glob(daily_report_pattern)


In [148]:
def conv_to_dt(str_path):
    """Return the report date encoded as MM-DD-YYYY in a file path.

    The file name is searched first so that a date-like directory name
    earlier in the path cannot shadow the date of the file itself; the
    whole path is only used as a fallback.

    Parameters
    ----------
    str_path : str
        Path whose file name (or, failing that, any other component)
        contains a date written as MM-DD-YYYY.

    Returns
    -------
    datetime.datetime
        The parsed date at midnight.

    Raises
    ------
    ValueError
        If no MM-DD-YYYY pattern is present, or the digits found do not
        form a valid calendar date.
    """
    date_pattern = r'[0-9]{2}-[0-9]{2}-[0-9]{4}'
    result = (re.search(date_pattern, os.path.basename(str_path))
              or re.search(date_pattern, str_path))
    if result is None:
        # Fail with a message that names the offending path instead of an
        # AttributeError on `None.group()`.
        raise ValueError(f'no MM-DD-YYYY date found in path: {str_path!r}')
    return datetime.strptime(result.group(), '%m-%d-%Y')


In [149]:
# Build a table with one row per daily report (date parsed from the path,
# sorted by date and indexed by it), then load each CSV into the 'df_0' column.
dated_paths = [(conv_to_dt(csv_path), csv_path) for csv_path in all_csv_paths]
df_files = pd.DataFrame(dated_paths, columns=['date', 'path'])
df_files = df_files.sort_values('date')
df_files.index = df_files['date']
df_files['df_0'] = df_files['path'].apply(pd.read_csv)

In [150]:
# Persist the date-ordered table (including the loaded frames) to disk.
with open('date_ordered_data.pkl', 'wb') as pkl_out:
    pickle.dump(df_files, pkl_out)

# 2. Summarize by country

In [151]:
# Print each report date on which the column layout differs from the
# previous report's layout.
last_col = []

for report_date, report in zip(df_files['date'], df_files['df_0']):
    current_cols = list(report.columns)
    if current_cols == last_col:
        continue
    print(report_date, current_cols)
    last_col = current_cols


2020-01-22 00:00:00 ['Province/State', 'Country/Region', 'Last Update', 'Confirmed', 'Deaths', 'Recovered']
2020-03-01 00:00:00 ['Province/State', 'Country/Region', 'Last Update', 'Confirmed', 'Deaths', 'Recovered', 'Latitude', 'Longitude']
2020-03-22 00:00:00 ['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key']
2020-05-29 00:00:00 ['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key', 'Incidence_Rate', 'Case-Fatality_Ratio']
2020-11-09 00:00:00 ['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key', 'Incident_Rate', 'Case_Fatality_Ratio']


In [152]:
def set_columns(df):
    """Return a copy of a daily report reduced to a common column layout.

    Column labels are normalised by replacing '/' and ' ' with '_', an
    'Active' column filled with None is added when the report has none,
    and only the seven shared columns are returned.

    The input frame is left untouched: renaming and adding the column are
    done on a shallow copy, so the frames this function is applied to keep
    their original column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        One daily report. After normalisation it must contain
        Province_State, Country_Region, Last_Update, Confirmed, Deaths
        and Recovered.

    Returns
    -------
    pandas.DataFrame
        New frame with columns Province_State, Country_Region,
        Last_Update, Confirmed, Deaths, Recovered, Active.

    Raises
    ------
    KeyError
        If any required column is missing after normalisation.
    """
    # Shallow copy: new column labels / new columns do not leak into `df`.
    normalised = df.copy(deep=False)
    normalised.columns = (
        normalised.columns
        .str.replace('/', '_', regex=False)
        .str.replace(' ', '_', regex=False)
    )
    if 'Active' not in normalised.columns:
        normalised['Active'] = None
    return normalised[['Province_State', 'Country_Region', 'Last_Update',
                       'Confirmed', 'Deaths', 'Recovered', 'Active']]

In [153]:
# Adjust table columns, then total the numeric columns per country.
df_files['df_1'] = df_files['df_0'].apply(set_columns)
# numeric_only=True makes the result independent of the pandas version:
# without it, pandas 2.x concatenates the string columns (Province_State,
# Last_Update) per group instead of leaving them out of the totals.
df_files['df_2'] = df_files['df_1'].apply(
    lambda x: x.groupby('Country_Region').sum(numeric_only=True)
)

In [154]:
# Stack the per-country totals of one metric into a date x country table:
# each report contributes a single row, and the rows are labelled by date.
target_col = 'Confirmed'
per_date_rows = []
for country_totals in df_files['df_2']:
    per_date_rows.append(country_totals[target_col].to_frame().T)
df = pd.concat(per_date_rows, axis=0)
df.index = df_files.date

In [155]:
# Export the date x country table; the date index is written by default.
df.to_csv('view.csv')

# 2.1 Compare with CSSE data

In [169]:
# Load the CSSE global confirmed-cases time series used as the reference.
time_series_csv = '../data/CSSE-data/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
df_raw = pd.read_csv(time_series_csv)

In [170]:
# Total the time series per country, keeping only the per-date columns.
# The non-date columns are dropped by name rather than with .iloc[:, 2:]:
# the positional slice depends on which columns sum() keeps (this differs
# between pandas versions) and silently removes two more date columns if
# the cell is run a second time. errors='ignore' plus grouping on the
# 'Country/Region' label (column or index level) keeps the cell idempotent.
# NOTE(review): assumes the non-date columns are named 'Province/State',
# 'Lat' and 'Long' - confirm against the CSV header.
df_raw = (
    df_raw
    .drop(columns=['Province/State', 'Lat', 'Long'], errors='ignore')
    .groupby('Country/Region')
    .sum()
)

In [171]:
# Flip to country x date and order the rows by country name.
df_t = df.transpose().sort_index()

In [173]:
# Relabel the time-series columns with the column labels of df_t.
# NOTE(review): this is a positional relabel - it assumes both tables hold
# the same dates in the same order; confirm before comparing the outputs.
df_raw.columns = df_t.columns

In [174]:
# Export both tables so they can be compared side by side.
for table, csv_name in ((df_t, 'view1.csv'), (df_raw, 'view2.csv')):
    table.to_csv(csv_name)