In [80]:
import pandas as pd
import numpy as np
import json
import math

In [106]:
df = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv", 
                     index_col = ["date"],
                     parse_dates = True, 
                     usecols=["date", "iso_code", "location", 
                              "population", "continent",
                              "total_cases", "total_deaths"]).rename(
    columns = {'iso_code': 'country_code', 'location': 'country_name'})

In [82]:
metadata = pd.read_csv("https://raw.githubusercontent.com/hongtaoh/covid19-data/master/data_sources/metadata/worldbank/country_metadata.csv",
                      usecols=["Country Code", "Region"]).rename(
    columns = {'Country Code': 'country_code', 'Region': 'world_region'}
)

In [83]:
all_dates = pd.date_range(df.index.min(), df.index.max())

In [107]:
def extract_cntry_dfs(df): # input is df
    dfs = []
    for group in df.groupby("country_code"):
        cntry_df = (
            group[1].reindex(all_dates, method="pad")
        )
        cntry_df.loc[:,"total_cases"] = cntry_df.loc[:,"total_cases"]
        cntry_df.loc[:,"total_deaths"] = cntry_df.loc[:,"total_deaths"]
        cntry_df.loc[:,"country_code"] = group[0]
        cntry_df.loc[:,"country_name"] = group[1].loc[:,"country_name"][-1]
        cntry_df.loc[:,"continent"] = group[1].loc[:,"continent"][-1]
        cntry_df.loc[:,"population"] = group[1].loc[:,"population"][-1]
        dfs.append(
            cntry_df
        )
    return dfs

In [108]:
def fill_first_case_death_with_zero(df): # input is dfs
    for i in np.arange(0, len(dfs)):
        if math.isnan(dfs[i].loc[:, "total_cases"].iloc[0]):
            dfs[i].loc[:,"total_cases"].iloc[0] = 0
        if math.isnan(dfs[i].loc[:, "total_deaths"].iloc[0]):
            dfs[i].loc[:, "total_deaths"].iloc[0] = 0
    return dfs

In [136]:
dfs = extract_cntry_dfs(df)

In [137]:
dfs_first_zero_filled = fill_first_case_death_with_zero(dfs)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [138]:
def merge_with_meta(df): #input should be dfs_first_zero_filled
    concat_df = pd.concat(dfs).fillna(method="ffill").reset_index().rename(
        columns={"index": "date"})
    #To change the original country codes of "KOS" and World to match metadata from WB:
    concat_df.loc[(concat_df.country_code == "OWID_KOS"), ('country_code')] = "XKX"
    concat_df.loc[(concat_df.country_code == "OWID_WRL"), ('country_code')] = "WLD"
    # To get the column of "world_region" in concat_df by merging with WB metadata
    left_join_df = pd.merge(concat_df, metadata, on = "country_code", how = "left")
    left_join_df.loc[:,'date'] = left_join_df.loc[:,'date'].dt.strftime('%Y-%m-%d')
    left_join_df.loc[(left_join_df.country_code == "AIA"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "BES"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "ESH"), ('world_region')] = "Middle East & North Africa"
    left_join_df.loc[(left_join_df.country_code == "FLK"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "GGY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "JEY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "MSR"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "TWN"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "VAT"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "WLF"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "WLD"), ('world_region')] = "World"
    return left_join_df

In [139]:
left_join_df = merge_with_meta(dfs_first_zero_filled)

In [140]:
left_join_df

Unnamed: 0,date,country_code,continent,country_name,total_cases,total_deaths,population,world_region
0,2019-12-31,ABW,North America,Aruba,0.0,0.0,106766.0,Latin America & Caribbean
1,2020-01-01,ABW,North America,Aruba,0.0,0.0,106766.0,Latin America & Caribbean
2,2020-01-02,ABW,North America,Aruba,0.0,0.0,106766.0,Latin America & Caribbean
3,2020-01-03,ABW,North America,Aruba,0.0,0.0,106766.0,Latin America & Caribbean
4,2020-01-04,ABW,North America,Aruba,0.0,0.0,106766.0,Latin America & Caribbean
...,...,...,...,...,...,...,...,...
66335,2020-10-31,ZWE,Africa,Zimbabwe,8362.0,242.0,14862927.0,Sub-Saharan Africa
66336,2020-11-01,ZWE,Africa,Zimbabwe,8362.0,242.0,14862927.0,Sub-Saharan Africa
66337,2020-11-02,ZWE,Africa,Zimbabwe,8374.0,243.0,14862927.0,Sub-Saharan Africa
66338,2020-11-03,ZWE,Africa,Zimbabwe,8389.0,245.0,14862927.0,Sub-Saharan Africa


In [141]:
def prepare_data_structure(df, gby="country_code"): # input should be left_join_df
    data = []
    for g in df.groupby([gby]):
        code = g[0]
        cntry_df = g[1]
        try:
            country_data = {
                "country_code": code,
                "country_name": cntry_df.loc[:,"country_name"].iloc[0],
                "population": cntry_df.loc[:,"population"].iloc[0],
                "region": cntry_df.loc[:,"world_region"].iloc[0],
                "confirmed": list(zip(cntry_df.date, cntry_df.total_cases)),
                "deaths": list(zip(cntry_df.date, cntry_df.total_deaths)),
            }
            data.append(country_data)
        except KeyError:
            print("metadata doesn't exist for: ", code)
            continue
    return data

In [142]:
data = prepare_data_structure(left_join_df)

In [143]:
data

[{'country_code': 'ABW',
  'country_name': 'Aruba',
  'population': 106766.0,
  'region': 'Latin America & Caribbean',
  'confirmed': [('2019-12-31', 0.0),
   ('2020-01-01', 0.0),
   ('2020-01-02', 0.0),
   ('2020-01-03', 0.0),
   ('2020-01-04', 0.0),
   ('2020-01-05', 0.0),
   ('2020-01-06', 0.0),
   ('2020-01-07', 0.0),
   ('2020-01-08', 0.0),
   ('2020-01-09', 0.0),
   ('2020-01-10', 0.0),
   ('2020-01-11', 0.0),
   ('2020-01-12', 0.0),
   ('2020-01-13', 0.0),
   ('2020-01-14', 0.0),
   ('2020-01-15', 0.0),
   ('2020-01-16', 0.0),
   ('2020-01-17', 0.0),
   ('2020-01-18', 0.0),
   ('2020-01-19', 0.0),
   ('2020-01-20', 0.0),
   ('2020-01-21', 0.0),
   ('2020-01-22', 0.0),
   ('2020-01-23', 0.0),
   ('2020-01-24', 0.0),
   ('2020-01-25', 0.0),
   ('2020-01-26', 0.0),
   ('2020-01-27', 0.0),
   ('2020-01-28', 0.0),
   ('2020-01-29', 0.0),
   ('2020-01-30', 0.0),
   ('2020-01-31', 0.0),
   ('2020-02-01', 0.0),
   ('2020-02-02', 0.0),
   ('2020-02-03', 0.0),
   ('2020-02-04', 0.0),
   (

In [144]:
open("/Users/Tal/Desktop/covid19-data/output/cntry_stat_owid.json", "w").write(
    json.dumps(data, separators=(",", ":")))

2752390