In [1]:
import pandas as pd
import numpy as np
import json
import math
import requests
import io

In [2]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:10.0) Gecko/20100101 Firefox/10.0'}
url = 'https://covid.ourworldindata.org/data/owid-covid-data.csv'
resp = requests.get(url, headers=headers)
file_object = io.StringIO(resp.content.decode('utf-8'))

In [3]:
df = pd.read_csv(file_object, 
                     index_col = ['date'],
                     parse_dates = True, 
                     usecols=["date", "iso_code", "location", 
                              "population", "continent",
                              "total_cases", "total_deaths"]).rename(
    columns = {'iso_code': 'country_code', 'location': 'country_name'}
)

In [40]:
# prepare metadata from World Bank
metadata = pd.read_csv("https://raw.githubusercontent.com/hongtaoh/covid19-data/master/data_sources/metadata/worldbank/country_metadata.csv",
                      usecols=["Country Code", "Region"]).rename(
    columns = {'Country Code': 'country_code', 'Region': 'world_region'}
)

In [41]:
all_dates = pd.date_range(df.index.min(), df.index.max())

In [42]:
def get_fallBehind_place_date_dateframe(df): # input should be df
    fallBehind_place_list = []
    fallBehind_last_date_available_list = []
    fallBehind_last_date_index_list = []
    for group in df.groupby("country_code"):
        if group[1].tail(1).index != df.index.max():
            fallBehind_place_list.append(group[0])
    for p in fallBehind_place_list:
        fallBehind_last_date_index_list.append(all_dates.get_loc(
            df[df.loc[:, 'country_code']== p].index[-1].strftime("%Y-%m-%d")))
        fallBehind_last_date_available_list.append(
            df[df.loc[:, 'country_code']== p].index[-1].strftime("%Y-%m-%d"))
    d = {'fallBehind_place': fallBehind_place_list, 
         'fallBehind_last_date_available': fallBehind_last_date_available_list,
         'fallBehind_last_date_index': fallBehind_last_date_index_list}
    fallBehind_list = pd.DataFrame(data = d)
    return fallBehind_list

In [43]:
fallBehind_list = get_fallBehind_place_date_dateframe(df)
fallBehind_list

Unnamed: 0,fallBehind_place,fallBehind_last_date_available,fallBehind_last_date_index
0,AFG,2021-02-18,414
1,AGO,2021-02-18,414
2,AIA,2021-02-13,409
3,ALB,2021-02-18,414
4,AND,2021-02-18,414
...,...,...,...
207,WSM,2021-02-18,414
208,YEM,2021-02-18,414
209,ZAF,2021-02-18,414
210,ZMB,2021-02-18,414


In [44]:
def extract_cntry_dfs(df): 
    dfs = []
    for group in df.groupby("country_code"):
        cntry_df = (
            group[1].reindex(all_dates, method="pad")
        )
        cntry_df["total_cases"] = cntry_df["total_cases"]
        cntry_df["total_deaths"] = cntry_df["total_deaths"]
        cntry_df["country_code"] = group[0]
        cntry_df["country_name"] = group[1]["country_name"][-1]
        cntry_df["continent"] = group[1]["continent"][-1]
        cntry_df["population"] = group[1]["population"][-1]
        dfs.append(
            cntry_df
        )
    return dfs

In [45]:
dfs = extract_cntry_dfs(df)

In [46]:
def fill_first_case_death_with_zero(df): # input is dfs
    for i in np.arange(0, len(df)):
        if (df[i].head(1).total_cases.isnull()[0] & df[i].head(1).total_deaths.isnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            0,
            0, 
            df[i].population[-1]
            ]
        if (df[i].head(1).total_cases.isnull()[0] & df[i].head(1).total_deaths.notnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            0,
            df[i].total_deaths[0],
            df[i].population[-1]
            ]
        if (df[i].head(1).total_cases.notnull()[0] & df[i].head(1).total_deaths.isnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            df[i].total_cases[0],
            0,
            df[i].population[-1]
            ]
    return df # output is the dfs with first case & death conditionally filled with zero. 
              # Later, I name this output to be "dfs_first_zero_filled" 

In [47]:
dfs_first_zero_filled = fill_first_case_death_with_zero(dfs)

In [48]:
concat_df = pd.concat(dfs_first_zero_filled).fillna(method="ffill").reset_index().rename(
        columns={"index": "date"})
    #To change the original country codes of "KOS" and World to match metadata from WB:
concat_df.loc[(concat_df.country_code == "OWID_KOS"), ('country_code')] = "XKX"
concat_df.loc[(concat_df.country_code == "OWID_WRL"), ('country_code')] = "WLD"
    # To get the column of "world_region" in concat_df by merging with WB metadata
left_join_df = pd.merge(concat_df, metadata, on = "country_code", how = "left")
# df[df['name column'].isnull()]
left_join_df.loc[left_join_df.world_region.isnull()].country_code.unique()

array(['AIA', 'GGY', 'JEY', 'OWID_AFR', 'OWID_ASI', 'OWID_EUN',
       'OWID_EUR', 'OWID_INT', 'OWID_NAM', 'OWID_NCY', 'OWID_OCE',
       'OWID_SAM', 'WLD', 'SHN', 'TWN', 'VAT'], dtype=object)

In [49]:
def merge_with_meta(df): #input should be dfs_first_zero_filled
    concat_df = pd.concat(df).fillna(method="ffill").reset_index().rename(
        columns={"index": "date"})
    #To change the original country codes of "KOS" and World to match metadata from WB:
    concat_df.loc[(concat_df.country_code == "OWID_KOS"), ('country_code')] = "XKX"
    concat_df.loc[(concat_df.country_code == "OWID_WRL"), ('country_code')] = "WLD"
    # To get the column of "world_region" in concat_df by merging with WB metadata
    left_join_df = pd.merge(concat_df, metadata, on = "country_code", how = "left")
    left_join_df.loc[:,'date'] = left_join_df.loc[:,'date'].dt.strftime('%Y-%m-%d')
    left_join_df.loc[(left_join_df.country_code == "AIA"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "BES"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "ESH"), ('world_region')] = "Middle East & North Africa"
    left_join_df.loc[(left_join_df.country_code == "FLK"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "GGY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "JEY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "MSR"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "TWN"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "VAT"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "WLF"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "WLD"), ('world_region')] = "World"
    return left_join_df

In [50]:
left_join_df = merge_with_meta(dfs_first_zero_filled)

In [51]:
# In the following step, I converted 
def fallBehind_filled_to_null (df): # input should be left_join_df
    left_join_copy_group1_with_nan = []
    for group in df.groupby('country_code'):
        for i in np.arange(0, len(fallBehind_list)):
            if group[1].tail(1).country_code.iloc[0] == fallBehind_list.iloc[i, 0]:
                group[1].tail(len(all_dates) - 1 - fallBehind_list.iloc[i, 2]).total_cases = np.nan
                group[1].tail(len(all_dates) - 1 - fallBehind_list.iloc[i, 2]).total_deaths = np.nan
        left_join_copy_group1_with_nan.append(group[1])
    left_join_copy_group1_with_nan_concated = pd.concat(left_join_copy_group1_with_nan)
    left_join_copy_group1_concated_with_null = left_join_copy_group1_with_nan_concated
    left_join_copy_group1_concated_with_null.replace(np.nan, 'null', inplace=True)
    return left_join_copy_group1_concated_with_null 
# Later, I'll name the output to be fallBehind_with_nan_concated

In [52]:
# fallBehind_with_nan_concated = fallBehind_filled_to_nan(left_join_df)

In [53]:
fallBehind_with_null = fallBehind_filled_to_null(left_join_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [54]:
# fallBehind_with_null.replace(np.nan, 'null', inplace=True)

In [55]:
# fallBehind_with_null[fallBehind_with_null.loc[:, 'country_code'] == 'BEL'].tail(10)

In [56]:
# fallBehind_nan_changed[fallBehind_nan_changed.isnull().any(axis=1)]

In [57]:
fallBehind_with_null[fallBehind_with_null.loc[:, 'country_code'] == 'SWE'].tail(10)

Unnamed: 0,date,country_code,continent,country_name,total_cases,total_deaths,population,world_region
76534,2021-02-10,SWE,Europe,Sweden,600244.0,12326.0,10099270.0,Europe & Central Asia
76535,2021-02-11,SWE,Europe,Sweden,604577.0,12370.0,10099270.0,Europe & Central Asia
76536,2021-02-12,SWE,Europe,Sweden,608411.0,12428.0,10099270.0,Europe & Central Asia
76537,2021-02-13,SWE,Europe,Sweden,608411.0,12428.0,10099270.0,Europe & Central Asia
76538,2021-02-14,SWE,Europe,Sweden,608411.0,12428.0,10099270.0,Europe & Central Asia
76539,2021-02-15,SWE,Europe,Sweden,608411.0,12428.0,10099270.0,Europe & Central Asia
76540,2021-02-16,SWE,Europe,Sweden,617869.0,12487.0,10099270.0,Europe & Central Asia
76541,2021-02-17,SWE,Europe,Sweden,622102.0,12569.0,10099270.0,Europe & Central Asia
76542,2021-02-18,SWE,Europe,Sweden,627022.0,12598.0,10099270.0,Europe & Central Asia
76543,2021-02-19,SWE,Europe,Sweden,,,10099270.0,Europe & Central Asia


In [58]:
def prepare_data_structure(df, gby="country_code"): # input should be fallBehind_with_null
    data = []
    for g in df.groupby([gby]):
        code = g[0]
        cntry_df = g[1]
        try:
            country_data = {
                "country_code": code,
                "country_name": cntry_df.loc[:,"country_name"].iloc[0],
                "population": cntry_df.loc[:,"population"].iloc[0],
                "region": cntry_df.loc[:,"world_region"].iloc[0],
                "confirmed": list(zip(cntry_df.date, cntry_df.total_cases)),
                "deaths": list(zip(cntry_df.date, cntry_df.total_deaths)),
            }
            data.append(country_data)
        except KeyError:
            print("metadata doesn't exist for: ", code)
            continue
    return data

In [59]:
data = prepare_data_structure(fallBehind_with_null)

In [60]:
open("../output/cntry_stat_owid.json", "w").write(json.dumps(data, separators=(",", ":")))

3740598