In [1]:
import pandas as pd
import numpy as np
import json
import math

In [151]:
df = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv", 
                     index_col = ['date'],
                     parse_dates = True, 
                     usecols=["date", "iso_code", "location", 
                              "population", "continent",
                              "total_cases", "total_deaths"]).rename(
    columns = {'iso_code': 'country_code', 'location': 'country_name'}
)

In [188]:
# prepare metadata from World Bank
metadata = pd.read_csv("https://raw.githubusercontent.com/hongtaoh/covid19-data/master/data_sources/metadata/worldbank/country_metadata.csv",
                      usecols=["Country Code", "Region"]).rename(
    columns = {'Country Code': 'country_code', 'Region': 'world_region'}
)

In [189]:
all_dates = pd.date_range(df.index.min(), df.index.max())

In [190]:
def get_fallBehind_place_date_dateframe(df):
    fallBehind_place_list = []
    fallBehind_last_date_list = []
    fallBehind_last_date_index_list = []
    for group in df.groupby("country_code"):
        if group[1].tail(1).index != df.index.max():
            fallBehind_place_list.append(group[0])
    for p in fallBehind_place_list:
        fallBehind_last_date_index_list.append(all_dates.get_loc(
            df[df.loc[:, 'country_code']== p].index[-1].strftime("%Y-%m-%d")))
        fallBehind_last_date_list.append(
            df[df.loc[:, 'country_code']== p].index[-1].strftime("%Y-%m-%d"))
    d = {'fallBehind_place': fallBehind_place_list, 
         'fallBehind_last_date': fallBehind_last_date_list,
         'fallBehind_last_date_index': fallBehind_last_date_index_list}
    fallBehind_list = pd.DataFrame(data = d)
    return fallBehind_list

In [191]:
fallBehind_list = get_fallBehind_place_date_dateframe(df)
fallBehind_list

Unnamed: 0,fallBehind_place,fallBehind_last_date,fallBehind_last_date_index
0,BEL,2020-11-08,313
1,ESP,2020-11-09,314
2,HKG,2020-11-03,308
3,SWE,2020-11-06,311


In [192]:
def extract_cntry_dfs(df): 
    dfs = []
    for group in df.groupby("country_code"):
        cntry_df = (
            group[1].reindex(all_dates, method="pad")
        )
        cntry_df["total_cases"] = cntry_df["total_cases"]
        cntry_df["total_deaths"] = cntry_df["total_deaths"]
        cntry_df["country_code"] = group[0]
        cntry_df["country_name"] = group[1]["country_name"][-1]
        cntry_df["continent"] = group[1]["continent"][-1]
        cntry_df["population"] = group[1]["population"][-1]
        dfs.append(
            cntry_df
        )
    return dfs

In [193]:
dfs = extract_cntry_dfs(df)

In [194]:
def fill_first_case_death_with_zero(df): # input is dfs
    for i in np.arange(0, len(df)):
        if (df[i].head(1).total_cases.isnull()[0] & df[i].head(1).total_deaths.isnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            0,
            0, 
            df[i].population[-1]
            ]
        if (df[i].head(1).total_cases.isnull()[0] & df[i].head(1).total_deaths.notnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            0,
            df[i].total_deaths[0],
            df[i].population[-1]
            ]
        if (df[i].head(1).total_cases.notnull()[0] & df[i].head(1).total_deaths.isnull()[0]):
            df[i][0:1] = [df[i].country_code[-1],
            df[i].continent[-1],
            df[i].country_name[-1],
            df[i].total_cases[0],
            0,
            df[i].population[-1]
            ]
    return df # output is the dfs with first case & death conditionally filled with zero. 
              # Later, I name this output to be "dfs_first_zero_filled" 

In [195]:
dfs_first_zero_filled = fill_first_case_death_with_zero(dfs)

In [196]:
def merge_with_meta(df): #input should be dfs_first_zero_filled
    concat_df = pd.concat(df).fillna(method="ffill").reset_index().rename(
        columns={"index": "date"})
    #To change the original country codes of "KOS" and World to match metadata from WB:
    concat_df.loc[(concat_df.country_code == "OWID_KOS"), ('country_code')] = "XKX"
    concat_df.loc[(concat_df.country_code == "OWID_WRL"), ('country_code')] = "WLD"
    # To get the column of "world_region" in concat_df by merging with WB metadata
    left_join_df = pd.merge(concat_df, metadata, on = "country_code", how = "left")
    left_join_df.loc[:,'date'] = left_join_df.loc[:,'date'].dt.strftime('%Y-%m-%d')
    left_join_df.loc[(left_join_df.country_code == "AIA"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "BES"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "ESH"), ('world_region')] = "Middle East & North Africa"
    left_join_df.loc[(left_join_df.country_code == "FLK"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "GGY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "JEY"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "MSR"), ('world_region')] = "Latin America & Caribbean"
    left_join_df.loc[(left_join_df.country_code == "TWN"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "VAT"), ('world_region')] = "Europe & Central Asia"
    left_join_df.loc[(left_join_df.country_code == "WLF"), ('world_region')] = "East Asia & Pacific"
    left_join_df.loc[(left_join_df.country_code == "WLD"), ('world_region')] = "World"
    return left_join_df

In [202]:
left_join_df = merge_with_meta(dfs_first_zero_filled)

In [223]:
def fallBehind_zero_to_nan (df): # input should be left_join_df
    left_join_copy_group1 = []
    for group in df.groupby('country_code'):
        for i in np.arange(0, len(fallBehind_list)):
            if group[1].tail(1).country_code.iloc[0] == fallBehind_list.iloc[i, 0]:
                group[1].tail(len(all_dates) - 1 - fallBehind_list.iloc[i, 2]).total_cases = np.nan
                group[1].tail(len(all_dates) - 1 - fallBehind_list.iloc[i, 2]).total_deaths = np.nan
        left_join_copy_group1.append(group[1])
    return left_join_copy_group # I will name the output later to be f1allBehind_nan_changed

In [222]:
fallBehind_nan_changed = pd.concat(fallBehind_zero_to_nan(left_join_df))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


AttributeError: 'list' object has no attribute 'replace'

In [218]:
fallBehind_nan_changed

In [219]:
fallBehind_nan_changed[fallBehind_nan_changed.isnull().any(axis=1)]

Unnamed: 0,date,country_code,continent,country_name,total_cases,total_deaths,population,world_region


In [220]:
fallBehind_nan_changed[fallBehind_nan_changed.loc[:, 'country_code'] == 'BEL'].tail(10)

Unnamed: 0,date,country_code,continent,country_name,total_cases,total_deaths,population,world_region
4730,2020-11-01,BEL,Europe,Belgium,448471.0,11836.0,11589616.0,Europe & Central Asia
4731,2020-11-02,BEL,Europe,Belgium,451133.0,11993.0,11589616.0,Europe & Central Asia
4732,2020-11-03,BEL,Europe,Belgium,464912.0,12194.0,11589616.0,Europe & Central Asia
4733,2020-11-04,BEL,Europe,Belgium,476916.0,12362.0,11589616.0,Europe & Central Asia
4734,2020-11-05,BEL,Europe,Belgium,486052.0,12550.0,11589616.0,Europe & Central Asia
4735,2020-11-06,BEL,Europe,Belgium,493601.0,12742.0,11589616.0,Europe & Central Asia
4736,2020-11-07,BEL,Europe,Belgium,500002.0,12917.0,11589616.0,Europe & Central Asia
4737,2020-11-08,BEL,Europe,Belgium,502452.0,13066.0,11589616.0,Europe & Central Asia
4738,2020-11-09,BEL,Europe,Belgium,,,11589616.0,Europe & Central Asia
4739,2020-11-10,BEL,Europe,Belgium,,,11589616.0,Europe & Central Asia


In [127]:
def prepare_data_structure(df, gby="country_code"): # input should be fallBehind_nan_changed
    data = []
    for g in df.groupby([gby]):
        code = g[0]
        cntry_df = g[1]
        try:
            country_data = {
                "country_code": code,
                "country_name": cntry_df.loc[:,"country_name"].iloc[0],
                "population": cntry_df.loc[:,"population"].iloc[0],
                "region": cntry_df.loc[:,"world_region"].iloc[0],
                "confirmed": list(zip(cntry_df.date, cntry_df.total_cases)),
                "deaths": list(zip(cntry_df.date, cntry_df.total_deaths)),
            }
            data.append(country_data)
        except KeyError:
            print("metadata doesn't exist for: ", code)
            continue
    return data

In [133]:
data = prepare_data_structure(fallBehind_nan_changed)

In [134]:
open("/Users/Tal/Desktop/covid19-data/output/cntry_stat_owid.json", "w").write(
    json.dumps(data, separators=(",", ":")))

2807997