# Data generation notebook

Processes the raw COVID-19 time series and generates cleaned data files (written to `generated/`) for easier use.

In [1]:
import pandas as pd
import pycountry_convert as pc
import numpy as np

## Load data

In [2]:
# Folder of the CSSEGISandData/COVID-19 repository holding the global time series
CSSE_TIME_SERIES_URL = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series'

# One DataFrame per metric, keyed by metric name
data_frames = {
    metric: pd.read_csv(f'{CSSE_TIME_SERIES_URL}/time_series_covid19_{metric}_global.csv')
    for metric in ('confirmed', 'deaths', 'recovered')
}

## Remove cruises

In [3]:
# Cruise ships appear in the 'Country/Region' column; drop their rows
cruise_ships = ['Diamond Princess', 'MS Zaandam']

for key, frame in data_frames.items():
    is_cruise = frame['Country/Region'].isin(cruise_ships)
    data_frames[key] = frame[~is_cruise]

## Rename countries

In [4]:
# Name as spelled in the source data -> name used in the generated files
country_renames = {
    "Cote d'Ivoire": 'Ivory Coast',
    "Cabo Verde": 'Cape Verde',
    "Congo (Brazzaville)": 'Congo',
    "Congo (Kinshasa)": 'Democratic Republic of the Congo',
    "Czechia": 'Czech Republic',
    "Holy See": 'Vatican City',
    "Korea, South": 'South Korea',
    "Taiwan*": 'Taiwan',
    "US": 'United States',
    "West Bank and Gaza": 'State of Palestine',
    "Burma": 'Myanmar',
}

for key, frame in data_frames.items():
    # Exact-match replacement applied to every cell of the frame
    data_frames[key] = frame.replace(country_renames)

## Separate 'countries' such as Greenland

In [5]:
# Provinces/states that should be reported as entries of their own
separate_regions = ['Greenland', 'French Guiana', 'Falkland Islands (Malvinas)', 'New Caledonia']

for key, frame in data_frames.items():
    # Promote the province name to the country column for the listed regions
    is_separate = frame['Province/State'].isin(separate_regions)
    frame.loc[is_separate, 'Country/Region'] = frame.loc[is_separate, 'Province/State']
    data_frames[key] = frame

## Group by country

In [6]:
for key, frame in data_frames.items():
    per_province = frame.drop(columns='Province/State')

    # Coordinates are averaged over a country's rows; every column from
    # position 3 onward is summed.
    how = {'Lat': 'mean', 'Long': 'mean'}
    how.update({column: 'sum' for column in per_province.columns[3:]})

    data_frames[key] = per_province.groupby('Country/Region').agg(how).reset_index()

## Reformat dates

In [7]:
for frame in data_frames.values():
    # The first three columns are kept as they are; the remaining headers are
    # parsed as dates and rewritten as YYYY-MM-DD strings.
    leading = list(frame.columns[:3])
    iso_dates = pd.to_datetime(frame.columns[3:]).strftime("%Y-%m-%d")
    frame.columns = leading + list(iso_dates)

## Add world population column

In [8]:
population_df = pd.read_csv('generated/population.csv')

for key, frame in data_frames.items():
    # Left join keeps every country, even those without a population entry
    with_population = (
        frame
        .merge(population_df, how='left', left_on='Country/Region', right_on='name')
        .drop(columns='name')
    )

    # Move 'population' (appended last by the merge) right after the country name
    column_order = ['Country/Region', 'population', *with_population.columns[1:-1]]
    data_frames[key] = with_population[column_order].copy()

## Add sick dataset (confirmed - recovered)

In [9]:
confirmed = data_frames['confirmed']
recovered = data_frames['recovered']

# Both frames must list the countries in the same order, otherwise the
# row-wise subtraction below would mix up countries.
assert (confirmed['Country/Region'] == recovered['Country/Region']).all()

# Columns from position 4 onward hold the per-date counts
count_columns = confirmed.columns[4:]

sick = confirmed.copy()
sick.loc[:, count_columns] = confirmed.loc[:, count_columns] - recovered.loc[:, count_columns]
data_frames['sick'] = sick

## Add daily dataset (number of new confirmed cases for each day)

In [10]:
cumulative = data_frames['confirmed']
count_columns = cumulative.columns[4:]
current_days, previous_days = count_columns[1:], count_columns[:-1]

# Counts of the previous day, relabelled so they align with the current day
previous_counts = cumulative.loc[:, previous_days].copy()
previous_counts.columns = current_days

# The first date column has no predecessor and keeps its cumulative value
daily = cumulative.copy()
daily.loc[:, current_days] = cumulative.loc[:, current_days] - previous_counts
data_frames['daily'] = daily

## R0 computation

In [11]:
# Estimate R0 per country and day from the growth rate of the cumulative
# confirmed counts: R0 = exp(K * Tau), with K the day-to-day derivative of
# log(confirmed).
confirmed_df = data_frames['confirmed']
R0_df = confirmed_df.copy()
date_columns = R0_df.columns[4:]
values = R0_df.loc[:, date_columns].values
Tau = 14  # NOTE(review): presumably the infectious period in days — confirm

# log() is undefined for counts <= 0. Pre-fill the output with NaN so those
# cells are explicitly missing; without `out`, numpy leaves them uninitialised
# and the gradient below would pick up arbitrary memory contents.
log_values = np.full(values.shape, np.nan)
np.log(values, out=log_values, where=values > 0)
K = np.gradient(log_values, axis=1)

# Clip to [0, 10] and ignore days with fewer than 100 confirmed cases
R0 = np.exp(K * Tau).clip(0, 10)
R0[values < 100] = np.nan

# Replace the count columns by the estimates (column replacement, so the
# integer count dtype does not constrain the float result)
R0_df[date_columns] = R0

# Blank out countries with a population under 1,000,000 or fewer than 10,000
# confirmed cases on the last available day
too_small = (R0_df['population'] < 1000000) | (confirmed_df[date_columns[-1]] < 10000)
R0_df.loc[too_small, date_columns] = np.nan

display(R0_df)
data_frames['R0'] = R0_df

Unnamed: 0,Country/Region,population,Lat,Long,2020-01-22,2020-01-23,2020-01-24,2020-01-25,2020-01-26,2020-01-27,...,2020-04-20,2020-04-21,2020-04-22,2020-04-23,2020-04-24,2020-04-25,2020-04-26,2020-04-27,2020-04-28,2020-04-29
0,Afghanistan,38041754,33.000000,65.000000,,,,,,,...,,,,,,,,,,
1,Albania,2880917,41.153300,20.168300,,,,,,,...,,,,,,,,,,
2,Algeria,43053054,28.033900,1.659600,,,,,,,...,,,,,,,,,,
3,Andorra,77142,42.506300,1.521800,,,,,,,...,,,,,,,,,,
4,Angola,31825295,-11.202700,17.873900,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
182,Vietnam,96462106,16.000000,108.000000,,,,,,,...,,,,,,,,,,
183,Western Sahara,582463,24.215500,-12.885800,,,,,,,...,,,,,,,,,,
184,Yemen,29161922,15.552727,48.516388,,,,,,,...,,,,,,,,,,
185,Zambia,17861030,-15.416700,28.283300,,,,,,,...,,,,,,,,,,


## Add world and continents rows

In [12]:
# Continent code to continent name
continents = {
    'AF': 'Africa',
    'AS': 'Asia',
    'EU': 'Europe',
    'NA': 'North America',
    'OC': 'Oceania',
    'SA': 'South America'
}

# Countries the pycountry_convert lookup fails on, assigned by hand
_MANUAL_CONTINENTS = {
    'Kosovo': 'Europe',
    'State of Palestine': 'Asia',
    'Timor-Leste': 'Asia',
    'Vatican City': 'Europe',
    'Western Sahara': 'Africa',
}


def country_to_continent(country):
    """
    From country name get continent name.

    The name is first resolved through pycountry_convert; if that lookup
    fails, the hand-maintained `_MANUAL_CONTINENTS` table is consulted.

    Parameters
    ----------
    country : str
        Country name as used in the 'Country/Region' column.

    Returns
    -------
    str
        Continent name (one of the values of `continents`).

    Raises
    ------
    ValueError
        If the country is unknown to both the library and the manual table.
    """
    try:
        country_code = pc.country_name_to_country_alpha2(country)
        continent = pc.country_alpha2_to_continent_code(country_code)
        return continents[continent]
    except Exception as err:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are no longer swallowed by the fallback.
        if country in _MANUAL_CONTINENTS:
            return _MANUAL_CONTINENTS[country]
        raise ValueError(f"Unknown country continent: {country}") from err

            
for key, countries in data_frames.items():
    # Re-running the cell must not aggregate already-aggregated rows
    # (and 'World' is not a country `country_to_continent` can resolve).
    if (countries['Country/Region'] == 'World').any():
        continue

    df = countries.copy()
    value_columns = df.columns[4:]

    # R0 is a rate, so it is averaged; every other dataset holds counts,
    # which are summed. The same rule is used for the world and the continents.
    how = 'mean' if key == 'R0' else 'sum'

    # Compute world aggregation (first row is only used as a template)
    world = df.loc[0, :].copy()
    world['Country/Region'] = "World"
    world['population'] = df['population'].sum()
    world['Lat'] = 0
    world['Long'] = 0
    world.loc[value_columns] = df.loc[:, value_columns].agg(how)
    world = pd.DataFrame(world).T

    # Compute continents aggregation: relabel each country by its continent
    df['Country/Region'] = df['Country/Region'].apply(country_to_continent)

    agg_dict = {'population': 'sum',
                'Lat': 'mean',
                'Long': 'mean'}
    agg_dict.update({column: how for column in value_columns})

    df = df.groupby('Country/Region').agg(agg_dict).reset_index()

    # Concat world, continents and countries
    data_frames[key] = pd.concat([world, df, countries]).reset_index(drop=True)

In [14]:
# Missing R0 values are stored as 0 (the frame is modified in place)
r0_frame = data_frames['R0']
r0_frame.fillna(value=0, inplace=True)

## Write back data

In [15]:
# One CSV per dataset, named after its key, without the row index
for name, frame in data_frames.items():
    frame.to_csv(f'generated/{name}.csv', index=False)

## Government measures generation

In [3]:
import pandas as pd
import numpy as np

import warnings
# Blanket filter: every warning category is ignored from this point on.
# NOTE(review): this also hides pandas warnings that may point at real problems — consider narrowing it.
warnings.filterwarnings('ignore')

In [4]:
# Load the raw government-measures file and keep only the columns used below
data_full = pd.read_csv('data/governments-measures.csv')

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of `data_full`
data = data_full[['COUNTRY', 'LOG_TYPE', 'CATEGORY', 'MEASURE', 'COMMENTS', 'ENTRY_DATE']].copy()

# Strip non-breaking spaces from the measure names (literal match, not a regex)
data['MEASURE'] = data['MEASURE'].str.replace('\xa0', '', regex=False)
data

Unnamed: 0,COUNTRY,LOG_TYPE,CATEGORY,MEASURE,COMMENTS,ENTRY_DATE
0,Afghanistan,Introduction / extension of measures,Public health measures,Health screenings in airports and border cross...,,14/03/2020
1,Afghanistan,Introduction / extension of measures,Public health measures,Isolation and quarantine policies,,14/03/2020
2,Afghanistan,Introduction / extension of measures,Public health measures,Awareness campaigns,,14/03/2020
3,Afghanistan,Introduction / extension of measures,Governance and socio-economic measures,Emergency administrative structures activated ...,,14/03/2020
4,Afghanistan,Introduction / extension of measures,Social distancing,Limit public gatherings,Nevruz festival cancelled,14/03/2020
...,...,...,...,...,...,...
10037,Zimbabwe,Phase-out measure,Social distancing,Limit public gatherings,The mining sector allowed to scale up operatio...,24/04/2020
10038,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Economic measures,A price moratorium has been introduced on good...,24/04/2020
10039,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Emergency administrative structures activated ...,"As of 24 April 2020, four (4) more laboratorie...",24/04/2020
10040,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Economic measures,Both residential & commercial properties will ...,30/04/2020


Match the country names to those used in our dataset of coronavirus cases, and delete the countries that are not in that dataset.

In [5]:
original = pd.read_csv('generated/confirmed.csv')

# Spelling in the measures file -> spelling in the generated case datasets
country_matching = {'Viet Nam': 'Vietnam',
                   'United States of America': 'United States',
                   'Russian Federation': 'Russia',
                   'Palestine': 'State of Palestine',
                   'North Macedonia Republic Of': 'North Macedonia',
                   'Moldova Republic Of': 'Moldova',
                   'Moldova Republic of': 'Moldova',
                   'Lao PDR': 'Laos',
                   'Korea Republic of': 'South Korea',
                   'kenya': 'Kenya',
                   'Czech republic': 'Czech Republic',
                   "Côte d'Ivoire": 'Ivory Coast',
                   'Cabo Verde': 'Cape Verde',
                   'Brunei Darussalam': 'Brunei',
                   'Congo DR': 'Democratic Republic of the Congo'}

# Countries that are dropped from the measures data
others = ['Turkmenistan', 'Vanuatu', 'Tuvalu', 'Tonga', 'Tajikistan', 'Solomon Islands', 
          'Samoa', 'Palau', 'Nauru', 'Micronesia', 'Marshall Islands', 'Lesotho', 'Korea DPR',
          'Kiribati', 'Comoros', 'China, Hong Kong Special Administrative Region']

# Filter and rename in one chain: `assign` returns a new frame, so no column
# is ever assigned on a filtered slice of the previous `data`
data = (
    data
    .loc[~data['COUNTRY'].isin(others), :]
    .assign(COUNTRY=lambda frame: frame['COUNTRY'].replace(country_matching))
)

# Sanity check: distinct measure countries with no counterpart in the case dataset
measure_countries = data['COUNTRY'].drop_duplicates()
number_countries_not_in_original_dataset = (~measure_countries.isin(original['Country/Region'])).sum()
print('Number of incorrectly named countries relative to original dataset:', number_countries_not_in_original_dataset)

Number of incorrectly named countries relative to original dataset: 0


In [6]:
# Rename by column name; `rename` returns a new frame and is a no-op when the
# cell is run a second time
data = data.rename(columns={'COUNTRY': 'country',
                            'LOG_TYPE': 'log_type',
                            'CATEGORY': 'category',
                            'MEASURE': 'measure',
                            'COMMENTS': 'comment',
                            'ENTRY_DATE': 'date'})

# Source dates are day-first (e.g. 14/03/2020). Without dayfirst=True, days
# <= 12 are read month-first (01/04/2020 would become 4 January).
data['date'] = pd.to_datetime(data['date'], dayfirst=True).dt.strftime('%Y-%m-%d')
data.to_csv('generated/governments-measures.csv', index=False)
data

Unnamed: 0,country,log_type,category,measure,comment,date
0,Afghanistan,Introduction / extension of measures,Public health measures,Health screenings in airports and border cross...,,2020-03-14
1,Afghanistan,Introduction / extension of measures,Public health measures,Isolation and quarantine policies,,2020-03-14
2,Afghanistan,Introduction / extension of measures,Public health measures,Awareness campaigns,,2020-03-14
3,Afghanistan,Introduction / extension of measures,Governance and socio-economic measures,Emergency administrative structures activated ...,,2020-03-14
4,Afghanistan,Introduction / extension of measures,Social distancing,Limit public gatherings,Nevruz festival cancelled,2020-03-14
...,...,...,...,...,...,...
10037,Zimbabwe,Phase-out measure,Social distancing,Limit public gatherings,The mining sector allowed to scale up operatio...,2020-04-24
10038,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Economic measures,A price moratorium has been introduced on good...,2020-04-24
10039,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Emergency administrative structures activated ...,"As of 24 April 2020, four (4) more laboratorie...",2020-04-24
10040,Zimbabwe,Introduction / extension of measures,Governance and socio-economic measures,Economic measures,Both residential & commercial properties will ...,2020-04-30
