In [2]:
import os
import sys

import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.dates as mdates
import matplotlib.ticker as ticker

import seaborn as sns
import statsmodels.api as sm
import scipy.optimize as optim

from IPython.display import Image

import warnings
# Install a blanket 'ignore' filter: warnings raised after this point are not
# shown. Note that this also hides deprecation and pandas copy warnings.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# set font size
# plt.rcParams['font.size'] = 24
# plt.rcParams.update({'figure.figsize': (10, 7), 'figure.dpi': 120, 'font.size':10})

# Ask the inline backend to emit figures in the 'retina' format.
%config InlineBackend.figure_format='retina'

In [4]:
# Root folder holding the COVID-19 CSV inputs read by the cells below.
# Set the COVID19_DATA_DIR environment variable to point the notebook at a
# different location; the original path stays as the fallback so existing
# setups behave exactly as before.
data_dir = os.environ.get('COVID19_DATA_DIR', '/Users/genie/data/covid-19/')

In [5]:
# Recursively list every file below the data directory as a full path,
# one per line, in the order os.walk yields them.
for root, _, names in os.walk(data_dir):
    for name in names:
        print(os.path.join(root, name))

/Users/genie/data/covid-19/.DS_Store
/Users/genie/data/covid-19/jhu_time_series_covid19_deaths_global_narrow.csv
/Users/genie/data/covid-19/owid-covid-data.csv
/Users/genie/data/covid-19/eiu_democracy_indices.csv
/Users/genie/data/covid-19/jhu_time_series_covid19_confirmed_global_narrow.csv
/Users/genie/data/covid-19/jhu_time_series_covid19_recovered_global_narrow.csv
/Users/genie/data/covid-19/us_daily.csv


In [6]:
%%time

# Load the OWID dataset, keep only rows that carry an ISO code, restrict it to
# the columns used by this notebook, and expose 'location' as 'country_name'.
data = (
    pd.read_csv(os.path.join(data_dir, 'owid-covid-data.csv'))
    .loc[
        lambda frame: frame['iso_code'].notnull(),
        [
            'iso_code', 'location', 'date', 'total_cases', 'new_cases',
            'total_deaths', 'new_deaths', 'total_tests', 'new_tests',
            'stringency_index', 'population', 'population_density', 'median_age',
            'aged_65_older', 'gdp_per_capita',
        ],
    ]
    .rename(columns={'location': 'country_name'})
)


# Merge recoveries data from JHU dataset
# First collapse the JHU rows to a single cumulative total per (iso_code, date).
jhu_recovered = (
    pd.read_csv(os.path.join(data_dir, 'jhu_time_series_covid19_recovered_global_narrow.csv'))
    .rename(columns={'iso_country_code': 'iso_code', 'recovered': 'total_recoveries'})
    .groupby(['iso_code', 'date'])['total_recoveries']
    .sum()
    .reset_index()
)
# Re-format the dates as '%Y-%m-%d' strings; the merge below joins on this column.
jhu_recovered['date'] = pd.to_datetime(jhu_recovered['date']).dt.strftime('%Y-%m-%d')
# Daily recoveries = cumulative total minus the same country's previous total
# (previous in date order). The shifted Series keeps the original row labels,
# so the subtraction lines up row by row.
jhu_recovered['new_recoveries'] = jhu_recovered['total_recoveries'] - (
    jhu_recovered
    .sort_values(by=['date'], ascending=True)
    .groupby(['iso_code'])['total_recoveries']
    .shift(1)
)
data = data.merge(jhu_recovered, how='left', on=['iso_code', 'date'])
del jhu_recovered


# Merge democracy indices data
# Only the 2019 index values are used; ISO codes are upper-cased before joining.
demox_2019 = (
    pd.read_csv(os.path.join(data_dir, 'eiu_democracy_indices.csv'))
    .loc[lambda frame: frame['year'] == 2019, ['iso_code', 'demox_eiu']]
    .assign(iso_code=lambda frame: frame['iso_code'].str.upper())
)
data = data.merge(demox_2019, how='left', on=['iso_code'])
del demox_2019

CPU times: user 3.77 s, sys: 72.4 ms, total: 3.84 s
Wall time: 3 s


In [7]:
## FEATURES

def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, giving NaN where the denominator is 0.

    Plain pandas division yields +/-inf for x/0, which then leaks into any
    later mean, plot or regression. Masking the zero denominators first turns
    those rows into missing values instead.
    """
    return numerator / denominator.where(denominator != 0)


# --- Testing -----------------------------------------------------------------
data['tests_per_capita'] = data['total_tests']/data['population']
data['tests_per_1000'] = data['total_tests']/(data['population']/1000)
data['cases_per_1000'] = data['total_cases']/(data['population']/1000)
data['tests_per_confirmed_case'] = _safe_ratio(data['new_tests'], data['new_cases'])

# --- Case positivity (cases as a percentage of tests) ---------------------------
data['cpr_daily'] = _safe_ratio(data['new_cases'], data['new_tests']) * 100
data['cpr_total'] = _safe_ratio(data['total_cases'], data['total_tests']) * 100
data['share_of_infected_population'] = data['total_cases']/data['population'] * 100
# Legacy alias: this column was originally created with a trailing space in its
# name. The old key is kept so existing lookups on it keep working; prefer the
# clean name above in new code.
data['share_of_infected_population '] = data['share_of_infected_population']

# --- Fatalities (deaths as a percentage of cases) -------------------------------
data['cfr_daily'] = _safe_ratio(data['new_deaths'], data['new_cases']) * 100
data['cfr_total'] = _safe_ratio(data['total_deaths'], data['total_cases']) * 100
data['deaths_per_1000'] = data['total_deaths']/(data['population']/1000)

# --- Recoveries (recoveries as a percentage of cases) ---------------------------
data['crr_daily'] = _safe_ratio(data['new_recoveries'], data['new_cases']) * 100
data['crr_total'] = _safe_ratio(data['total_recoveries'], data['total_cases']) * 100
data['recoveries_per_1000'] = data['total_recoveries']/(data['population']/1000)
data['recoveries_per_confirmed_case'] = _safe_ratio(data['new_recoveries'], data['new_cases'])


### Calculate days until first case, days until 1000 cases

In [8]:
# Work on a copy of the rows that have an ISO code, leaving out the
# 'OWID_KOS' and 'OWID_WRL' entries.
df = data.loc[
    data['iso_code'].notnull() & ~data['iso_code'].isin(['OWID_KOS', 'OWID_WRL'])
].copy()

# Earliest date per country with a positive cumulative case count.
first_seen = (
    df.loc[df['total_cases'] > 0]
    .groupby(['iso_code'])['date']
    .min()
    .rename('first_case_date')
    .reset_index()
)
df = df.merge(first_seen, how='left', on=['iso_code'])

# df['days_since_first_case'] = (pd.to_datetime(df['date'])-pd.to_datetime(df['first_case_date'])).dt.days
# Days from the earliest date present in the frame to each country's first case.
df['days_until_first_case'] = (
    pd.to_datetime(df['first_case_date']) - pd.to_datetime(df['date'].min())
).dt.days

# Earliest date per country with at least 1000 cumulative cases, and how many
# days after the first case that was.
reached_1000 = (
    df.loc[df['total_cases'] >= 1000]
    .groupby(['iso_code'])['date']
    .min()
    .rename('cases_1000_date')
    .reset_index()
)
df = df.merge(reached_1000, how='left', on=['iso_code'])
df['days_until_1000_cases'] = (
    pd.to_datetime(df['cases_1000_date']) - pd.to_datetime(df['first_case_date'])
).dt.days

# Keep only the per-country summary columns and collapse the repeated daily rows.
df = df[[
    'iso_code', 'country_name',
    'first_case_date', 'days_until_first_case',
    'cases_1000_date', 'days_until_1000_cases',
]].drop_duplicates()

del first_seen
del reached_1000