In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the containment-measures table; the two date columns are parsed into
# datetimes at read time so they can be compared against timestamps later.
cm = pd.read_csv(
    'COVID 19 Containment measures data.csv',
    parse_dates=['Date Start', 'Date end intended'],
)


In [4]:
# Johns Hopkins CSSE global time series, fetched over HTTP:
# confirmed cases first, then deaths.
jhcc = pd.read_csv(
    'https://github.com/CSSEGISandData/COVID-19/raw/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'
)
jhd = pd.read_csv(
    'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'
)
# Maps Johns Hopkins location labels to the country names used in the
# containment-measures table.  A plain string key matches on Country/Region
# alone; a (Country/Region, Province/State) tuple key matches one territory.
JH2CM = {
    'Korea, South':'South Korea',
    'Taiwan*':'Taiwan',
    ('China','Hong Kong'):'Hong Kong',
    ('China','Macau'):'Macau',
    # The Johns Hopkins series files Faroe Islands under Denmark, so this is
    # the key that actually matches.
    ('Denmark','Faroe Islands'):'Faroe Islands',
    # Original (non-matching) key kept for backward compatibility.
    ('Netherlands','Faroe Islands'):'Faroe Islands',
    'US':'United States'
}


def jh2cm(c,s):
    """Translate a Johns Hopkins location into its JH2CM name.

    Parameters
    ----------
    c : the Country/Region label.
    s : the Province/State label.

    Returns
    -------
    The JH2CM entry for ``c`` if one exists, otherwise the entry for the
    ``(c, s)`` pair, otherwise ``c`` unchanged.
    """
    # Country-level keys take precedence over (country, province) keys.
    for lookup_key in (c, (c, s)):
        if lookup_key in JH2CM:
            return JH2CM[lookup_key]
    return c

def pre_jh(jh,val="Confirmed Cases"):
    """Reshape a wide Johns Hopkins time-series frame into long form.

    Parameters
    ----------
    jh : DataFrame with 'Country/Region', 'Province/State', 'Lat', 'Long'
        columns plus one column per date.  The input frame is not modified.
    val : name given to the value column of the result.

    Returns
    -------
    DataFrame with columns 'Country/Region', 'Date' (datetime) and ``val``,
    holding one row per (country, date) with the values summed over all rows
    that share the same remapped country name.

    Raises
    ------
    KeyError if any of the four non-date columns is missing.
    """
    # Remap names once (jh2cm is idempotent, so a second pass adds nothing)
    # and work on a new frame instead of mutating the caller's.
    jh = jh.assign(**{
        'Country/Region': [jh2cm(c,s) for c,s in jh[['Country/Region','Province/State']].values]
    })

    # Drop every non-date column except the grouping key.  Leaving the string
    # 'Province/State' column in the aggregation made the positional
    # `columns[2:]` slice unreliable: when pandas discards that column during
    # the sum, the slice skips the first date column.
    jh = jh.drop(columns=['Lat','Long','Province/State'])
    jh = jh.groupby('Country/Region').sum().reset_index()

    # With value_vars omitted, melt unpivots every column that is not in
    # id_vars, i.e. all date columns.
    jh = pd.melt(jh,id_vars=['Country/Region'],value_name=val,var_name='Date')
    jh['Date'] = pd.to_datetime(jh['Date'])
    return jh

# Long-format confirmed cases and deaths, one row per (country, date).
jhcc = pre_jh(jhcc)
jhd = pre_jh(jhd,val='Deaths')

# Attach deaths by key rather than by row position, so the result stays
# correct even if the two source files do not list identical locations/dates.
jhcc = jhcc.merge(
    jhd[['Country/Region','Date','Deaths']],
    on=['Country/Region','Date'],
    how='left',
)

In [5]:
# Feature columns aggregated with max(): each feature maps its keywords to a
# default quantity.  A NaN default means no fixed value is defined for the
# keyword here.
MAX_COLS = {
    'Symptomatic isolation - targeted': {
        'contact isolation - symptoms': 1,
        'cohort isolation - symptoms': 1,
    },
    'Symptomatic isolation - blanket': {
        'cluster isolation - symptoms': 1,
        'blanket isolation - symptoms': 2,
    },
    'Asymptomatic isolation - targeted': {
        'contact isolation - no symptoms': 1,
        'cohort isolation - no symptoms': 2,
    },
    'Asymptomatic isolation - blanket': {
        'cluster isolation - no symptoms': 1,
        'blanket isolation - no symptoms': 3,
        'blanket curfew - no symptoms': 2,
        'natural village quarantine': 3,
    },
    'Domestic travel restriction': {
        'domestic traveller quarantine': 1,
        'domestic travel ban': 2,
        'total vehicle ban': 2,
    },
    'International travel restriction': {
        'international traveller screening - risk countries': 1,
        'international traveller screening - all countries': 2,
        'international traveller quarantine - risk countries': 3,
        'international traveller quarantine - all countries': 4,
        'international travel ban - risk countries': 5,
        'international travel ban - all countries': 6,
    },
    'Testing': {'testing numbers total': np.nan},
    'Contact tracing': {'contacts traced total': np.nan},
    'Mask wearing': {'public mask wearing data': np.nan},
    'Hand washing': {'public handwashing data': np.nan},
}

# Feature columns aggregated with min() over the listed keywords.
MIN_COLS = {
    'Gatherings banned': [
        'indoor gatherings banned',
        'outdoor gatherings banned',
    ],
}

# Feature columns scored by counting how many of the listed keywords have
# been seen (see sum_kws).
CUMSUM_COLS = {
    'Healthcare specialisation': [
        'clinic specialisation',
        'case transport',
        'quarantine zone',
        'hospital specialisation',
        'healthcare entry screening',
        'remote medical treatment',
        'visiting in hospital banned',
    ],
    'Public education and incentives': [
        'risk communication',
        'community engagement',
        'coronavirus education activities',
        'handwashing encouragement',
        'public mask encouragement',
        'public mask supply',
        'public mask and hygiene supply',
        'public hand sanitizer supply',
        'phone line',
    ],
    'Assisting people to stay home': [
        'unemployment benefits extension',
        'eviction moratorium',
        'isolation allowance',
        'compulsory isolation',
    ],
    'Public cleaning': [
        'public transport cleaning',
        'public facility cleaning',
    ],
    'Miscellaneous hygiene measures': [
        'funeral hygiene',
        'cash cleaning',
        'cash banned',
    ],
    'Public interaction reduction': [
        'handshakes banned',
        'social distancing advice',
        'stay home',
        'space minimum',
        'funeral hygiene',
        'outdoor person density',
        'indoor person density',
        'public venue screening',
    ],
    'Nonessential business suspension': [
        'nonessential business suspension',
        'remote work',
        'closure of gathering places',
        'restaurant limitations',
    ],
    'School closure': [
        'school closure',
        'university closure',
        'nursery school closure',
        'remote schooling',
        'public transport stopped',
    ],
    'Activity cancellation': [
        'activity cancellation - other',
        'sports cancellation',
        'religious activity cancellation',
        'religious activity limitations',
        'weddings canceled',
        'very large event cancellation or postponement',
        'cultural activity limitation',
        'remote cultural content',
    ],
    'Resumption': [
        'public transport resumed',
        'activity resumed',
        'business resumed',
    ],
    'Diagnostic criteria loosened': ['diagnostic criteria loosened'],
    'Diagnostic criteria tightened': ['diagnostic criteria tightened'],
}

# Testing-policy feature: each keyword carries a default quantity that
# test_kws combines into a single score.
TEST_COLS = {
    'Testing criteria': {
        'test all': 1,
        'test symptomatic': 0.5,
        'cluster testing': 0.3,
        'test contacts': 0.1,
        'test cohorts': 0.2,
        'test travellers': 0.1,
        'test medical staff': 0.1,
        'test vulnerable': 0.1,
    },
}

def default_values(kw):
    """Return the fixed default quantity configured for a keyword.

    Looks ``kw`` up in every keyword->quantity mapping of MAX_COLS and
    TEST_COLS and returns the first non-missing value found.

    Parameters
    ----------
    kw : keyword to look up.

    Returns
    -------
    The configured quantity, or ``np.nan`` when the keyword is unknown or
    its configured default is NaN.
    """
    for v in {**MAX_COLS,**TEST_COLS}.values():
        # NaN never compares equal to anything (so `x != np.nan` is always
        # True); a real missing-value test is required here.
        if (kw in v) and pd.notna(v[kw]):
            return v[kw]
    return np.nan

def keywords(kws_quants):
    """Explode comma-separated keyword strings into one row per keyword.

    Parameters
    ----------
    kws_quants : DataFrame whose first column holds ', '-separated keyword
        strings and whose second column holds the associated quantity.

    Returns
    -------
    DataFrame with columns 'Keywords' and 'Quantity'.  Each keyword inherits
    the quantity of the row it came from, unless ``default_values`` supplies
    a non-missing default for it, in which case the default wins.
    """
    pairs = []
    for record in kws_quants.values:
        # str() so that non-string cells are still split (as their text form).
        for tag in str(record[0]).split(', '):
            pairs.append((tag, record[1]))

    res = pd.DataFrame(pairs, columns=['Keywords','Quantity'])
    configured = res['Keywords'].apply(default_values)
    # Fall back to the recorded quantity wherever no default is configured.
    res['Quantity'] = configured.fillna(res['Quantity'])
    return res

def sum_kws(kws_quants,tags):
    """Count how many distinct keywords in ``kws_quants`` belong to ``tags``.

    Parameters
    ----------
    kws_quants : DataFrame with a 'Keywords' column.
    tags : collection of keywords to look for.

    Returns
    -------
    Number of distinct values of 'Keywords' that are members of ``tags``.
    """
    distinct_keywords = kws_quants['Keywords'].drop_duplicates()
    hits = distinct_keywords.isin(tags)
    return hits.sum()

def max_kws(kws_quants,tags):
    """Largest 'Quantity' among the rows whose keyword is in ``tags``.

    Parameters
    ----------
    kws_quants : DataFrame with 'Keywords' and 'Quantity' columns.
    tags : collection of keywords to consider.

    Returns
    -------
    The maximum quantity of the matching rows (NaN when nothing matches).
    """
    matching = kws_quants['Keywords'].isin(tags)
    quantities = kws_quants.loc[matching, 'Quantity']
    return quantities.max()

def min_kws(kws_quants,tags):
    """Smallest 'Quantity' among the rows whose keyword is in ``tags``.

    Parameters
    ----------
    kws_quants : DataFrame with 'Keywords' and 'Quantity' columns.
    tags : collection of keywords to consider.

    Returns
    -------
    The minimum quantity of the matching rows (NaN when nothing matches).
    """
    matching = kws_quants['Keywords'].isin(tags)
    quantities = kws_quants.loc[matching, 'Quantity']
    return quantities.min()

def test_kws(kws_quants,tags):
    if 'test all' in kws_quants['Keywords']:
        return 1
    elif 'test symptomatic' in kws_quants['Keywords']:
        return 0.5
    else:
        return kws_quants[kws_quants['Keywords'].isin(tags)]['Quantity'].sum()


# Every feature column, in MAX -> MIN -> CUMSUM -> TEST order; a later group
# would override an earlier one on a name clash.
jdict = {
    feature: spec
    for group in (MAX_COLS, MIN_COLS, CUMSUM_COLS, TEST_COLS)
    for feature, spec in group.items()
}


In [None]:
# Column-wise accumulator: one list per feature plus 'Date' and 'Country'.
# 'Country' is deliberately the last key, so it is the last DataFrame column.
data_dict = {k:[] for k in jdict.keys()}
data_dict['Date'] = []
data_dict['Country'] = []

# Loop invariants, computed once instead of on every iteration.
first_day = cm['Date Start'].min()
last_day = cm['Date Start'].max()

# Split the measures table per country once, so the inner loop filters a
# small per-country frame instead of masking the full table twice per
# (date, country) pair.  A missing (NaN) country yields an empty frame.
measures_by_country = {
    c: cm.loc[cm['Country']==c, ['Date Start','Keywords','Quantity']]
    for c in cm['Country'].unique()
}

for d in pd.date_range(first_day,last_day):
    print(d,' of ',last_day)
    for c, country_measures in measures_by_country.items():
        data_dict['Country'].append(c)
        data_dict['Date'].append(d)

        # Measures count from their start date onwards (no end date applied).
        started = country_measures['Date Start']<=d
        if started.any():
            kws_quants = keywords(country_measures.loc[started, ['Keywords','Quantity']])
        else:
            kws_quants = pd.DataFrame({'Keywords':[],'Quantity':[]})

        for col in MAX_COLS:
            data_dict[col].append(max_kws(kws_quants,MAX_COLS[col].keys()))
        for col in CUMSUM_COLS:
            data_dict[col].append(sum_kws(kws_quants,CUMSUM_COLS[col]))
        for col in TEST_COLS:
            data_dict[col].append(test_kws(kws_quants,TEST_COLS[col].keys()))
        for col in MIN_COLS:
            data_dict[col].append(min_kws(kws_quants,MIN_COLS[col]))
            

2019-12-18 00:00:00  of  2020-04-01 00:00:00
2019-12-19 00:00:00  of  2020-04-01 00:00:00
2019-12-20 00:00:00  of  2020-04-01 00:00:00
2019-12-21 00:00:00  of  2020-04-01 00:00:00
2019-12-22 00:00:00  of  2020-04-01 00:00:00
2019-12-23 00:00:00  of  2020-04-01 00:00:00
2019-12-24 00:00:00  of  2020-04-01 00:00:00
2019-12-25 00:00:00  of  2020-04-01 00:00:00
2019-12-26 00:00:00  of  2020-04-01 00:00:00
2019-12-27 00:00:00  of  2020-04-01 00:00:00
2019-12-28 00:00:00  of  2020-04-01 00:00:00
2019-12-29 00:00:00  of  2020-04-01 00:00:00
2019-12-30 00:00:00  of  2020-04-01 00:00:00
2019-12-31 00:00:00  of  2020-04-01 00:00:00
2020-01-01 00:00:00  of  2020-04-01 00:00:00
2020-01-02 00:00:00  of  2020-04-01 00:00:00
2020-01-03 00:00:00  of  2020-04-01 00:00:00
2020-01-04 00:00:00  of  2020-04-01 00:00:00
2020-01-05 00:00:00  of  2020-04-01 00:00:00
2020-01-06 00:00:00  of  2020-04-01 00:00:00
2020-01-07 00:00:00  of  2020-04-01 00:00:00
2020-01-08 00:00:00  of  2020-04-01 00:00:00
2020-01-09

In [None]:
# One row per (date, country); rows without a country label are discarded.
unmerged_data = pd.DataFrame(data_dict)
unmerged_data = unmerged_data.dropna(subset=['Country'])

In [None]:
# Rows whose country label starts with 'US:' are treated as individual states.
is_us_state = unmerged_data['Country'].str.startswith('US:', na=False)
US_STATES = unmerged_data.loc[is_us_state, 'Country'].unique()

# Replace the national 'United States' rows with the per-date mean over the
# state rows.  Only numeric feature columns can be averaged; 'Date' and
# 'Country' are left untouched.
is_us = unmerged_data['Country']=='United States'
if is_us_state.any() and is_us.any():
    numeric_cols = unmerged_data.select_dtypes(include='number').columns
    state_means = unmerged_data.loc[is_us_state].groupby('Date')[numeric_cols].mean()

    # Means are fractional, so integer count columns are widened first.
    unmerged_data[numeric_cols] = unmerged_data[numeric_cols].astype(float)

    # Line the means up with each national row by its date, then assign
    # positionally: the groupby result is indexed by Date while the target
    # rows are not, so label alignment would not match them up.
    unmerged_data.loc[is_us, numeric_cols] = (
        state_means.reindex(unmerged_data.loc[is_us, 'Date']).to_numpy()
    )

# Keep only countries that also appear in the Johns Hopkins data.  This must
# run while jhcc still carries the 'Country/Region' column name.
djh = unmerged_data[unmerged_data['Country'].isin(jhcc['Country/Region'])]

In [268]:
# Give the Johns Hopkins frame the same key column name as the features
# frame, then inner-join the two on (Date, Country).
jhcc = jhcc.rename(columns={'Country/Region': 'Country'})
jh_merged_data = pd.merge(djh, jhcc, on=['Date', 'Country'])

In [177]:
# Persist both outputs: the features alone, and the features joined with the
# Johns Hopkins case/death counts.
for frame, out_path in (
    (unmerged_data, 'countermeasures_features_2020_03_30.csv'),
    (jh_merged_data, 'countermeasures_db_johnshopkins_2020_03_30.csv'),
):
    frame.to_csv(out_path)