In [1]:
import pandas as pd
import numpy as np

## Line List Data

In [17]:
line = pd.read_csv("COVID19_line_list_data.csv", index_col=0)
line.head(3)

Unnamed: 0_level_0,case_in_country,reporting date,Unnamed: 3,summary,location,country,gender,age,symptom_onset,If_onset_approximated,...,recovered,symptom,source,link,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,1/20/2020,,First confirmed imported COVID-19 pneumonia pa...,"Shenzhen, Guangdong",China,male,66.0,01/03/20,0.0,...,0,,Shenzhen Municipal Health Commission,http://wjw.sz.gov.cn/wzx/202001/t20200120_1898...,,,,,,
2,,1/20/2020,,First confirmed imported COVID-19 pneumonia pa...,Shanghai,China,female,56.0,1/15/2020,0.0,...,0,,Official Weibo of Shanghai Municipal Health Co...,https://www.weibo.com/2372649470/IqogQhgfa?fro...,,,,,,
3,,1/21/2020,,First confirmed imported cases in Zhejiang: pa...,Zhejiang,China,male,46.0,01/04/20,0.0,...,0,,Health Commission of Zhejiang Province,http://www.zjwjw.gov.cn/art/2020/1/21/art_1202...,,,,,,


In [18]:
line.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1085 entries, 1 to 1085
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   case_in_country        888 non-null    float64
 1   reporting date         1084 non-null   object 
 2   Unnamed: 3             0 non-null      float64
 3   summary                1080 non-null   object 
 4   location               1085 non-null   object 
 5   country                1085 non-null   object 
 6   gender                 902 non-null    object 
 7   age                    843 non-null    float64
 8   symptom_onset          563 non-null    object 
 9   If_onset_approximated  560 non-null    float64
 10  hosp_visit_date        507 non-null    object 
 11  exposure_start         128 non-null    object 
 12  exposure_end           341 non-null    object 
 13  visiting Wuhan         1085 non-null   int64  
 14  from Wuhan             1081 non-null   float64
 15  deat

In [302]:
line_clean = line.drop(columns=[
    "Unnamed: 3",
    "Unnamed: 21",
    "Unnamed: 22",
    "Unnamed: 23",
    "Unnamed: 24",
    "Unnamed: 25",
    "Unnamed: 26",
])
line_clean = line_clean.rename(columns={
    "reporting date": "reporting_date",
    "If_onset_approximated": "onset_approximated",
    "visiting Wuhan": "visited_Wuhan",
    "from Wuhan": "from_Wuhan",
})

line_clean = line_clean.drop(columns=[
    "case_in_country", "source", "link"
])

# datetime
date_cols = [
    "reporting_date", "symptom_onset", "hosp_visit_date", "exposure_start", "exposure_end", "death", "recovered"
]
for dc in date_cols:
    line_clean[dc] = pd.to_datetime(line_clean[dc], errors="coerce")

# bool
bool_cols = [
    "onset_approximated", "visited_Wuhan", "from_Wuhan"
]
for bc in bool_cols:
    line_clean[bc] = line_clean[bc].astype(bool)


In [303]:
line_clean.head()

Unnamed: 0_level_0,reporting_date,summary,location,country,gender,age,symptom_onset,onset_approximated,hosp_visit_date,exposure_start,exposure_end,visited_Wuhan,from_Wuhan,death,recovered,symptom
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2020-01-20,First confirmed imported COVID-19 pneumonia pa...,"Shenzhen, Guangdong",China,male,66.0,2020-01-03,False,2020-01-11,2019-12-29,2020-01-04,True,False,NaT,NaT,
2,2020-01-20,First confirmed imported COVID-19 pneumonia pa...,Shanghai,China,female,56.0,2020-01-15,False,2020-01-15,NaT,2020-01-12,False,True,NaT,NaT,
3,2020-01-21,First confirmed imported cases in Zhejiang: pa...,Zhejiang,China,male,46.0,2020-01-04,False,2020-01-17,NaT,2020-01-03,False,True,NaT,NaT,
4,2020-01-21,new confirmed imported COVID-19 pneumonia in T...,Tianjin,China,female,60.0,NaT,True,2020-01-19,NaT,NaT,True,False,NaT,NaT,
5,2020-01-21,new confirmed imported COVID-19 pneumonia in T...,Tianjin,China,male,58.0,NaT,True,2020-01-14,NaT,NaT,False,False,NaT,NaT,


## Population Data

In [247]:
uk_pop = pd.read_csv("United Kingdom-2019.csv")
fr_pop = pd.read_csv("France-2019.csv")
it_pop = pd.read_csv("Italy-2019.csv")

In [250]:
def process(data):
    data.loc[data["Age"]=="100+", "Age"] = "100"
    data["Age_low"] = data["Age"].str.split("-", expand=True)[0].astype(float)
    data["Age_high"] = data["Age"].str.split("-", expand=True)[1].astype(float)
    data = data.rename(columns={"Age" : "Age_band"})

    def f(g):
        low, high = g.iloc[0]["Age_low"], g.iloc[0]["Age_high"]+1
        if low == 100:
            df = g.copy()
            df["Age"] = df["Age_band"]
            return df

        df = g.reindex(
            np.arange(low, high), method='ffill'
        ).reset_index().rename(columns={"index": "Age"})
        df["Age"] = df["Age"].astype(int)
        return df

    data = data.groupby('Age_band', group_keys=False).apply(f)
    return data[["Age", "M", "F"]]

expanded_uk_pop = process(uk_pop)
expanded_fr_pop = process(fr_pop)
expanded_it_pop = process(it_pop)

## Measures

In [276]:
measures = pd.read_csv("COVID 19 Containment measures data.csv")
measures = measures[measures["Country"].isin(["France", "United Kingdom", "Italy"])]

measures = measures.drop(columns=[
    "Applies To", "Implementing City", "Implementing State/Province", 
    "Quantity", "Source", "Target city", "Target country", "Target region", "Target state"
])

measures["Date Start"] = pd.to_datetime(measures["Date Start"])
measures["Date end intended"] = pd.to_datetime(measures["Date end intended"])

In [277]:
measures.head()

Unnamed: 0,ID,Country,Date Start,Date end intended,Description of measure implemented,Exceptions,Keywords
2,578,United Kingdom,2020-03-20,NaT,"All schools, nurseries and colleges closed.",,"nursery school closure, school closure, univer..."
3,372,United Kingdom,2020-03-16,NaT,If one person in any household has a persisten...,,blanket isolation - symptoms
4,357,United Kingdom,2020-03-16,NaT,"By the weekend, those with the most serious he...",,isolation advice to elderly
5,356,United Kingdom,2020-03-16,NaT,Everyone should avoid gatherings with friends ...,,"public announcement, social distancing"
6,373,United Kingdom,2020-03-14,NaT,People who are self-isolating with mild sympto...,,"end of testing, testing criteria tightened"


## Time Series

In [307]:
infected = pd.read_csv("time_series_covid_19_confirmed.csv")
recovered = pd.read_csv("time_series_covid_19_recovered.csv")
deaths = pd.read_csv("time_series_covid_19_deaths.csv")

In [308]:
infected.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,4/5/20,4/6/20,4/7/20,4/8/20,4/9/20,4/10/20,4/11/20,4/12/20,4/13/20,4/14/20
0,,Afghanistan,33.0,65.0,0,0,0,0,0,0,...,349,367,423,444,484,521,555,607,665,714
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,361,377,383,400,409,416,433,446,467,475
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,1320,1423,1468,1572,1666,1761,1825,1914,1983,2070
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,501,525,545,564,583,601,601,638,646,659
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,14,16,17,19,19,19,19,19,19,19
