In [1]:
import pandas as pd
import numpy as np
import datetime
import os
import json

# General Dataset

## 1 General Johns Hopkins University Dataset

In [97]:
df = pd.read_csv("data/Enigma-JHU.csv")

In [98]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198717 entries, 0 to 198716
Data columns (total 12 columns):
fips              177301 non-null float64
admin2            177260 non-null object
province_state    185378 non-null object
country_region    198717 non-null object
last_update       198717 non-null object
latitude          198374 non-null float64
longitude         198374 non-null float64
confirmed         198698 non-null float64
deaths            198276 non-null float64
recovered         198329 non-null float64
active            193703 non-null float64
combined_key      198717 non-null object
dtypes: float64(7), object(5)
memory usage: 18.2+ MB


In [99]:
df["last_update"] = df["last_update"].apply(lambda x : datetime.datetime.strptime(x[:10], '%Y-%m-%d'))

In [100]:
df = df.fillna(0)

In [209]:
df.head()

Unnamed: 0,fips,admin2,province_state,country_region,last_update,latitude,longitude,confirmed,deaths,recovered,active,combined_key
0,0.0,0,Anhui,China,2020-01-22,31.826,117.226,1.0,0.0,0.0,0.0,"Anhui, China"
1,0.0,0,Beijing,China,2020-01-22,40.182,116.414,14.0,0.0,0.0,0.0,"Beijing, China"
2,0.0,0,Chongqing,China,2020-01-22,30.057,107.874,6.0,0.0,0.0,0.0,"Chongqing, China"
3,0.0,0,Fujian,China,2020-01-22,26.079,117.987,1.0,0.0,0.0,0.0,"Fujian, China"
4,0.0,0,Gansu,China,2020-01-22,36.061,103.834,0.0,0.0,0.0,0.0,"Gansu, China"


In [210]:
per_day = pd.DataFrame(df.groupby(['country_region', 'last_update'])['confirmed', 'deaths', 'recovered'].sum())

In [211]:
per_day = per_day.reset_index()

In [213]:
per_day['last_update'] = per_day['last_update'].apply(lambda x: x.strftime("%d-%m-%Y"))

In [214]:
per_day['last_update'].max()

'31-03-2020'

In [115]:
per_day.describe()

Unnamed: 0,confirmed,deaths,recovered
count,13704.0,13704.0,13704.0
mean,12178.82,800.06188,3800.486135
std,69749.54,4772.053779,17441.367366
min,0.0,0.0,0.0
25%,31.0,0.0,2.0
50%,336.0,6.0,41.0
75%,2463.0,56.0,561.0
max,1600937.0,95979.0,350135.0


In [215]:
per_day.head()

Unnamed: 0,country_region,last_update,confirmed,deaths,recovered
0,Afghanistan,24-02-2020,1.0,0.0,0.0
1,Afghanistan,08-03-2020,4.0,0.0,0.0
2,Afghanistan,10-03-2020,5.0,0.0,0.0
3,Afghanistan,11-03-2020,14.0,0.0,0.0
4,Afghanistan,14-03-2020,11.0,0.0,0.0


In [216]:
per_day_sankey = per_day[per_day['last_update']=='23-05-2020']

In [217]:
per_day_sankey

Unnamed: 0,country_region,last_update,confirmed,deaths,recovered
70,Afghanistan,23-05-2020,9216.0,205.0,996.0
144,Albania,23-05-2020,981.0,31.0,777.0
223,Algeria,23-05-2020,7918.0,582.0,4256.0
294,Andorra,23-05-2020,762.0,51.0,652.0
358,Angola,23-05-2020,60.0,3.0,17.0
...,...,...,...,...,...
13484,West Bank and Gaza,23-05-2020,423.0,2.0,346.0
13532,Western Sahara,23-05-2020,6.0,0.0,6.0
13575,Yemen,23-05-2020,209.0,33.0,11.0
13639,Zambia,23-05-2020,920.0,7.0,336.0


### D3

In [218]:
distinct_countries = per_day['country_region'].unique()

In [219]:
dict_ = {}

In [220]:
for country in distinct_countries:
    dict_country = {}
    
    array_data = []
    df_country = per_day[per_day['country_region'] == country].reset_index()
    
    for i in range(df_country.shape[0]):
        dict_country_date = {}
        dict_country_date['date'] = df_country['last_update'][i]
        dict_country_date['cases'] = str(int(df_country['confirmed'][i]))
        dict_country_date['deaths'] = str(int(df_country['deaths'][i]))
        dict_country_date['recovered'] = str(int(df_country['recovered'][i]))
        
        array_data.append(dict_country_date)
        
    dict_country['name'] = country
    dict_country['values'] = array_data
    
    dict_[country] = dict_country

In [221]:
dict_

{'Afghanistan': {'name': 'Afghanistan',
  'values': [{'date': '24-02-2020',
    'cases': '1',
    'deaths': '0',
    'recovered': '0'},
   {'date': '08-03-2020', 'cases': '4', 'deaths': '0', 'recovered': '0'},
   {'date': '10-03-2020', 'cases': '5', 'deaths': '0', 'recovered': '0'},
   {'date': '11-03-2020', 'cases': '14', 'deaths': '0', 'recovered': '0'},
   {'date': '14-03-2020', 'cases': '11', 'deaths': '0', 'recovered': '0'},
   {'date': '15-03-2020', 'cases': '16', 'deaths': '0', 'recovered': '0'},
   {'date': '16-03-2020', 'cases': '21', 'deaths': '0', 'recovered': '1'},
   {'date': '17-03-2020', 'cases': '22', 'deaths': '0', 'recovered': '1'},
   {'date': '20-03-2020', 'cases': '24', 'deaths': '0', 'recovered': '1'},
   {'date': '22-03-2020', 'cases': '40', 'deaths': '1', 'recovered': '1'},
   {'date': '23-03-2020', 'cases': '40', 'deaths': '1', 'recovered': '1'},
   {'date': '24-03-2020', 'cases': '74', 'deaths': '1', 'recovered': '1'},
   {'date': '25-03-2020', 'cases': '84', 

In [222]:
dict_ = json.dumps(dict_)

In [223]:
dict_

'{"Afghanistan": {"name": "Afghanistan", "values": [{"date": "24-02-2020", "cases": "1", "deaths": "0", "recovered": "0"}, {"date": "08-03-2020", "cases": "4", "deaths": "0", "recovered": "0"}, {"date": "10-03-2020", "cases": "5", "deaths": "0", "recovered": "0"}, {"date": "11-03-2020", "cases": "14", "deaths": "0", "recovered": "0"}, {"date": "14-03-2020", "cases": "11", "deaths": "0", "recovered": "0"}, {"date": "15-03-2020", "cases": "16", "deaths": "0", "recovered": "0"}, {"date": "16-03-2020", "cases": "21", "deaths": "0", "recovered": "1"}, {"date": "17-03-2020", "cases": "22", "deaths": "0", "recovered": "1"}, {"date": "20-03-2020", "cases": "24", "deaths": "0", "recovered": "1"}, {"date": "22-03-2020", "cases": "40", "deaths": "1", "recovered": "1"}, {"date": "23-03-2020", "cases": "40", "deaths": "1", "recovered": "1"}, {"date": "24-03-2020", "cases": "74", "deaths": "1", "recovered": "1"}, {"date": "25-03-2020", "cases": "84", "deaths": "2", "recovered": "2"}, {"date": "26-03

In [224]:
f1 = open('general_data.json', 'w')

f1.write(dict_)

f1.close()

### Plotly

In [175]:
distinct_countries = per_day['country_region'].unique()

In [176]:
data_cases = {}
data_deaths = {}
data_recovered = {}

In [177]:
for country in distinct_countries:
    dict_cases = {}
    dict_cases['type'] = 'scatter'
    dict_cases['mode'] = 'line'
    dict_cases['name'] = country
    dict_cases['x'] = per_day[per_day['country_region'] == country]['last_update'].to_list()
    dict_cases['y'] = per_day[per_day['country_region'] == country]['confirmed'].to_list()
    
    dict_deaths = {}
    dict_deaths['type'] = 'scatter'
    dict_deaths['mode'] = 'line'
    dict_deaths['name'] = country
    dict_deaths['x'] = per_day[per_day['country_region'] == country]['last_update'].to_list()
    dict_deaths['y'] = per_day[per_day['country_region'] == country]['deaths'].to_list()
    
    dict_recovered = {}
    dict_recovered['type'] = 'scatter'
    dict_recovered['mode'] = 'line'
    dict_recovered['name'] = country
    dict_recovered['x'] = per_day[per_day['country_region'] == country]['last_update'].to_list()
    dict_recovered['y'] = per_day[per_day['country_region'] == country]['recovered'].to_list()
    
    data_cases[country] = dict_cases
    data_deaths[country] = dict_deaths
    data_recovered[country] = dict_recovered

In [178]:
data_cases = json.dumps(data_cases)
data_deaths = json.dumps(data_deaths)
data_recovered = json.dumps(data_recovered)

In [179]:
f1 = open('per_day_cases_json.json', 'w')
f2 = open('per_day_deaths_json.json', 'w')
f3 = open('per_day_recovered_json.json', 'w')

f1.write(data_cases)
f2.write(data_deaths)
f3.write(data_recovered)

f1.close()
f2.close()
f3.close()

### Other Libraries

In [11]:
# Selected countries we want to show
per_day_selected_countries = per_day[((((per_day['country_region'] == 'US') |\
                                     (per_day['country_region'] == 'Switzerland')) |\
                                     (per_day['country_region'] == 'Italy')) |\
                                     (per_day['country_region'] == 'France')) |\
                                     (per_day['country_region'] == 'Germany')]

per_day_selected_countries = per_day_selected_countries.rename({'country_region':'topicName', 'last_update':'date'}, axis=1)

# Get unique number for each country
df_map_numberToCountry = pd.DataFrame(per_day_selected_countries['topicName'].unique()).reset_index()
df_map_numberToCountry['index'] += 1
df_map_numberToCountry = df_map_numberToCountry.rename({'index':'name', 0:'topicName'}, axis=1)

# Merge with prior dataset
per_day_selected_countries = per_day_selected_countries.merge(df_map_numberToCountry, on='topicName')

# Divide dataset into 3, one for each plot
per_day_selected_countries_cases = per_day_selected_countries[['topicName', 'name', 'date', 'confirmed']]
per_day_selected_countries_deaths = per_day_selected_countries[['topicName', 'name', 'date', 'deaths']]
per_day_selected_countries_recovered = per_day_selected_countries[['topicName', 'name', 'date', 'recovered']]

# Renaming to correspond to Britecharts data format
# Remember that the list is sorted by the countries
per_day_selected_countries_cases = per_day_selected_countries_cases.rename({'confirmed':'value'}, axis=1)
per_day_selected_countries_deaths = per_day_selected_countries_cases.rename({'deaths':'value'}, axis=1)
per_day_selected_countries_recovered = per_day_selected_countries_cases.rename({'recovered':'value'}, axis=1)

In [12]:
per_day_selected_countries_cases

Unnamed: 0,topicName,name,date,value
0,France,1,2020-01-24T00:00:00,2.0
1,France,1,2020-01-25T00:00:00,3.0
2,France,1,2020-01-26T00:00:00,3.0
3,France,1,2020-01-27T00:00:00,3.0
4,France,1,2020-01-28T00:00:00,4.0
...,...,...,...,...
360,US,5,2020-04-26T00:00:00,942833.0
361,US,5,2020-04-27T00:00:00,836161.0
362,US,5,2020-04-28T00:00:00,843004.0
363,US,5,2020-04-29T00:00:00,1012582.0


In [13]:
dict_ = []

topicName = ''
topic = 0
dates = []


for i in range(len(per_day_selected_countries_cases)):
    elemTopicName = per_day_selected_countries_cases['topicName'][i]
    if elemTopicName != topicName:
        if topicName != '':
            dictEntry = {'topicName': topicName, 'topic': topic, 'dates': dates}
            dict_.append(dictEntry)
        topicName = elemTopicName
        topic += 1
        datesDict = {'date': per_day_selected_countries_cases['date'][i], 'value': int(per_day_selected_countries_cases['value'][i])}
        dates = [datesDict]
        
            
    else:
        datesDict = {'date': per_day_selected_countries_cases['date'][i], 'value': int(per_day_selected_countries_cases['value'][i])}
        dates.append(datesDict)

In [14]:
dict_

[{'topicName': 'France',
  'topic': 1,
  'dates': [{'date': '2020-01-24T00:00:00', 'value': 2},
   {'date': '2020-01-25T00:00:00', 'value': 3},
   {'date': '2020-01-26T00:00:00', 'value': 3},
   {'date': '2020-01-27T00:00:00', 'value': 3},
   {'date': '2020-01-28T00:00:00', 'value': 4},
   {'date': '2020-01-29T00:00:00', 'value': 5},
   {'date': '2020-01-30T00:00:00', 'value': 5},
   {'date': '2020-01-31T00:00:00', 'value': 5},
   {'date': '2020-02-01T00:00:00', 'value': 12},
   {'date': '2020-02-08T00:00:00', 'value': 11},
   {'date': '2020-02-12T00:00:00', 'value': 11},
   {'date': '2020-02-15T00:00:00', 'value': 12},
   {'date': '2020-02-25T00:00:00', 'value': 14},
   {'date': '2020-02-26T00:00:00', 'value': 18},
   {'date': '2020-02-27T00:00:00', 'value': 38},
   {'date': '2020-02-28T00:00:00', 'value': 57},
   {'date': '2020-02-29T00:00:00', 'value': 100},
   {'date': '2020-03-01T00:00:00', 'value': 130},
   {'date': '2020-03-02T00:00:00', 'value': 191},
   {'date': '2020-03-03T00

In [15]:
json.dumps(dict_).replace('"topicName"', 'topicName').replace('"topic"', 'topic').replace('"date"', 'date').replace('"dates"', 'dates').replace('"value"', 'value').replace('"','\'')

"[{topicName: 'France', topic: 1, dates: [{date: '2020-01-24T00:00:00', value: 2}, {date: '2020-01-25T00:00:00', value: 3}, {date: '2020-01-26T00:00:00', value: 3}, {date: '2020-01-27T00:00:00', value: 3}, {date: '2020-01-28T00:00:00', value: 4}, {date: '2020-01-29T00:00:00', value: 5}, {date: '2020-01-30T00:00:00', value: 5}, {date: '2020-01-31T00:00:00', value: 5}, {date: '2020-02-01T00:00:00', value: 12}, {date: '2020-02-08T00:00:00', value: 11}, {date: '2020-02-12T00:00:00', value: 11}, {date: '2020-02-15T00:00:00', value: 12}, {date: '2020-02-25T00:00:00', value: 14}, {date: '2020-02-26T00:00:00', value: 18}, {date: '2020-02-27T00:00:00', value: 38}, {date: '2020-02-28T00:00:00', value: 57}, {date: '2020-02-29T00:00:00', value: 100}, {date: '2020-03-01T00:00:00', value: 130}, {date: '2020-03-02T00:00:00', value: 191}, {date: '2020-03-03T00:00:00', value: 204}, {date: '2020-03-04T00:00:00', value: 285}, {date: '2020-03-05T00:00:00', value: 377}, {date: '2020-03-06T00:00:00', value:

In [16]:
# Reformat to correspond to Britecharts data format
per_day_selected_countries_cases_json = per_day_selected_countries_cases.to_dict(orient='records')
per_day_selected_countries_deaths_json = per_day_selected_countries_deaths.to_dict(orient='records')
per_day_selected_countries_recovered_json = per_day_selected_countries_recovered.to_dict(orient='records')

In [17]:
per_day_selected_countries_cases_json

[{'topicName': 'France',
  'name': 1,
  'date': '2020-01-24T00:00:00',
  'value': 2.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-25T00:00:00',
  'value': 3.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-26T00:00:00',
  'value': 3.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-27T00:00:00',
  'value': 3.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-28T00:00:00',
  'value': 4.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-29T00:00:00',
  'value': 5.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-30T00:00:00',
  'value': 5.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-01-31T00:00:00',
  'value': 5.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-02-01T00:00:00',
  'value': 12.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-02-08T00:00:00',
  'value': 11.0},
 {'topicName': 'France',
  'name': 1,
  'date': '2020-02-12T00:00:00',
  'value': 11.0},
 {'topicName': 'France',
  'n

In [20]:
# Saving to json format
json1 = json.dumps(per_day_selected_countries_cases_json)
json2 = json.dumps(per_day_selected_countries_deaths_json)
json3 = json.dumps(per_day_selected_countries_recovered_json)

In [27]:
json1 = json1.replace('"topicName"', 'topicName').replace('"name"', 'name').replace('"date"', 'date').replace('"value"', 'value')
json2 = json2.replace('"topicName"', 'topicName').replace('"name"', 'name').replace('"date"', 'date').replace('"value"', 'value')
json3 = json3.replace('"topicName"', 'topicName').replace('"name"', 'name').replace('"date"', 'date').replace('"value"', 'value')

In [None]:
f1 = open('per_day_selected_countries_cases_json.json', 'w')
f2 = open('per_day_selected_countries_deaths_json.json', 'w')
f3 = open('per_day_selected_countries_recovered_json.json', 'w')

f1.write(json1)
f2.write(json2)
f3.write(json3)

f1.close()
f2.close()
f3.close()

In [12]:
per_day.to_csv('Data_per_day_country.csv', index = False)

In [13]:
for_map = df[["last_update","confirmed","deaths","recovered" , "latitude", "longitude"]]

In [14]:
for_map.head()

Unnamed: 0,last_update,confirmed,deaths,recovered,latitude,longitude
0,2020-01-22,1.0,0.0,0.0,31.826,117.226
1,2020-01-22,14.0,0.0,0.0,40.182,116.414
2,2020-01-22,6.0,0.0,0.0,30.057,107.874
3,2020-01-22,1.0,0.0,0.0,26.079,117.987
4,2020-01-22,0.0,0.0,0.0,36.061,103.834


In [15]:
for_map.to_csv('for_map.csv', index = False)

## 2 Gender/age Dataset

In [124]:
df1 = pd.read_csv("data/covid19_sexe.csv", sep = ',')

In [125]:
df1.head()

Unnamed: 0,Country,Sex-disaggregated?,Date,Cases,Cases (% male),Cases (% female),Deaths,deaths (% male),deaths (% female),Deaths among confirmed cases (male),Deaths in confirmed cases (female),Deaths in confirmed cases (Male:female ratio),Sources
0,Thailand,Yes,16.05.20,3025,54.0,46.0,56,77.0,23.0,2.6%,0.9%,2.8,Source
1,Dominican Republic,Yes,17.05.20,12725,54.0,46.0,434,72.0,28.0,4.5%,2.1%,2.2,Source
2,Greece,Yes,19.05.20,2632,55.0,45.0,165,72.0,28.0,8.1%,4.0%,2.1,Source
3,The Netherlands,Yes,19.05.20,44196,37.0,63.0,5715,55.0,45.0,19.5%,9.1%,2.1,Source
4,Belgium,Yes,18.05.20,55564,37.0,63.0,6475,51.0,49.0,16.0%,9.1%,1.8,Source


In [126]:
df1.dropna(subset = ["deaths (% male)", "Cases (% male)"], inplace = True)

In [144]:
df_cl = df1[["Country","Cases (% male)", "Cases (% female)", "deaths (% male)","deaths (% female)", "Deaths among confirmed cases (male)", "Deaths in confirmed cases (female)", "Deaths in confirmed cases (Male:female ratio)"]]

In [145]:
df_cl

Unnamed: 0,Country,Cases (% male),Cases (% female),deaths (% male),deaths (% female),Deaths among confirmed cases (male),Deaths in confirmed cases (female),Deaths in confirmed cases (Male:female ratio)
0,Thailand,54.0,46.0,77.0,23.0,2.6%,0.9%,2.8
1,Dominican Republic,54.0,46.0,72.0,28.0,4.5%,2.1%,2.2
2,Greece,55.0,45.0,72.0,28.0,8.1%,4.0%,2.1
3,The Netherlands,37.0,63.0,55.0,45.0,19.5%,9.1%,2.1
4,Belgium,37.0,63.0,51.0,49.0,16.0%,9.1%,1.8
5,Denmark,42.0,58.0,57.0,43.0,6.7%,3.7%,1.8
6,Italy,46.0,54.0,60.0,40.0,17.4%,9.9%,1.8
7,Romania,45.0,55.0,60.0,40.0,8.8%,4.8%,1.8
8,Spain,43.0,57.0,57.0,43.0,10.6%,6.1%,1.8
9,South Africa,42.0,58.0,56.0,44.0,2.3%,1.3%,1.8


In [146]:
df_cl = df_cl.replace({'Republic of Ireland':'Ireland', 'The Netherlands':'Netherlands', 'England':'United Kingdom'})

In [147]:
per_day_sankey = per_day_sankey.replace('Czechia', 'Czech Republic')

In [148]:
per_day_sankey['country_region'].unique()

array(['Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Angola',
       'Antigua and Barbuda', 'Argentina', 'Armenia', 'Australia',
       'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
       'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin', 'Bhutan',
       'Bolivia', 'Bosnia and Herzegovina', 'Botswana', 'Brazil',
       'Brunei', 'Bulgaria', 'Burkina Faso', 'Burma', 'Burundi',
       'Cabo Verde', 'Cambodia', 'Cameroon', 'Canada',
       'Central African Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Comoros', 'Congo (Brazzaville)', 'Congo (Kinshasa)', 'Costa Rica',
       "Cote d'Ivoire", 'Croatia', 'Cuba', 'Cyprus', 'Czech Republic',
       'Denmark', 'Diamond Princess', 'Djibouti', 'Dominica',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador',
       'Equatorial Guinea', 'Eritrea', 'Estonia', 'Eswatini', 'Ethiopia',
       'Fiji', 'Finland', 'France', 'Gabon', 'Gambia', 'Georgia',
       'Germany', 'Ghana', 'Greece', 'Grenada', 'Guatemala'

In [149]:
df_cl = df_cl.merge(per_day_sankey, how='left', left_on='Country', right_on='country_region')

In [152]:
df_cl = df_cl.dropna()

In [183]:
df_cl

Unnamed: 0,Country,Cases (% male),Cases (% female),deaths (% male),deaths (% female),Deaths among confirmed cases (male),Deaths in confirmed cases (female),Deaths in confirmed cases (Male:female ratio),country_region,last_update,confirmed,deaths,recovered
0,Thailand,54.0,46.0,77.0,23.0,2.6%,0.9%,2.8,Thailand,2020-05-23T00:00:00,3037.0,56.0,2910.0
1,Dominican Republic,54.0,46.0,72.0,28.0,4.5%,2.1%,2.2,Dominican Republic,2020-05-23T00:00:00,13989.0,456.0,7572.0
2,Greece,55.0,45.0,72.0,28.0,8.1%,4.0%,2.1,Greece,2020-05-23T00:00:00,2874.0,169.0,1374.0
3,Netherlands,37.0,63.0,55.0,45.0,19.5%,9.1%,2.1,Netherlands,2020-05-23T00:00:00,45088.0,5807.0,174.0
4,Belgium,37.0,63.0,51.0,49.0,16.0%,9.1%,1.8,Belgium,2020-05-23T00:00:00,56511.0,9212.0,15123.0
5,Denmark,42.0,58.0,57.0,43.0,6.7%,3.7%,1.8,Denmark,2020-05-23T00:00:00,11428.0,561.0,9962.0
6,Italy,46.0,54.0,60.0,40.0,17.4%,9.9%,1.8,Italy,2020-05-23T00:00:00,228658.0,32616.0,136720.0
7,Romania,45.0,55.0,60.0,40.0,8.8%,4.8%,1.8,Romania,2020-05-23T00:00:00,17712.0,1166.0,10777.0
8,Spain,43.0,57.0,57.0,43.0,10.6%,6.1%,1.8,Spain,2020-05-23T00:00:00,234824.0,28628.0,150376.0
9,South Africa,42.0,58.0,56.0,44.0,2.3%,1.3%,1.8,South Africa,2020-05-23T00:00:00,20125.0,397.0,10104.0


## Data for sankey plot

In [172]:
dict_ = {}

In [185]:
for country in df_cl['Country']:
    dict_value = {'nodes':[{"node":0,"name":"Women"},{"node":1,"name":"Men"},{"node":2,"name":"Deaths"},{"node":3,"name":"Confirmed cases without death"}]}
    number_men_cases = int(df_cl[df_cl['Country']==country]['confirmed'] * df_cl[df_cl['Country']==country]['Cases (% male)'] / 100)
    number_men_deaths = int(df_cl[df_cl['Country']==country]['deaths'] * df_cl[df_cl['Country']==country]['deaths (% male)'] / 100)
    
    number_women_cases = int(df_cl[df_cl['Country']==country]['confirmed'] * df_cl[df_cl['Country']==country]['Cases (% female)'] / 100)
    number_women_deaths = int(df_cl[df_cl['Country']==country]['deaths'] * df_cl[df_cl['Country']==country]['deaths (% female)'] / 100)
    
    number_men_cases_alive = number_men_cases - number_men_deaths
    number_women_cases_alive = number_women_cases - number_women_deaths
    
    dict_value_links = [{'source':0,'target':2,'value':number_women_deaths},{'source':0,'target':3,'value':number_women_cases_alive},{'source':1,'target':2,'value':number_men_deaths},{'source':1,'target':3,'value':number_men_cases_alive}]
    dict_value['links'] = dict_value_links
    
    dict_value_json = json.dumps(dict_value)
    
    f = open('sexe_deaths_sankey_' + country + '.json', 'w')

    f.write(dict_value_json)

    f.close()

In [174]:
dict_

{'Thailand': {'nodes': [{'node': 0, 'name': 'Women'},
   {'node': 1, 'name': 'Men'},
   {'node': 2, 'name': 'Deaths'},
   {'node': 3, 'name': 'Confirmed cases without death'}],
  'links': [{'source': 0, 'target': 2, 'value': 12},
   {'source': 0, 'target': 3, 'value': 1385},
   {'source': 1, 'target': 2, 'value': 43},
   {'source': 1, 'target': 3, 'value': 1596}]},
 'Dominican Republic': {'nodes': [{'node': 0, 'name': 'Women'},
   {'node': 1, 'name': 'Men'},
   {'node': 2, 'name': 'Deaths'},
   {'node': 3, 'name': 'Confirmed cases without death'}],
  'links': [{'source': 0, 'target': 2, 'value': 127},
   {'source': 0, 'target': 3, 'value': 6307},
   {'source': 1, 'target': 2, 'value': 328},
   {'source': 1, 'target': 3, 'value': 7226}]},
 'Greece': {'nodes': [{'node': 0, 'name': 'Women'},
   {'node': 1, 'name': 'Men'},
   {'node': 2, 'name': 'Deaths'},
   {'node': 3, 'name': 'Confirmed cases without death'}],
  'links': [{'source': 0, 'target': 2, 'value': 47},
   {'source': 0, 'target

In [180]:
dict_ = json.dumps(dict_)

In [181]:
dict_

'{"Thailand": {"nodes": [{"node": 0, "name": "Women"}, {"node": 1, "name": "Men"}, {"node": 2, "name": "Deaths"}, {"node": 3, "name": "Confirmed cases without death"}], "links": [{"source": 0, "target": 2, "value": 12}, {"source": 0, "target": 3, "value": 1385}, {"source": 1, "target": 2, "value": 43}, {"source": 1, "target": 3, "value": 1596}]}, "Dominican Republic": {"nodes": [{"node": 0, "name": "Women"}, {"node": 1, "name": "Men"}, {"node": 2, "name": "Deaths"}, {"node": 3, "name": "Confirmed cases without death"}], "links": [{"source": 0, "target": 2, "value": 127}, {"source": 0, "target": 3, "value": 6307}, {"source": 1, "target": 2, "value": 328}, {"source": 1, "target": 3, "value": 7226}]}, "Greece": {"nodes": [{"node": 0, "name": "Women"}, {"node": 1, "name": "Men"}, {"node": 2, "name": "Deaths"}, {"node": 3, "name": "Confirmed cases without death"}], "links": [{"source": 0, "target": 2, "value": 47}, {"source": 0, "target": 3, "value": 1246}, {"source": 1, "target": 2, "value

In [182]:
f1 = open('sexe_deaths_sankey.json', 'w')

f1.write(dict_)

f1.close()

In [7]:
#df_cl.to_csv("Gender_per_country.csv")

## 3 SARS Dataset

In [22]:
sars = pd.read_csv("data/sars_2003_complete_dataset_clean.csv", sep = ',')

In [23]:
sars.head()

Unnamed: 0,Date,Country,Cumulative number of case(s),Number of deaths,Number recovered
0,2003-03-17,Germany,1,0,0
1,2003-03-17,Canada,8,2,0
2,2003-03-17,Singapore,20,0,0
3,2003-03-17,"Hong Kong SAR, China",95,1,0
4,2003-03-17,Switzerland,2,0,0


In [24]:
sars[["Country", "Date", "Cumulative number of case(s)", "Number of deaths", "Number recovered"]].to_csv("sars_country_date.csv", index = False)

## 4 Ebola Dataset

In [25]:
ebola = pd.read_csv("data/ebola_2014_2016_clean.csv")

In [26]:
ebola.head()

Unnamed: 0,Country,Date,No. of suspected cases,No. of probable cases,No. of confirmed cases,"No. of confirmed, probable and suspected cases",No. of suspected deaths,No. of probable deaths,No. of confirmed deaths,"No. of confirmed, probable and suspected deaths"
0,Guinea,2014-08-29,25.0,141.0,482.0,648.0,2.0,141.0,287.0,430.0
1,Nigeria,2014-08-29,3.0,1.0,15.0,19.0,0.0,1.0,6.0,7.0
2,Sierra Leone,2014-08-29,54.0,37.0,935.0,1026.0,8.0,34.0,380.0,422.0
3,Liberia,2014-08-29,382.0,674.0,322.0,1378.0,168.0,301.0,225.0,694.0
4,Sierra Leone,2014-09-05,78.0,37.0,1146.0,1261.0,11.0,37.0,443.0,491.0


In [27]:
ebola.fillna(0, inplace = True)

In [28]:
ebola[["Country", "Date", "No. of confirmed cases", "No. of confirmed deaths"]].to_csv("ebola_country_date.csv", index = False)

# Government Dataset and Testing Dataset

#### Testing Dataset

In [29]:
tests = pd.read_csv('data/covid19_testing.csv')

In [30]:
tests.head()

Unnamed: 0,Entity,Code,Date,Total tests per thousand
0,Argentina,ARG,"Apr 8, 2020",0.295
1,Argentina,ARG,"Apr 9, 2020",0.329
2,Argentina,ARG,"Apr 10, 2020",0.362
3,Argentina,ARG,"Apr 11, 2020",0.399
4,Argentina,ARG,"Apr 13, 2020",0.437


In [31]:
tests[['Entity', 'Date', 'Total tests per thousand']].isnull().values.any()

False

In [32]:
tests['Code'].isnull().values.any()

True

We can thus delete the `Code` columns from the dataset

In [33]:
tests = tests.drop(columns = ['Code'])

Check if no two dates are the same for a given Country

In [34]:
tests.groupby('Entity')['Date'].apply(lambda x: x.duplicated().any()).unique()

array([False])

In [35]:
selected_countries = ['United States', 'France', 'Belgium', 'Germany']

In [36]:
tests = tests[tests['Entity'].isin(selected_countries)]

In [37]:
tests.head()

Unnamed: 0,Entity,Date,Total tests per thousand
256,Belgium,"Mar 1, 2020",0.005
257,Belgium,"Mar 2, 2020",0.029
258,Belgium,"Mar 3, 2020",0.07
259,Belgium,"Mar 4, 2020",0.126
260,Belgium,"Mar 5, 2020",0.187


#### Governement Dataset

In [10]:
gov_oxford = pd.read_csv('data/gov_oxford.csv')

In [11]:
gov_oxford.head()

Unnamed: 0,CountryName,CountryCode,Date,C1_School closing,C1_Flag,C2_Workplace closing,C2_Flag,C3_Cancel public events,C3_Flag,C4_Restrictions on gatherings,...,H3_Contact tracing,H4_Emergency investment in healthcare,H5_Investment in vaccines,M1_Wildcard,ConfirmedCases,ConfirmedDeaths,StringencyIndex,StringencyIndexForDisplay,LegacyStringencyIndex,LegacyStringencyIndexForDisplay
0,Aruba,ABW,20200101,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
1,Aruba,ABW,20200102,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
2,Aruba,ABW,20200103,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
3,Aruba,ABW,20200104,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
4,Aruba,ABW,20200105,0.0,,0.0,,0.0,,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


In [13]:
gov_oxford = gov_oxford.drop(['S1_Notes', 'S1_IsGeneral', 'S2_Notes', 'S2_IsGeneral', 'S3_Notes', 'S3_IsGeneral', 'S4_Notes', 'S4_IsGeneral', 'S5_Notes', 'S5_IsGeneral', 'S6_Notes', 'S6_IsGeneral', 'S7_Notes', 'S8_Notes', 'S9_Notes', 'S10_Notes', 'S11_Notes', 'S12_Notes', 'S13_Notes', 'Unnamed: 39'], axis=1)

KeyError: "['S1_Notes' 'S1_IsGeneral' 'S2_Notes' 'S2_IsGeneral' 'S3_Notes'\n 'S3_IsGeneral' 'S4_Notes' 'S4_IsGeneral' 'S5_Notes' 'S5_IsGeneral'\n 'S6_Notes' 'S6_IsGeneral' 'S7_Notes' 'S8_Notes' 'S9_Notes' 'S10_Notes'\n 'S11_Notes' 'S12_Notes' 'S13_Notes' 'Unnamed: 39'] not found in axis"

We want every country to be compared on the same date range

In [None]:
len(gov_oxford['CountryName'].unique())

In [None]:
len(gov_oxford['Date'].unique())

In [None]:
gov_oxford[['CountryName', 'Date']].isnull().values.any()

In [None]:
full_dates = gov_oxford[['Date', 'CountryName']].groupby('Date').count()
full_dates = full_dates[full_dates['CountryName'] == len(gov_oxford['CountryName'].unique())]
full_dates.head()

In [None]:
full_dates = full_dates.reset_index().drop(columns = ['CountryName'])
gov_oxford_map = gov_oxford[gov_oxford['Date'].isin(full_dates)]

In [None]:
def convert_date_appearance():
    

In [None]:
gov_map_si = gov_oxford_map[['CountryName', 'Date', 'StringencyIndexForDisplay']]

In [None]:
gov_map_school = gov_oxford_map[['CountryName', 'Date', 'S1_School closing']]

In [None]:
gov_map_work = gov_oxford_map[['CountryName', 'Date', 'S2_Workplace closing']]

In [None]:
gov_map_events = gov_oxford_map[['CountryName', 'Date', 'S3_Cancel public events']]

In [None]:
gov_map_transport = gov_oxford_map[['CountryName', 'Date', 'S4_Close public transport']]

In [None]:
gov_map_info = gov_oxford_map[['CountryName', 'Date', 'S5_Public information campaigns']]

In [None]:
gov_map_movement = gov_oxford_map[['CountryName', 'Date', 'S6_Restrictions on internal movement']]

In [None]:
gov_map_travel_controls = gov_oxford_map[['CountryName', 'Date', 'S7_International travel controls']]

In [None]:
gov_map_fiscal = gov_oxford_map[['CountryName', 'Date', 'S8_Fiscal measures']]

In [None]:
gov_map_monetary = gov_oxford_map[['CountryName', 'Date', 'S9_Monetary measures']]

In [None]:
gov_map_health_investment = gov_oxford_map[['CountryName', 'Date', 'S10_Emergency investment in health care']]

In [None]:
gov_map_vaccine_investment = gov_oxford_map[['CountryName', 'Date', 'S11_Investment in Vaccines']]

In [None]:
gov_map_testing = gov_oxford_map[['CountryName', 'Date', 'S12_Testing framework']]

In [None]:
gov_map_contact_tracing = gov_oxford_map[['CountryName', 'Date', 'S13_Contact tracing']]

Country selection

In [None]:
selected_countries = ['United States', 'France', 'Germany', 'Belgium']

In [None]:
gov_oxford_select = gov_oxford[gov_oxford['CountryName'].isin(selected_countries)]

In [None]:
gov_oxford_select.head()

In [None]:
gov_oxford_select[['CountryName', 'Date', 'ConfirmedCases', 'ConfirmedDeaths']].isnull().values.any()

In [None]:
gov_selected_cases_si = gov_oxford_select[['CountryName', 'Date', 'ConfirmedCases', 'StringencyIndexForDisplay']]

Non economical measures

In [None]:
non_economical_measures = ['CountryName', 'Date', 'ConfirmedCases', 'S1_School closing', 'S2_Workplace closing', 'S3_Cancel public events', 'S4_Close public transport', 'S5_Public information campaigns', 'S6_Restrictions on internal movement', 'S13_Contact tracing']

In [None]:
gov_selected_non_eco_measures = gov_oxford_select[non_economical_measures]

In [None]:
gov_selected_non_eco_measures.head()

In [None]:
gov_selected_non_eco_measures[['ConfirmedCases']].isnull().values.any()

In [None]:
gov_selected_non_eco_measures[gov_selected_non_eco_measures.isna().any(axis=1)]

In [None]:
gov_selected_non_eco_measures[(gov_selected_non_eco_measures['CountryName'] == 'Germany') & (gov_selected_non_eco_measures['S6_Restrictions on internal movement'] == 2)]

We thus can delete `nan` rows 

In [None]:
gov_selected_non_eco_measures = gov_selected_non_eco_measures.dropna()

In [None]:
gov_selected_non_eco_measures[(gov_selected_non_eco_measures['CountryName'] == 'Germany') & (gov_selected_non_eco_measures['S6_Restrictions on internal movement'] == 2)]

In [None]:
binary_format = {1:0}
binary_format_contact_tracing = {1: 2}

In [None]:
gov_selected_non_eco_measures[['S1_School closing', 'S2_Workplace closing', 'S3_Cancel public events', 'S4_Close public transport', 'S6_Restrictions on internal movement']] = gov_selected_non_eco_measures[['S1_School closing', 'S2_Workplace closing', 'S3_Cancel public events', 'S4_Close public transport', 'S6_Restrictions on internal movement']].replace(binary_format)
gov_selected_non_eco_measures[['S13_Contact tracing']] = gov_selected_non_eco_measures[['S13_Contact tracing']].replace(binary_format_contact_tracing)

In [None]:
def locate_changes(df, column_to_change):
    if df['value'] == False:
        df['value'] = df[column_to_change]
    else:
        df['value'] = np.nan
    return df

def delete_first_row(df):
    return df[1:]



School measure

In [None]:
school = gov_selected_non_eco_measures[['CountryName', 'Date', 'S1_School closing']]

In [None]:
school['value'] = school['S1_School closing'].eq(school['S1_School closing'].shift())

In [None]:
school = school.apply(lambda x: locate_changes(x, 'S1_School closing'), axis = 1)

In [None]:
school = school.drop(columns = ['S1_School closing']).dropna()

In [None]:
school_text_measures = {0: 'Schools open', 2: 'School closed '}

In [None]:
school = school.replace(school_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

In [None]:
school

Workplace measures

In [None]:
workplace = gov_selected_non_eco_measures[['CountryName', 'Date', 'S2_Workplace closing']]
workplace['value'] = workplace['S2_Workplace closing'].eq(workplace['S2_Workplace closing'].shift())
workplace = workplace.apply(lambda x: locate_changes(x, 'S2_Workplace closing'), axis = 1)
workplace = workplace.drop(columns = ['S2_Workplace closing']).dropna()
workplace_text_measures = {0: 'Workplace open', 2: 'Worplace closed '}
workplace = workplace.replace(workplace_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Public events measures

In [None]:
events = gov_selected_non_eco_measures[['CountryName', 'Date', 'S3_Cancel public events']]
events['value'] = events['S3_Cancel public events'].eq(events['S3_Cancel public events'].shift())
events = events.apply(lambda x: locate_changes(x, 'S3_Cancel public events'), axis = 1)
events = events.drop(columns = ['S3_Cancel public events']).dropna()
events_text_measures = {0: 'No measures on public events', 2: 'Cancel public events'}
events = events.replace(events_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Public transport measures

In [None]:
transport = gov_selected_non_eco_measures[['CountryName', 'Date', 'S4_Close public transport']]
transport['value'] = transport['S4_Close public transport'].eq(transport['S4_Close public transport'].shift())
transport = transport.apply(lambda x: locate_changes(x, 'S4_Close public transport'), axis = 1)
transport = transport.drop(columns = ['S4_Close public transport']).dropna()
transport_text_measures = {0: 'No measures on public transport', 2: 'Close public transport'}
transport = transport.replace(transport_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Information campaign measures

In [None]:
campaign = gov_selected_non_eco_measures[['CountryName', 'Date', 'S5_Public information campaigns']]
campaign['value'] = campaign['S5_Public information campaigns'].eq(campaign['S5_Public information campaigns'].shift())
campaign = campaign.apply(lambda x: locate_changes(x, 'S5_Public information campaigns'), axis = 1)
campaign = campaign.drop(columns = ['S5_Public information campaigns']).dropna()
campaign_text_measures = {0: 'No COVID-19 information campaign', 1: 'COVID-19 public information campaign'}
campaign = campaign.replace(campaign_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Internal movement measures

In [None]:
internal_movement = gov_selected_non_eco_measures[['CountryName', 'Date', 'S6_Restrictions on internal movement']]
internal_movement['value'] = internal_movement['S6_Restrictions on internal movement'].eq(internal_movement['S6_Restrictions on internal movement'].shift())
internal_movement = internal_movement.apply(lambda x: locate_changes(x, 'S6_Restrictions on internal movement'), axis = 1)
internal_movement = internal_movement.drop(columns = ['S6_Restrictions on internal movement']).dropna()
internal_movement_text_measures = {0: 'No restriction on internal movement', 2: 'Rectriction on movement'}
internal_movement = internal_movement.replace(internal_movement_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Contact tracing measures

In [None]:
tracing = gov_selected_non_eco_measures[['CountryName', 'Date', 'S13_Contact tracing']]
tracing['value'] = tracing['S13_Contact tracing'].eq(tracing['S13_Contact tracing'].shift())
tracing = tracing.apply(lambda x: locate_changes(x, 'S13_Contact tracing'), axis = 1)
tracing = tracing.drop(columns = ['S13_Contact tracing']).dropna()
tracing_text_measures = {0: 'No contact tracing', 2: 'Contact tracing used'}
tracing = tracing.replace(tracing_text_measures).groupby('CountryName').apply(lambda x: delete_first_row(x))

Concatenate all the values

In [None]:
overall_measures = school.append(workplace, ignore_index=True).append(events, ignore_index=True).append(transport, ignore_index=True).append(campaign, ignore_index=True).append(internal_movement, ignore_index=True).append(tracing, ignore_index=True)
overall_measures.head()

In [None]:
germany_measures = overall_measures[overall_measures['CountryName'] == 'Germany']
usa_measures = overall_measures[overall_measures['CountryName'] == 'France']
france_measures = overall_measures[overall_measures['CountryName'] == 'United States']
gelgium_measures = overall_measures[overall_measures['CountryName'] == 'Belgium']

# Detailed Datasets

### Structure:
- a) Clean data for general informations by date
- b) Clean data for age by date
- c) Clean data for gender by date
- d) Clean data for hospitalisations by date
- e) Compute fatality rate by date

## 1 France

(Only hospitalisation data)

In [14]:
PATH_COVID_FR = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/donnees-hospitalieres-covid19-2020-04-25-19h00.csv')
PATH_COVID_FR_METADATA = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/metadonnees-donnees-hospitalieres-covid19.csv')

PATH_COVID_FR_AGE = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/donnees-hospitalieres-classe-age-covid19-2020-04-25-19h00.csv')
PATH_COVID_FR_AGE_METADATA = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/metadonnees-donnees-hospitalieres-covid19-classes-age.csv')
PATH_COVID_FR_SEXE_METADATA = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/metadonnees-sexe.csv')

PATH_COVID_FR_HOSP = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/donnees-hospitalieres-nouveaux-covid19-2020-04-25-19h00.csv')
PATH_COVID_FR_HOSP_METADATA = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/metadonnees-hospit-incid.csv')


In [15]:
covid_fr = pd.read_csv(PATH_COVID_FR, sep=';')
covid_fr_metadata = pd.read_csv(PATH_COVID_FR_METADATA, sep=';')

covid_fr_age = pd.read_csv(PATH_COVID_FR_AGE, sep=';')
covid_fr_age_metadata = pd.read_csv(PATH_COVID_FR_AGE_METADATA, sep=';')
covid_fr_sexe_metadata = pd.read_csv(PATH_COVID_FR_SEXE_METADATA)

covid_fr_hosp = pd.read_csv(PATH_COVID_FR_HOSP, sep=';')
covid_fr_hosp_metadata = pd.read_csv(PATH_COVID_FR_HOSP_METADATA, sep=';')

In [16]:
covid_fr.head()

Unnamed: 0,dep,sexe,jour,hosp,rea,rad,dc
0,1,0,2020-03-18,2,0,1,0
1,1,1,2020-03-18,1,0,1,0
2,1,2,2020-03-18,1,0,0,0
3,2,0,2020-03-18,41,10,18,11
4,2,1,2020-03-18,19,4,11,6


In [17]:
covid_fr_metadata

Unnamed: 0,Colonne,Type,Description_FR,Description_EN,Exemple
0,dep,integer,Département,Department,1
1,sexe,integer,Sexe,Sex,0
2,jour,string($date),Date de notification,Date of notice,18/03/2020
3,hosp,integer,Nombre de personnes actuellement hospitalisées,Number of people currently hospitalized,2
4,rea,integer,Nombre de personnes actuellement en réanimatio...,Number of people currently in resuscitation or...,0
5,rad,integer,Nombre cumulé de personnes retournées à domicile,Total amount of patient that returned home,1
6,dc,integer,Nombre cumulé de personnes décédées à l'hôpital,Total amout of deaths at the hospital,0


In [18]:
covid_fr_age.head()

Unnamed: 0,reg,cl_age90,jour,hosp,rea,rad,dc
0,1,0,2020-04-07,34,17,43,8
1,1,9,2020-04-07,0,0,0,0
2,1,19,2020-04-07,0,0,0,0
3,1,29,2020-04-07,0,0,1,0
4,1,39,2020-04-07,1,1,3,0


In [19]:
covid_fr_age_metadata.head(7)

Unnamed: 0,Colonne,Type,Description_FR,Description_EN,Exemple,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,reg,integer,Region,Region,1,,,,,,,,
1,cl_age90,integer,Classe age,Age group,9,,,,,,,,
2,jour,string($date),Date de notification,Date of notice,18/03/2020,,,,,,,,
3,hosp,integer,Nombre de personnes actuellement hospitalisées,Number of people currently hospitalized,2,,,,,,,,
4,rea,integer,Nombre de personnes actuellement en réanimatio...,Number of people currently in resuscitation or...,0,,,,,,,,
5,rad,integer,Nombre cumulé de personnes retournées à domicile,Total amount of patient that returned home,1,,,,,,,,
6,dc,integer,Nombre cumulé de personnes décédées,Total amout of deaths,0,,,,,,,,


In [20]:
covid_fr_sexe_metadata

Unnamed: 0,{\rtf1\ansi\ansicpg1252\cocoartf1671\cocoasubrtf600
0,{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
1,{\colortbl;\red255\green255\blue255;}
2,{\*\expandedcolortbl;;}
3,\paperw11900\paperh16840\margl1440\margr1440\v...
4,\pard\tx566\tx1133\tx1700\tx2267\tx2834\tx3401...
5,\f0\fs24 \cf0 Code ; Sexe
6,\
7,0; femmes + hommes
8,\
9,1;hommes\


In [21]:
covid_fr_hosp.head()

Unnamed: 0,dep,jour,incid_hosp,incid_rea,incid_dc,incid_rad
0,1,2020-03-19,1,0,0,0
1,1,2020-03-20,0,0,0,1
2,1,2020-03-21,3,0,0,0
3,1,2020-03-22,3,1,0,1
4,1,2020-03-23,14,1,0,5


In [22]:
covid_fr_hosp_metadata.head(6)

Unnamed: 0,Colonne,Type,Description_FR,Description_EN,Exemple
0,dep,integer,Département,Department,1
1,jour,string($date),Date de notification,Date of notice,19/03/2020
2,incid_hosp,string,Nombre quotidien de personnes nouvellement hos...,Daily number of newly hospitalized persons,1
3,incid_rea,integer,Nombre quotidien de nouvelles admissions en ré...,Daily number of new intensive care admissions,1
4,incid_dc,integer,Nombre quotidien de personnes nouvellement déc...,Daily number of newly deceased persons,1
5,incid_rad,integer,Nombre quotidien de nouveaux retours à domicile,Daily number of new home returns,2


### A) 

ok with general dataset

### B) Clean data for age by date

In [23]:
covid_fr_age_regions = covid_fr_age.copy()
covid_fr_age = covid_fr_age.rename({'cl_age90':'age', 'jour':'date', 'hosp':'hospitalisations', 'rea':'reanimations', 'dc':'deaths'}, axis=1).drop(['rad', 'reg'], axis=1)

In [24]:
covid_fr_age = covid_fr_age.groupby(['age', 'date']).sum().reset_index()

In [25]:
covid_fr_age.head()

Unnamed: 0,age,date,hospitalisations,reanimations,deaths
0,0,2020-04-07,29871,7004,7091
1,0,2020-04-08,30217,7019,7632
2,0,2020-04-09,30608,6937,8044
3,0,2020-04-10,31108,6875,8598
4,0,2020-04-11,31159,6752,8943


### C) Clean data for sexe by date

In [26]:
covid_fr_sexe_regions = covid_fr.copy()
covid_fr_sexe = covid_fr.rename({'jour':'date', 'hosp':'hospitalisations', 'rea':'reanimations', 'dc':'deaths'}, axis=1).drop(['dep', 'rad'], axis=1)

In [27]:
covid_fr_sexe['sexe'] = covid_fr_sexe['sexe'].map({0:'total', 1:'m', 2:'f'})

In [28]:
covid_fr_sexe = covid_fr_sexe.groupby(['sexe', 'date']).sum().reset_index()

In [29]:
covid_fr_sexe.head()

Unnamed: 0,sexe,date,hospitalisations,reanimations,deaths
0,f,2020-03-18,1248,239,96
1,f,2020-03-19,1632,284,136
2,f,2020-03-20,2134,362,190
3,f,2020-03-21,2401,401,219
4,f,2020-03-22,2815,461,263


## 2 Germany

In [30]:
PATH_COVID_GER = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/RKI_COVID19.csv')

In [31]:
covid_ger = pd.read_csv(PATH_COVID_GER)

In [32]:
covid_ger = covid_ger.rename({'Altersgruppe':'Age', 'Geschlecht':'Sexe', 'AnzahlFall':'Cases', 'AnzahlTodesfall':'Deaths', 'Meldedatum':'Date', 'NeuerFall':'New Cases', 'NeuerTodesfall':'New Deaths', 'NeuGenesen':'New Recovers', 'AnzahlGenesen':'Recovers'}, axis=1)

In [33]:
covid_ger.head()

Unnamed: 0,IdBundesland,Bundesland,Landkreis,Age,Sexe,Cases,Deaths,ObjectId,Date,IdLandkreis,Datenstand,New Cases,New Deaths,Refdatum,New Recovers,Recovers
0,1,Schleswig-Holstein,LK Steinburg,A15-A34,W,1,0,3650921,2020-03-25T00:00:00.000Z,1061,"25.04.2020, 00:00 Uhr",0,-9,2020-03-16T00:00:00.000Z,0,1
1,1,Schleswig-Holstein,LK Steinburg,A15-A34,W,1,0,3650922,2020-03-27T00:00:00.000Z,1061,"25.04.2020, 00:00 Uhr",0,-9,2020-03-22T00:00:00.000Z,0,1
2,1,Schleswig-Holstein,LK Steinburg,A15-A34,W,1,0,3650923,2020-04-01T00:00:00.000Z,1061,"25.04.2020, 00:00 Uhr",0,-9,2020-03-21T00:00:00.000Z,0,1
3,1,Schleswig-Holstein,LK Steinburg,A15-A34,W,1,0,3650924,2020-04-06T00:00:00.000Z,1061,"25.04.2020, 00:00 Uhr",0,-9,2020-03-22T00:00:00.000Z,0,1
4,1,Schleswig-Holstein,LK Steinburg,A15-A34,W,1,0,3650925,2020-04-07T00:00:00.000Z,1061,"25.04.2020, 00:00 Uhr",0,-9,2020-03-31T00:00:00.000Z,0,1


### a) Clean data for general informations by date

In [34]:
covid_ger_general = covid_ger[['Bundesland', 'Landkreis', 'Cases', 'Deaths', 'Date', 'Recovers']]

In [35]:
covid_ger_general_by_regions = covid_ger_general.groupby(['Bundesland', 'Landkreis', 'Date']).sum().reset_index()

In [36]:
covid_ger_general_by_regions.head()

Unnamed: 0,Bundesland,Landkreis,Date,Cases,Deaths,Recovers
0,Baden-Württemberg,LK Alb-Donau-Kreis,2020-02-28T00:00:00.000Z,1,0,1
1,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-04T00:00:00.000Z,3,0,3
2,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-07T00:00:00.000Z,1,0,1
3,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-10T00:00:00.000Z,1,0,1
4,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-11T00:00:00.000Z,2,0,2


In [37]:
covid_ger_general_by_date = covid_ger_general_by_regions.drop(['Bundesland', 'Landkreis'], axis=1)\
                                    .groupby('Date')\
                                    .sum()\
                                    .reset_index()

In [38]:
covid_ger_general_by_date.head()

Unnamed: 0,Date,Cases,Deaths,Recovers
0,2020-01-28T00:00:00.000Z,2,0,2
1,2020-01-29T00:00:00.000Z,2,0,2
2,2020-01-31T00:00:00.000Z,4,0,4
3,2020-02-03T00:00:00.000Z,1,0,1
4,2020-02-04T00:00:00.000Z,5,0,4


### b) Clean data for age by date

In [39]:
covid_ger_general_age = covid_ger[['Bundesland', 'Landkreis', 'Cases', 'Deaths', 'Date', 'Recovers', 'Age']]

In [40]:
covid_ger_general_age_by_region = covid_ger_general_age.groupby(['Bundesland', 'Landkreis', 'Date', 'Age'])\
                                        .sum()\
                                        .reset_index()

In [41]:
covid_ger_general_age_by_region.head()

Unnamed: 0,Bundesland,Landkreis,Date,Age,Cases,Deaths,Recovers
0,Baden-Württemberg,LK Alb-Donau-Kreis,2020-02-28T00:00:00.000Z,A35-A59,1,0,1
1,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-04T00:00:00.000Z,A15-A34,1,0,1
2,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-04T00:00:00.000Z,A35-A59,2,0,2
3,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-07T00:00:00.000Z,A35-A59,1,0,1
4,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-10T00:00:00.000Z,A35-A59,1,0,1


In [42]:
covid_ger_general_age_by_region_by_date = covid_ger_general_age.drop(['Bundesland', 'Landkreis'], axis=1)\
                                                .groupby(['Date', 'Age'])\
                                                .sum()\
                                                .reset_index()

In [43]:
covid_ger_general_age_by_region_by_date.head()

Unnamed: 0,Date,Age,Cases,Deaths,Recovers
0,2020-01-28T00:00:00.000Z,A15-A34,1,0,1
1,2020-01-28T00:00:00.000Z,A35-A59,1,0,1
2,2020-01-29T00:00:00.000Z,A15-A34,2,0,2
3,2020-01-31T00:00:00.000Z,A00-A04,1,0,1
4,2020-01-31T00:00:00.000Z,A15-A34,1,0,1


### c) Clean data for sexe by date

In [44]:
covid_ger_general_sexe = covid_ger[['Bundesland', 'Landkreis', 'Cases', 'Deaths', 'Date', 'Recovers', 'Sexe']]

In [45]:
covid_ger_general_sexe_by_region = covid_ger_general_sexe.groupby(['Bundesland', 'Landkreis', 'Date', 'Sexe'])\
                                        .sum()\
                                        .reset_index()

In [46]:
covid_ger_general_sexe_by_region.head()

Unnamed: 0,Bundesland,Landkreis,Date,Sexe,Cases,Deaths,Recovers
0,Baden-Württemberg,LK Alb-Donau-Kreis,2020-02-28T00:00:00.000Z,M,1,0,1
1,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-04T00:00:00.000Z,M,3,0,3
2,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-07T00:00:00.000Z,M,1,0,1
3,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-10T00:00:00.000Z,M,1,0,1
4,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-11T00:00:00.000Z,M,1,0,1


In [47]:
covid_ger_general_sexe_by_region_by_date = covid_ger_general_sexe.drop(['Bundesland', 'Landkreis'], axis=1)\
                                                .groupby(['Date', 'Sexe'])\
                                                .sum()\
                                                .reset_index()

In [48]:
covid_ger_general_sexe_by_region_by_date.head()

Unnamed: 0,Date,Sexe,Cases,Deaths,Recovers
0,2020-01-28T00:00:00.000Z,M,2,0,2
1,2020-01-29T00:00:00.000Z,M,1,0,1
2,2020-01-29T00:00:00.000Z,W,1,0,1
3,2020-01-31T00:00:00.000Z,M,2,0,2
4,2020-01-31T00:00:00.000Z,W,2,0,2


### e) Compute fatality rate by date

In [49]:
def compute_fatality_rate(df):
    df['fatality_rate'] = df['Deaths'] / df['Cases']
    return df

In [50]:
covid_ger_fatality_rate_by_region = covid_ger[['Bundesland', 'Landkreis', 'Cases', 'Deaths', 'Date']]\
                                        .groupby(['Bundesland', 'Landkreis', 'Date'])\
                                        .sum()\
                                        .reset_index()

In [51]:
covid_ger_fatality_rate_by_date = covid_ger[['Cases', 'Deaths', 'Date']]\
                                        .groupby('Date')\
                                        .sum()\
                                        .reset_index()

In [52]:
compute_fatality_rate(covid_ger_fatality_rate_by_region).head()

Unnamed: 0,Bundesland,Landkreis,Date,Cases,Deaths,fatality_rate
0,Baden-Württemberg,LK Alb-Donau-Kreis,2020-02-28T00:00:00.000Z,1,0,0.0
1,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-04T00:00:00.000Z,3,0,0.0
2,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-07T00:00:00.000Z,1,0,0.0
3,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-10T00:00:00.000Z,1,0,0.0
4,Baden-Württemberg,LK Alb-Donau-Kreis,2020-03-11T00:00:00.000Z,2,0,0.0


In [53]:
compute_fatality_rate(covid_ger_fatality_rate_by_date).head()

Unnamed: 0,Date,Cases,Deaths,fatality_rate
0,2020-01-28T00:00:00.000Z,2,0,0.0
1,2020-01-29T00:00:00.000Z,2,0,0.0
2,2020-01-31T00:00:00.000Z,4,0,0.0
3,2020-02-03T00:00:00.000Z,1,0,0.0
4,2020-02-04T00:00:00.000Z,5,0,0.0


#### Comments

After doing e), we should only run the function compute_fatality_rate on the df computed in b) and c).

## 3 Belgium

In [54]:
PATH_COVID_BE = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/COVID19BE.xlsx')
PATH_COVID_BE_HOSP = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/COVID19BE_HOSP.csv')
PATH_COVID_BE_DEATHS = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/COVID19BE_MORT.csv')
PATH_COVID_BE_TESTS = os.path.expanduser('~/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/COVID19BE_tests.csv')


In [55]:
covid_be = pd.read_excel(PATH_COVID_BE)
covid_be_hops = pd.read_csv(PATH_COVID_BE_HOSP, encoding = "ISO-8859-1")
covid_be_deaths = pd.read_csv(PATH_COVID_BE_DEATHS)
covid_be_tests = pd.read_csv(PATH_COVID_BE_TESTS)

In [56]:
covid_be.head()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
0,2020-03-01,Brussels,Brussels,10-19,M,1
1,2020-03-01,Brussels,Brussels,10-19,F,1
2,2020-03-01,Brussels,Brussels,20-29,M,1
3,2020-03-01,Brussels,Brussels,30-39,F,1
4,2020-03-01,Brussels,Brussels,40-49,F,1


In [57]:
covid_be_hops.head()

Unnamed: 0,DATE,PROVINCE,REGION,NR_REPORTING,TOTAL_IN,TOTAL_IN_ICU,TOTAL_IN_RESP,TOTAL_IN_ECMO,NEW_IN,NEW_OUT
0,2020-03-15,Antwerpen,Flanders,14,50,9,4,0,8,8
1,2020-03-15,Brussels,Brussels,14,58,11,8,0,7,2
2,2020-03-15,Hainaut,Wallonia,15,56,13,11,1,26,1
3,2020-03-15,Limburg,Flanders,7,20,6,3,0,9,3
4,2020-03-15,Liège,Wallonia,12,22,2,1,0,4,1


In [58]:
covid_be_deaths.head()

Unnamed: 0,DATE,REGION,AGEGROUP,SEX,DEATHS
0,2020-03-10,Brussels,85+,F,1
1,2020-03-11,Flanders,85+,F,1
2,2020-03-11,Brussels,75-84,M,1
3,2020-03-11,Brussels,85+,F,1
4,2020-03-12,Brussels,75-84,M,1


In [59]:
covid_be_tests.head()

Unnamed: 0,DATE,TESTS
0,2020-03-01,56
1,2020-03-02,278
2,2020-03-03,484
3,2020-03-04,656
4,2020-03-05,713


### b) Clean data for age by date

In [60]:
covid_be['AGEGROUP'].unique()

array(['10-19', '20-29', '30-39', '40-49', '50-59', '70-79', '60-69',
       '0-9', '90+', '80-89', nan], dtype=object)

In [61]:
covid_be_deaths['AGEGROUP'].unique()

array(['85+', '75-84', '65-74', nan, '25-44', '45-64', '0-24'],
      dtype=object)

In [62]:
covid_be_cases_age_by_region = covid_be.drop('SEX', axis=1)\
                                    .groupby(['DATE', 'PROVINCE', 'REGION', 'AGEGROUP'])\
                                    .sum()\
                                    .reset_index()

In [63]:
covid_be_cases_age_by_region.head()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,CASES
0,2020-03-01,Brussels,Brussels,10-19,2
1,2020-03-01,Brussels,Brussels,20-29,1
2,2020-03-01,Brussels,Brussels,30-39,1
3,2020-03-01,Brussels,Brussels,40-49,1
4,2020-03-01,Brussels,Brussels,50-59,1


In [64]:
covid_be_cases_age_by_date = covid_be.drop(['SEX', 'PROVINCE', 'REGION'], axis=1)\
                                    .groupby(['DATE', 'AGEGROUP'])\
                                    .sum()\
                                    .reset_index()

In [65]:
covid_be_cases_age_by_date.head()

Unnamed: 0,DATE,AGEGROUP,CASES
0,2020-03-01,10-19,4
1,2020-03-01,20-29,1
2,2020-03-01,30-39,1
3,2020-03-01,40-49,5
4,2020-03-01,50-59,3


In [66]:
covid_be_deaths_age_by_region = covid_be_deaths.drop('SEX', axis=1)\
                                    .groupby(['DATE', 'REGION', 'AGEGROUP'])\
                                    .sum()\
                                    .reset_index()

In [67]:
covid_be_deaths_age_by_region.head()

Unnamed: 0,DATE,REGION,AGEGROUP,DEATHS
0,2020-03-10,Brussels,85+,1
1,2020-03-11,Brussels,75-84,1
2,2020-03-11,Brussels,85+,1
3,2020-03-11,Flanders,85+,1
4,2020-03-12,Brussels,75-84,1


In [68]:
covid_be_deaths_age_by_date = covid_be_deaths.drop(['SEX', 'REGION'], axis=1)\
                                    .groupby(['DATE', 'AGEGROUP'])\
                                    .sum()\
                                    .reset_index()

In [69]:
covid_be_deaths_age_by_date.head()

Unnamed: 0,DATE,AGEGROUP,DEATHS
0,2020-03-10,85+,1
1,2020-03-11,75-84,1
2,2020-03-11,85+,2
3,2020-03-12,75-84,1
4,2020-03-13,75-84,2


### c) Clean data for sexe by date

In [70]:
covid_be_cases_sexe_by_region = covid_be.drop('AGEGROUP', axis=1)\
                                    .groupby(['DATE', 'PROVINCE', 'REGION', 'SEX'])\
                                    .sum()\
                                    .reset_index()

In [71]:
covid_be_cases_sexe_by_region.head()

Unnamed: 0,DATE,PROVINCE,REGION,SEX,CASES
0,2020-03-01,Brussels,Brussels,F,3
1,2020-03-01,Brussels,Brussels,M,3
2,2020-03-01,Limburg,Flanders,M,1
3,2020-03-01,Liège,Wallonia,M,2
4,2020-03-01,OostVlaanderen,Flanders,F,1


In [72]:
covid_be_cases_sexe_by_date = covid_be.drop(['PROVINCE', 'AGEGROUP', 'REGION'], axis=1)\
                                .groupby(['DATE', 'SEX']).sum().reset_index()\
                                .merge(covid_be_deaths.drop(['AGEGROUP', 'REGION'], axis=1).groupby(['DATE', 'SEX']).sum().reset_index(),
                                      left_on=['SEX', 'DATE'],
                                      right_on=['SEX', 'DATE'])

In [73]:
covid_be_cases_sexe_by_date['fatality_rate'] = covid_be_cases_sexe_by_date['DEATHS'] / covid_be_cases_sexe_by_date['CASES']

In [74]:
covid_be_cases_sexe_by_date.head()

Unnamed: 0,DATE,SEX,CASES,DEATHS,fatality_rate
0,2020-03-10,F,45,1,0.022222
1,2020-03-11,F,77,2,0.025974
2,2020-03-11,M,93,1,0.010753
3,2020-03-12,M,129,1,0.007752
4,2020-03-13,F,188,2,0.010638


## 4 New York

In [75]:
PATH_COVID_NY = os.path.expanduser('/Users/olivier/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/newyork/case-hosp-death.csv')

PATH_COVID_NY_AGE = os.path.expanduser('/Users/olivier/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/newyork/by-age.csv')
PATH_COVID_NY_SEXE = os.path.expanduser('/Users/olivier/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/newyork/by-sex.csv')

PATH_COVID_NY_TEST = os.path.expanduser('/Users/olivier/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/newyork/tests-by-zcta.csv')

In [76]:
covid_ny = pd.read_csv(PATH_COVID_NY)

covid_ny_age = pd.read_csv(PATH_COVID_NY_AGE)
covid_ny_sexe = pd.read_csv(PATH_COVID_NY_SEXE)

covid_ny_test = pd.read_csv(PATH_COVID_NY_TEST)

In [77]:
covid_ny.head()

Unnamed: 0,DATE_OF_INTEREST,NEW_COVID_CASE_COUNT,HOSPITALIZED_CASE_COUNT,DEATH_COUNT
0,3/3/20,2,7.0,
1,3/4/20,5,10.0,
2,3/5/20,3,14.0,
3,3/6/20,7,11.0,
4,3/7/20,7,10.0,


In [78]:
covid_ny_age

Unnamed: 0,AGE_GROUP,COVID_CASE_RATE,HOSPITALIZED_CASE_RATE,DEATH_RATE
0,0-17 years,194.62,15.17,0.29
1,18-44 years,1675.94,166.54,14.01
2,45-64 years,2744.49,675.52,124.66
3,65-74 years,2746.03,1290.48,406.66
4,75 and older years,3188.99,1983.27,1019.24
5,Citywide total,1824.13,471.92,136.45


In [79]:
covid_ny_sexe

Unnamed: 0,SEX_GROUP,COVID_CASE_RATE,HOSPITALIZED_CASE_RATE,DEATH_RATE
0,Female,1672.75,374.23,101.87
1,Male,1983.24,578.66,174.09
2,Citywide total,1824.13,471.92,136.45


In [80]:
covid_ny_test.head()

Unnamed: 0,MODZCTA,Total,Positive,zcta_cum.perc_pos
0,,2464,2166,87.91
1,10001.0,851,375,44.07
2,10002.0,1962,978,49.85
3,10003.0,1194,487,40.79
4,10004.0,87,36,41.38


## 5 California

In [81]:
PATH_COVID_CLFN = os.path.expanduser('/Users/olivier/Documents/GitHub/com-480-project-pouletpanier/data/detailed_data/california-coronavirus-data/cdph-state-totals.csv')

In [82]:
covid_clfn = pd.read_csv(PATH_COVID_CLFN)

In [83]:
covid_clfn.head()

Unnamed: 0,date,confirmed_cases,deaths,travel,person_to_person,community_spread,under_investigation,other_causes,self_monitoring,age_0_to_17,...,total_tests,received_tests,pending_tests,confirmed_hospitalizations,confirmed_icu,suspected_hospitalizations,suspected_icu,healthcare_worker_infections,healthcare_worker_deaths,source_url
0,2020-04-26,42164,1710.0,,,,,,,1039.0,...,526084.0,526084.0,0.0,3324.0,1184.0,1604.0,289.0,4593.0,24.0,https://www.cdph.ca.gov/Programs/OPA/Pages/NR2...
1,2020-04-25,41137,1651.0,,,,,,,1011.0,...,506035.0,506035.0,0.0,3343.0,1198.0,1504.0,260.0,4453.0,22.0,https://www.cdph.ca.gov/Programs/OPA/Pages/NR2...
2,2020-04-24,39254,1562.0,,,,,,,936.0,...,494173.0,494173.0,0.0,3344.0,1216.0,1536.0,305.0,4322.0,22.0,https://www.cdph.ca.gov/Programs/OPA/Pages/NR2...
3,2020-04-23,37369,1469.0,,,,,,,855.0,...,482097.0,482097.0,0.0,3343.0,1204.0,1586.0,327.0,4153.0,,https://www.cdph.ca.gov/Programs/OPA/Pages/NR2...
4,2020-04-22,35396,1354.0,,,,,,,768.0,...,308700.0,301547.0,7200.0,3357.0,1219.0,1627.0,332.0,3877.0,,https://www.cdph.ca.gov/Programs/OPA/Pages/NR2...


In [84]:
covid_clfn.columns

Index(['date', 'confirmed_cases', 'deaths', 'travel', 'person_to_person',
       'community_spread', 'under_investigation', 'other_causes',
       'self_monitoring', 'age_0_to_17', 'age_18_to_49', 'age_50_to_64',
       'age_65_and_up', 'age_18_to_64', 'age_unknown', 'gender_male',
       'gender_female', 'gender_unknown', 'latino_cases_percent',
       'latino_deaths_percent', 'white_cases_percent', 'white_deaths_percent',
       'black_cases_percent', 'black_deaths_percent', 'asian_cases_percent',
       'asian_deaths_percent', 'multiracial_cases_percent',
       'multiracial_deaths_percent', 'native_cases_percent',
       'native_deaths_percent', 'hawaiian_pacislander_cases_percent',
       'hawaiian_pacislander_deaths_percent', 'other_cases_percent',
       'other_deaths_percent', 'unknown_race_cases', 'unknown_race_deaths',
       'total_tests', 'received_tests', 'pending_tests',
       'confirmed_hospitalizations', 'confirmed_icu',
       'suspected_hospitalizations', 'suspected_

### b) Clean data for age by date

In [85]:
covid_clfn_age = covid_clfn[['date', 'confirmed_cases', 'deaths', 'age_0_to_17', 'age_18_to_49', 'age_50_to_64',
       'age_65_and_up', 'age_unknown']]

In [86]:
covid_clfn_age.head()

Unnamed: 0,date,confirmed_cases,deaths,age_0_to_17,age_18_to_49,age_50_to_64,age_65_and_up,age_unknown
0,2020-04-26,42164,1710.0,1039.0,20379.0,11139.0,9512.0,95.0
1,2020-04-25,41137,1651.0,1011.0,19897.0,10885.0,9256.0,88.0
2,2020-04-24,39254,1562.0,936.0,18954.0,10451.0,8832.0,81.0
3,2020-04-23,37369,1469.0,855.0,18004.0,10025.0,8401.0,84.0
4,2020-04-22,35396,1354.0,768.0,17009.0,9517.0,8035.0,67.0


### c) Clean data for sexe by date

In [87]:
covid_clfn_sexe = covid_clfn[['date', 'confirmed_cases', 'deaths', 'gender_male',
       'gender_female', 'gender_unknown']]

In [88]:
covid_clfn_sexe.head()

Unnamed: 0,date,confirmed_cases,deaths,gender_male,gender_female,gender_unknown
0,2020-04-26,42164,1710.0,20957.0,20908.0,299.0
1,2020-04-25,41137,1651.0,20459.0,20395.0,283.0
2,2020-04-24,39254,1562.0,19577.0,19394.0,283.0
3,2020-04-23,37369,1469.0,18690.0,18395.0,284.0
4,2020-04-22,35396,1354.0,17718.0,17411.0,267.0


### d) Clean data for hospitalisations by date


In [89]:
covid_clfn_hosp = covid_clfn[['date','confirmed_cases', 'deaths', 'confirmed_hospitalizations', 'confirmed_icu',
       'suspected_hospitalizations', 'suspected_icu',
       'healthcare_worker_infections', 'healthcare_worker_deaths']]

In [90]:
covid_clfn_hosp.head()

Unnamed: 0,date,confirmed_cases,deaths,confirmed_hospitalizations,confirmed_icu,suspected_hospitalizations,suspected_icu,healthcare_worker_infections,healthcare_worker_deaths
0,2020-04-26,42164,1710.0,3324.0,1184.0,1604.0,289.0,4593.0,24.0
1,2020-04-25,41137,1651.0,3343.0,1198.0,1504.0,260.0,4453.0,22.0
2,2020-04-24,39254,1562.0,3344.0,1216.0,1536.0,305.0,4322.0,22.0
3,2020-04-23,37369,1469.0,3343.0,1204.0,1586.0,327.0,4153.0,
4,2020-04-22,35396,1354.0,3357.0,1219.0,1627.0,332.0,3877.0,
