### COVID-19 - CONFIRMED CASES BY DATE, PROVINCE, AGE, AND SEX

###### IMPORTING LIBRARIES

In [204]:
import pandas as pd
import plotly
import plotly.express as px

###### READING DATA

In [205]:
# Load the raw case records. The path is relative, so it resolves against the
# kernel's current working directory.
data = pd.read_csv('../data/covid19be_cases_agesex.csv')

###### EXPLORING DATA

In [206]:
# Summarise column dtypes and non-null counts to see which columns have gaps.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56270 entries, 0 to 56269
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DATE      56221 non-null  object
 1   PROVINCE  52739 non-null  object
 2   REGION    52739 non-null  object
 3   AGEGROUP  54734 non-null  object
 4   SEX       55382 non-null  object
 5   CASES     56270 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.6+ MB


In [207]:
# Preview the first rows (head() defaults to five).
data.head()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
0,2020-03-01,Antwerpen,Flanders,40-49,M,1
1,2020-03-01,Brussels,Brussels,10-19,F,1
2,2020-03-01,Brussels,Brussels,10-19,M,1
3,2020-03-01,Brussels,Brussels,20-29,M,1
4,2020-03-01,Brussels,Brussels,30-39,F,1


In [208]:
# Preview the last rows; in the recorded output these rows have no DATE value.
data.tail()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
56265,,VlaamsBrabant,Flanders,40-49,M,3
56266,,VlaamsBrabant,Flanders,50-59,M,1
56267,,WestVlaanderen,Flanders,20-29,F,1
56268,,WestVlaanderen,Flanders,50-59,M,3
56269,,,,,,1


In [209]:
# Missing-value count for every column, as a list of (column, count) pairs.
nan_counts = list(data.isna().sum().items())
nan_counts

[('DATE', 49),
 ('PROVINCE', 3531),
 ('REGION', 3531),
 ('AGEGROUP', 1536),
 ('SEX', 888),
 ('CASES', 0)]

In [210]:
data['DATE'].isna().sum() # number of rows whose DATE is missing

49

In [211]:
# Rows per province, NaN included and listed first. The frequency sort is
# skipped because sort_index() reorders the result by label anyway.
(
    data['PROVINCE']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

NaN               3531
Antwerpen         5792
BrabantWallon     3709
Brussels          5594
Hainaut           5213
Limburg           4822
Liège             5043
Luxembourg        3296
Namur             3899
OostVlaanderen    5215
VlaamsBrabant     4929
WestVlaanderen    5227
Name: PROVINCE, dtype: int64

In [212]:
# Rows per region, NaN included and listed first. The frequency sort is
# skipped because sort_index() reorders the result by label anyway.
(
    data['REGION']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

NaN          3531
Brussels     5594
Flanders    25985
Wallonia    21160
Name: REGION, dtype: int64

In [213]:
# Rows per age group, NaN included and listed first. The frequency sort is
# skipped because sort_index() reorders the result by label anyway.
(
    data['AGEGROUP']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

NaN      1536
0-9      3979
10-19    5107
20-29    6571
30-39    6457
40-49    6494
50-59    6341
60-69    5688
70-79    5208
80-89    5105
90+      3784
Name: AGEGROUP, dtype: int64

In [214]:
# Rows per sex, NaN included and listed first. The frequency sort is
# skipped because sort_index() reorders the result by label anyway.
(
    data['SEX']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

NaN      888
F      28233
M      27149
Name: SEX, dtype: int64

###### CLEANING DATA

In [215]:
# Drop the rows that have no DATE: they cannot be placed on the time axis used
# by the daily resampling later in the notebook.
# Only `subset` is passed. `how` and `thresh` are mutually exclusive in pandas
# (supplying both raises TypeError on pandas >= 2.0), and `axis=0` / `how='any'`
# are already the defaults, so the selection of rows is unchanged.
data.dropna(subset=['DATE'], inplace=True)

In [216]:
# Replace missing categorical values with an explicit 'MISSING' label so they
# show up as their own category in the counts and groupings.
# The filled columns are assigned back to the frame rather than calling
# fillna(inplace=True) on data[col]: that chained form operates on a column
# selection, emits a FutureWarning on pandas 2.x and silently does nothing
# under Copy-on-Write.
categorical_columns = ['PROVINCE', 'REGION', 'AGEGROUP', 'SEX']
data[categorical_columns] = data[categorical_columns].fillna('MISSING')

In [217]:
# Re-count missing DATE values after cleaning; the expected result is 0.
data['DATE'].isna().sum()

0

In [218]:
# Re-check rows per province after cleaning. dropna=False is kept so that any
# NaN that survived the fill would still be visible here.
(
    data['PROVINCE']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

Antwerpen         5788
BrabantWallon     3705
Brussels          5587
Hainaut           5208
Limburg           4821
Liège             5030
Luxembourg        3295
MISSING           3530
Namur             3898
OostVlaanderen    5208
VlaamsBrabant     4926
WestVlaanderen    5225
Name: PROVINCE, dtype: int64

In [219]:
# Re-check rows per region after cleaning. dropna=False is kept so that any
# NaN that survived the fill would still be visible here.
(
    data['REGION']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

Brussels     5587
Flanders    25968
MISSING      3530
Wallonia    21136
Name: REGION, dtype: int64

In [220]:
# Re-check rows per age group after cleaning. dropna=False is kept so that any
# NaN that survived the fill would still be visible here.
(
    data['AGEGROUP']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

0-9        3979
10-19      5105
20-29      6566
30-39      6452
40-49      6483
50-59      6333
60-69      5685
70-79      5204
80-89      5101
90+        3784
MISSING    1529
Name: AGEGROUP, dtype: int64

In [221]:
# Re-check rows per sex after cleaning. dropna=False is kept so that any
# NaN that survived the fill would still be visible here.
(
    data['SEX']
    .value_counts(sort=False, dropna=False)
    .sort_index(na_position='first')
)

F          28213
M          27121
MISSING      887
Name: SEX, dtype: int64

In [222]:
# Parse the DATE strings (YYYY-MM-DD) into datetimes; the daily resampling in
# the analysis cells is keyed on this column.
data['DATE'] = pd.to_datetime(data['DATE'], format='%Y-%m-%d')

In [223]:
# Renumber the rows consecutively from 0, discarding the old index labels.
data = data.reset_index(drop=True)

###### DATA ANALYSIS

In [224]:
# Daily totals for the whole dataset: index by DATE, bucket CASES per calendar
# day, sum each bucket, then turn DATE back into a regular column.
data_by_date = (
    data.set_index('DATE')['CASES']
    .resample('D')
    .sum()
    .reset_index()
)
data_by_date.head(10)

Unnamed: 0,DATE,CASES
0,2020-03-01,19
1,2020-03-02,19
2,2020-03-03,34
3,2020-03-04,53
4,2020-03-05,81
5,2020-03-06,110
6,2020-03-07,27
7,2020-03-08,64
8,2020-03-09,94
9,2020-03-10,99


In [225]:
# Daily totals per province. Resampling runs inside each province group, so a
# day without rows between a group's first and last date gets a sum of 0.
data_by_province_date = (
    data.set_index('DATE')
    .groupby(by=['PROVINCE'], dropna=False)['CASES']
    .resample('D')
    .sum()
    .reset_index()
)
data_by_province_date.head(10)

Unnamed: 0,PROVINCE,DATE,CASES
0,Antwerpen,2020-03-01,1
1,Antwerpen,2020-03-02,1
2,Antwerpen,2020-03-03,5
3,Antwerpen,2020-03-04,6
4,Antwerpen,2020-03-05,11
5,Antwerpen,2020-03-06,11
6,Antwerpen,2020-03-07,8
7,Antwerpen,2020-03-08,9
8,Antwerpen,2020-03-09,8
9,Antwerpen,2020-03-10,14


In [226]:
# Daily totals per age group. Resampling runs inside each age-group bucket, so
# a day without rows between a group's first and last date gets a sum of 0.
data_by_agegroup_date = (
    data.set_index('DATE')
    .groupby(by=['AGEGROUP'], dropna=False)['CASES']
    .resample('D')
    .sum()
    .reset_index()
)
data_by_agegroup_date.head(10)

Unnamed: 0,AGEGROUP,DATE,CASES
0,0-9,2020-03-03,1
1,0-9,2020-03-04,3
2,0-9,2020-03-05,3
3,0-9,2020-03-06,2
4,0-9,2020-03-07,0
5,0-9,2020-03-08,0
6,0-9,2020-03-09,2
7,0-9,2020-03-10,1
8,0-9,2020-03-11,4
9,0-9,2020-03-12,4


In [227]:
# Daily totals per (province, age group) pair. Resampling runs inside each
# pair, so a day without rows between a pair's first and last date gets 0.
data_by_province_agegroup_date = (
    data.set_index('DATE')
    .groupby(by=['PROVINCE', 'AGEGROUP'], dropna=False)['CASES']
    .resample('D')
    .sum()
    .reset_index()
)
data_by_province_agegroup_date.head(10)

Unnamed: 0,PROVINCE,AGEGROUP,DATE,CASES
0,Antwerpen,0-9,2020-03-04,1
1,Antwerpen,0-9,2020-03-05,1
2,Antwerpen,0-9,2020-03-06,0
3,Antwerpen,0-9,2020-03-07,0
4,Antwerpen,0-9,2020-03-08,0
5,Antwerpen,0-9,2020-03-09,0
6,Antwerpen,0-9,2020-03-10,0
7,Antwerpen,0-9,2020-03-11,1
8,Antwerpen,0-9,2020-03-12,0
9,Antwerpen,0-9,2020-03-13,0


###### DATA VISUALIZATION

In [228]:
# Line chart of the daily confirmed-case totals across the whole dataset.
fig_cases_by_date = px.line(
    data_by_date,
    x='DATE',
    y='CASES',
    title='Confirmed Cases by Date',
    labels={'DATE': 'Date', 'CASES': 'Confirmed Cases'},
)

# Render the figure in the notebook output.
fig_cases_by_date.show()

In [229]:
# Line chart of daily confirmed cases with one coloured line per province.
fig_cases_by_province = px.line(
    data_by_province_date,
    x='DATE',
    y='CASES',
    color='PROVINCE',
    line_group='PROVINCE',
    title='Confirmed Cases by Province',
    labels={'DATE': 'Date', 'PROVINCE': 'Province', 'CASES': 'Confirmed Cases'},
)

# Render the figure in the notebook output.
fig_cases_by_province.show()

In [230]:
# Line chart of daily confirmed cases with one coloured line per age group.
fig_cases_by_agegroup = px.line(
    data_by_agegroup_date,
    x='DATE',
    y='CASES',
    color='AGEGROUP',
    line_group='AGEGROUP',
    title='Confirmed Cases by Age Group',
    labels={'DATE': 'Date', 'AGEGROUP': 'Age Group', 'CASES': 'Confirmed Cases'},
)

# Render the figure in the notebook output.
fig_cases_by_agegroup.show()