### COVID-19 - CONFIRMED CASES BY DATE, PROVINCE, AGE, AND SEX

###### IMPORTING LIBRARIES

In [1]:
import pandas as pd
import plotly
import plotly.express as px

###### READING DATA

In [2]:
# Load the case records; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='./data/COVID19BE_CASES_AGESEX.csv')

###### EXPLORING DATA

In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55144 entries, 0 to 55143
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   DATE      55095 non-null  object
 1   PROVINCE  51690 non-null  object
 2   REGION    51690 non-null  object
 3   AGEGROUP  53636 non-null  object
 4   SEX       54275 non-null  object
 5   CASES     55144 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 2.5+ MB


In [4]:
# Preview the first five rows to see the column layout and value formats.
data.head()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
0,2020-03-01,Antwerpen,Flanders,40-49,M,1
1,2020-03-01,Brussels,Brussels,10-19,F,1
2,2020-03-01,Brussels,Brussels,10-19,M,1
3,2020-03-01,Brussels,Brussels,20-29,M,1
4,2020-03-01,Brussels,Brussels,30-39,F,1


In [5]:
# Preview the last five rows; in the captured output these have no DATE, and the
# final row carries only a CASES value.
data.tail()

Unnamed: 0,DATE,PROVINCE,REGION,AGEGROUP,SEX,CASES
55139,,VlaamsBrabant,Flanders,40-49,M,3
55140,,VlaamsBrabant,Flanders,50-59,M,1
55141,,WestVlaanderen,Flanders,20-29,F,1
55142,,WestVlaanderen,Flanders,50-59,M,3
55143,,,,,,1


In [6]:
data['DATE'].isna().sum() # number of rows whose DATE is missing (NaN)

49

In [7]:
# Row count per PROVINCE label; missing labels are counted too and listed first.
province_counts_raw = data['PROVINCE'].value_counts(dropna=False)
province_counts_raw.sort_index(na_position='first')

NaN               3454
Antwerpen         5693
BrabantWallon     3620
Brussels          5494
Hainaut           5114
Limburg           4722
Liège             4954
Luxembourg        3219
Namur             3807
OostVlaanderen    5113
VlaamsBrabant     4830
WestVlaanderen    5124
Name: PROVINCE, dtype: int64

In [8]:
# Row count per REGION label; missing labels are counted too and listed first.
region_counts_raw = data['REGION'].value_counts(dropna=False)
region_counts_raw.sort_index(na_position='first')

NaN          3454
Brussels     5494
Flanders    25482
Wallonia    20714
Name: REGION, dtype: int64

In [9]:
# Row count per AGEGROUP label; missing labels are counted too and listed first.
agegroup_counts_raw = data['AGEGROUP'].value_counts(dropna=False)
agegroup_counts_raw.sort_index(na_position='first')

NaN      1508
0-9      3868
10-19    4992
20-29    6450
30-39    6335
40-49    6371
50-59    6225
60-69    5574
70-79    5108
80-89    5003
90+      3710
Name: AGEGROUP, dtype: int64

In [10]:
# Row count per SEX label; missing labels are counted too and listed first.
sex_counts_raw = data['SEX'].value_counts(dropna=False)
sex_counts_raw.sort_index(na_position='first')

NaN      869
F      27669
M      26606
Name: SEX, dtype: int64

###### CLEANING DATA

In [11]:
# Keep only fully populated rows (no missing DATE, PROVINCE, REGION, AGEGROUP or SEX)
# so every remaining record can be placed in each breakdown computed below.
# NOTE(review): this also removes those records' CASES from the nationwide daily
# totals computed later in the notebook -- confirm that undercount is acceptable.
# `thresh` is deliberately not passed: pandas >= 1.5 raises TypeError when both
# `how` and `thresh` are supplied, even as `thresh=None`.
data.dropna(axis=0, how='any', inplace=True)

In [12]:
# Sanity check: no missing DATE values should remain after dropping incomplete rows.
data['DATE'].isna().sum()

0

In [13]:
# Re-check PROVINCE after cleaning; NaN is still counted so any leftover
# missing label would show up as the first entry.
province_counts_clean = data['PROVINCE'].value_counts(dropna=False)
province_counts_clean.sort_index(na_position='first')

Antwerpen         5521
BrabantWallon     3560
Brussels          5109
Hainaut           4926
Limburg           4637
Liège             4760
Luxembourg        3169
Namur             3751
OostVlaanderen    4908
VlaamsBrabant     4675
WestVlaanderen    4957
Name: PROVINCE, dtype: int64

In [14]:
# Re-check REGION after cleaning; NaN is still counted so any leftover
# missing label would show up as the first entry.
region_counts_clean = data['REGION'].value_counts(dropna=False)
region_counts_clean.sort_index(na_position='first')

Brussels     5109
Flanders    24698
Wallonia    20166
Name: REGION, dtype: int64

In [15]:
# Re-check AGEGROUP after cleaning; NaN is still counted so any leftover
# missing label would show up as the first entry.
agegroup_counts_clean = data['AGEGROUP'].value_counts(dropna=False)
agegroup_counts_clean.sort_index(na_position='first')

0-9      3715
10-19    4641
20-29    5832
30-39    5796
40-49    5836
50-59    5743
60-69    5212
70-79    4859
80-89    4763
90+      3576
Name: AGEGROUP, dtype: int64

In [16]:
# Re-check SEX after cleaning; NaN is still counted so any leftover
# missing label would show up as the first entry.
sex_counts_clean = data['SEX'].value_counts(dropna=False)
sex_counts_clean.sort_index(na_position='first')

F    25596
M    24377
Name: SEX, dtype: int64

In [17]:
# Parse the YYYY-MM-DD strings in DATE into datetime values so the column can
# drive the daily resampling in the analysis section.
parsed_dates = pd.to_datetime(data['DATE'], format='%Y-%m-%d')
data['DATE'] = parsed_dates

In [18]:
# Dropping rows left gaps in the row labels; renumber them 0..n-1 in place and
# discard the old labels.
data.index = pd.RangeIndex(len(data))

###### DATA ANALYSIS

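Questions (each answered as a daily time series):
- number of cases per day (all provinces and age groups combined)
- number of cases per province
- number of cases per age group
- number of cases per province per age group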

In [30]:
# Nationwide daily totals: bucket the rows by calendar day of DATE and add up CASES.
daily_totals = data.resample('D', on='DATE')['CASES'].sum()
data_by_date = daily_totals.reset_index()  # back to flat DATE / CASES columns
data_by_date.head(10)

Unnamed: 0,DATE,CASES
0,2020-03-01,19
1,2020-03-02,19
2,2020-03-03,34
3,2020-03-04,46
4,2020-03-05,81
5,2020-03-06,109
6,2020-03-07,27
7,2020-03-08,62
8,2020-03-09,90
9,2020-03-10,92


In [19]:
# Daily totals per province: resample each province's rows by calendar day
# and add up CASES.
province_groups = data.groupby(by=['PROVINCE'])
province_daily = province_groups.resample('D', on='DATE')['CASES'].sum()
data_by_province_date = province_daily.reset_index()  # PROVINCE / DATE / CASES columns
data_by_province_date.head(10)

Unnamed: 0,PROVINCE,DATE,CASES
0,Antwerpen,2020-03-01,1
1,Antwerpen,2020-03-02,1
2,Antwerpen,2020-03-03,5
3,Antwerpen,2020-03-04,6
4,Antwerpen,2020-03-05,11
5,Antwerpen,2020-03-06,11
6,Antwerpen,2020-03-07,8
7,Antwerpen,2020-03-08,9
8,Antwerpen,2020-03-09,8
9,Antwerpen,2020-03-10,14


In [20]:
# Daily totals per age group: resample each age group's rows by calendar day
# and add up CASES. Days without any row inside a group's span come out as 0.
agegroup_groups = data.groupby(by=['AGEGROUP'])
agegroup_daily = agegroup_groups.resample('D', on='DATE')['CASES'].sum()
data_by_agegroup_date = agegroup_daily.reset_index()  # AGEGROUP / DATE / CASES columns
data_by_agegroup_date.head(10)

Unnamed: 0,AGEGROUP,DATE,CASES
0,0-9,2020-03-03,1
1,0-9,2020-03-04,2
2,0-9,2020-03-05,3
3,0-9,2020-03-06,2
4,0-9,2020-03-07,0
5,0-9,2020-03-08,0
6,0-9,2020-03-09,2
7,0-9,2020-03-10,1
8,0-9,2020-03-11,4
9,0-9,2020-03-12,4


In [21]:
# Daily totals per (province, age group) pair: resample each pair's rows by
# calendar day and add up CASES. Days without any row inside a pair's span come out as 0.
province_agegroup_groups = data.groupby(by=['PROVINCE', 'AGEGROUP'])
province_agegroup_daily = province_agegroup_groups.resample('D', on='DATE')['CASES'].sum()
data_by_province_agegroup_date = province_agegroup_daily.reset_index()
data_by_province_agegroup_date.head(10)

Unnamed: 0,PROVINCE,AGEGROUP,DATE,CASES
0,Antwerpen,0-9,2020-03-04,1
1,Antwerpen,0-9,2020-03-05,1
2,Antwerpen,0-9,2020-03-06,0
3,Antwerpen,0-9,2020-03-07,0
4,Antwerpen,0-9,2020-03-08,0
5,Antwerpen,0-9,2020-03-09,0
6,Antwerpen,0-9,2020-03-10,0
7,Antwerpen,0-9,2020-03-11,1
8,Antwerpen,0-9,2020-03-12,0
9,Antwerpen,0-9,2020-03-13,0


###### DATA VISUALIZATION

In [36]:
# cases by date
# Single line chart of the nationwide daily totals.
labels_by_date = {'CASES' : 'Confirmed Cases', 'DATE' : 'Date'}
fig_cases_by_date = px.line(
    data_by_date,
    x='DATE', y='CASES',
    title='Confirmed Cases by Date',
    labels=labels_by_date,
)
fig_cases_by_date.show()

In [37]:
# cases by province
# One line per province (grouped and coloured by PROVINCE) over the daily totals.
labels_by_province = {'CASES' : 'Confirmed Cases', 'PROVINCE' : 'Province', 'DATE' : 'Date'}
fig_cases_by_province = px.line(
    data_by_province_date,
    x='DATE', y='CASES',
    color='PROVINCE',
    line_group='PROVINCE',
    title='Confirmed Cases by Province',
    labels=labels_by_province,
)
fig_cases_by_province.show()

In [23]:
# cases by age group
# One line per age group (grouped and coloured by AGEGROUP) over the daily totals.
labels_by_agegroup = {'CASES' : 'Confirmed Cases', 'AGEGROUP' : 'Age Group', 'DATE' : 'Date'}
fig_cases_by_agegroup = px.line(
    data_by_agegroup_date,
    x='DATE', y='CASES',
    color='AGEGROUP',
    line_group='AGEGROUP',
    title='Confirmed Cases by Age Group',
    labels=labels_by_agegroup,
)
fig_cases_by_agegroup.show()