# Tests per agegroup
Data extracted from weekly report of RIVM

In [1]:
import os
import pandas as pd
import tabula
import requests
from bs4 import BeautifulSoup
import PyPDF2
import re

rivm_url = 'https://www.rivm.nl/coronavirus-covid-19/actueel/wekelijkse-update-epidemiologische-situatie-covid-19-in-nederland'
response = requests.get(rivm_url)
rivm_pdf_data_loc = './data/pdfs/'

In [2]:
url = []
href = []
period = []
name = []
datetime = []
week = []
filename = []

def columns(l):
    def ll(l):
        return -l.count('')
    
    if 'http' in l:
        l = l[ll(l) + 20:]
    
    href.append(l)
    period.append(pd.Period(l[ll(l) + 22 : ll(l) + 29],"M"))
    name.append(l[ll(l) + 65 : -4])
    datetime.append(pd.to_datetime(l[ll(l) + 65 : ll(l) + 73], format="%Y%m%d"))
    filename.append(l[ll(l) + 65:])
    url.append('https://www.rivm.nl'+l)

In [3]:
soup = BeautifulSoup(response.text, 'html.parser') 
aa = soup.find_all('a')
for a in aa:
    link = a.get('href')
    if link == None:
        pass
    elif link.endswith('.pdf'):
        columns(link)

pdfs = pd.DataFrame([period, datetime, week, name, filename, href, url]).T
pdfs = pdfs.rename(columns={0:'period',1:'date',2:'week',3:'name',4:'filename',5:'href',6:'url'})
pdfs['week'] = pdfs.date.dt.isocalendar().week
pdfs.sort_values(by='period', inplace=True)

In [4]:
def extract_tabel(pdf,csv,date):
    
    def check_tabel(value):
        for key in checks:
            if checks[key]:
                pass
            elif isinstance(value,str) and key in value:
                checks[key] = True
    
    def create_agegroup_tabel(tabel):
        #print(tabel)
        if checks['Leeftijdsgroep'] and checks['fgelopen']:
            category = 'Leeftijd'
        elif checks['Groep'] and checks['fgelopen'] and checks['Kinderen']:
            category = 'Groep'
        elif checks['Groep'] and checks['Kinderen']:
            category = 'Groep'
        else:
            category = ''
        #print('cat='+category)
        
        if category != '':
            tabel.dropna(inplace=True)
            if tabel.empty:
                #print("df empty")
                pass
            else:
                first_column = tabel.columns[0]
                #first_column = 'Unnamed: 0'
                if len(tabel.columns) > 1:
                    second_column = tabel.columns[1]
                agegroup_tabel = tabel.loc[(tabel[first_column].str.count('-') > 0)]                
                if category == 'Leeftijd':
                    agegroup_tabel = agegroup_tabel.append(tabel.loc[(tabel[first_column].str.find('+') > 0)])
                    agegroup_tabel.rename(columns={first_column:'Leeftijdsgroep',second_column:date}, inplace=True)
                elif category == 'Groep':
                    agegroup_tabel = agegroup_tabel.append(tabel.loc[(tabel[first_column].str.count('fgelopen') > 0)])
                    agegroup_tabel = agegroup_tabel.append(tabel.loc[(tabel[first_column].str.count('Groep') > 0)])
                agegroup_tabel = agegroup_tabel.sort_index()
                agegroup_tabel['Date'] = date
                agegroup_tabel.to_csv(csv, index=False)
                #print('CSV CREATED')
                #print(agegroup_tabel)
                return True
        #else:
            #print("NOTHING")
    
    #print('extracting '+pdf)
    #print(pdf)
    found_pages = []
    object = PyPDF2.PdfFileReader(pdf)
    NumPages = object.getNumPages()
    String = "SARS-COV-2.*TESTEN.*AFGENOMEN.*DOOR.*DE.*GGD"
    
    for i in range(0, NumPages):
        PageObj = object.getPage(i)
        Text = PageObj.extractText()
        if re.search(String,Text):
            found_pages.append(i)
    
    if len(found_pages) == 0:
        print("NO CHAPTER FOUND in "+str(pdf))
    else:
        tabels = tabula.read_pdf(input_path=pdf, pages=found_pages, guess=True, stream=False, multiple_tables=True)
        csv_created = None
        for tabel in tabels:
            checks = {
                'Leeftijdsgroep': False,
                'fgelopen': False,
                'Groep': False,
                'Kinderen': False}
            for column in tabel:
                check_tabel(column)
                for i in tabel[column].values:
                    check_tabel(i)
            #print(checks)
            csv_created = create_agegroup_tabel(tabel)
            #print(csv_created)
        if csv_created == None:
            no_tabel_found.append(pdf)
            #print('No tabel found in: '+pdf)

In [5]:
# download all pdfs
print('Checking for new weekreport')
no_tabel_found = []
for row in pdfs.itertuples():
    pdf = rivm_pdf_data_loc+row.filename
    csv = pdf[:-3]+'csv'
    try:
        os.stat(csv)
        #print('skiping '+csv+' exists')
    except:
        #print(pdf)
        try:
            os.stat(pdf)
        except:
            #print('downloading '+row.filename)
            r = requests.get(row.url, stream=True)
            with open(pdf, 'wb') as f:
                f.write(r.content)
        finally:
            #print(pdf)
            extract_tabel(pdf, csv, row.date)
print("DONE")
print("")
print("Nog tabels found in:")
no_tabel_found

Checking for new weekreport
DONE

Nog tabels found in:


['./data/pdfs/20200714_1040.pdf', './data/pdfs/20210406_1518.pdf']

In [185]:
# clean up extracted tabels
## values in string as list. Find out what value to use.
data = pd.DataFrame()

first = True  
for filename in os.listdir(rivm_pdf_data_loc):
    if filename in ['20200908_1159.csv', '20210413_1259.csv', '20200825_1217.csv']:
        # There is a better tabel in the pdf, check tabula scrupt ans get that tabel
        pass
    elif '.csv' in filename:
        #print(filename)
        path = rivm_pdf_data_loc+filename
        df = pd.read_csv(path)
        #print(df.iloc[0,0])
        
        # clean df's
        if df.iloc[0,0] == 'Groep':
            #print('GROEP')
            df.rename(columns={'0': df.loc[0,'0'], '1': df.loc[0,'1'], '2': df.loc[0,'2'], '3': df.loc[0,'3']}, inplace=True)
            df.rename(columns={'Groep': 'Agegroup'}, inplace=True)
        elif df.iloc[0,0] == 'Groep Aantal Aantal':
            #print('GROEP AANTAL AANTAL')
            # splitting multi numbers in column
            df_temp = df['3'].str.split(expand=True)
            df['Aantal getest'] = df_temp.iloc[:,0]
            df['Percentage positief'] = df_temp.iloc[:,1]
            df.rename(columns={'0': 'Agegroup', '2': 'Aantal positief'}, inplace=True)
            df.drop(columns=['1','3'], inplace=True)
            # splitting numbers and title group
            df_temp = df.iloc[:,0].str.split(" ",3, expand=True)
            df_temp = df_temp.iloc[:,:3]
            df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
            # cleaning up df
            df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
        elif df.iloc[0,0][0:8] == 'Kinderen':
            #print('KINDEREN')
            # splitting multi numbers in column
            df_temp = df['2'].str.split(expand=True)
            df['Aantal positief'] = df_temp.iloc[:,0]
            df['Aantal getest'] = df_temp.iloc[:,1]
            df.rename(columns={'0': 'Agegroup', '3': 'Percentage positief'}, inplace=True)
            df.drop(columns=['1','2'], inplace=True)
            # splitting numbers and title group
            df_temp = df.iloc[:,0].str.split(" ",3, expand=True)
            df_temp = df_temp.iloc[:,:3]
            df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
            # cleaning up df
            df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
        elif df.iloc[0,0] == 'Groep Aantal positief Aantal getest Percentage positief':
            #print('ALL IN STRING COLUMN 0')
            # dropping rows
            for i,v in df.loc[:,'0'].items():
                if 'twee weken' in v:
                    ii = i
            df = df.iloc[ii+2:ii+5,:]
            # splitting multi numbers in column
            df_temp = df['0'].str.split(expand=True)
            df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
            df['Aantal positief'] = df_temp.iloc[:,3]
            df['Aantal getest'] = df_temp.iloc[:,4]
            df['Percentage positief'] = df_temp.iloc[:,5]
            # cleaning up df
            df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
        elif df.iloc[0,0] == 'Vanaf 1 juni Afgelopen kalender week1':
            #print('ALL IN STRING COLUMN 01')
            # dropping rows
            df = df[df.iloc[:,0] != 'Vanaf 1 juni Afgelopen kalender week1']
            df = df[df.iloc[:,0] != 'Groep Aantal Aantal Percentage Aantal Aantal Percentage']
            # splitting multi numbers in column
            df_temp = df['0'].str.split(expand=True)
            df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
            df['Aantal positief'] = df_temp.iloc[:,-3]
            df['Aantal getest'] = df_temp.iloc[:,-2]
            df['Percentage positief'] = df_temp.iloc[:,-1]
            # cleaning up df
            df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
        elif df.iloc[0,0] == 'Vanaf 31 augustus Afgelopen kalender week1':
            #print('ALL IN STRING COLUMN 31')
            # dropping rows
            df = df[df.iloc[:,0] != 'Vanaf 31 augustus Afgelopen kalender week1']
            df = df[df.iloc[:,0] != 'Groep Aantal Aantal Percentage Aantal Aantal Percentage']
            # splitting multi numbers in column
            df_temp = df['0'].str.split(expand=True)
            df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
            df['Aantal positief'] = df_temp.iloc[:,-3]
            df['Aantal getest'] = df_temp.iloc[:,-2]
            df['Percentage positief'] = df_temp.iloc[:,-1]
            # cleaning up df
            df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
        elif df.columns[0] == 'Leeftijdsgroep':
            #print('LEEFTIJDSGROEP')
            if len(df.iloc[0,0]) < 10:
                if len(df.columns) < 4:
                    df['Leeftijdsgroep'].replace({'70\+':'70 en ouder'}, regex=True, inplace=True)
                    df_temp = df.iloc[:,1].str.split(expand=True)
                    #print(df_temp)
                    df['Agegroup'] = df['Leeftijdsgroep']
                    df['Aantal positief'] = df_temp.iloc[:,-3]
                    df['Aantal getest'] = df_temp.iloc[:,-2]
                    df['Percentage positief'] = df_temp.iloc[:,-1]
                    df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
                    #print(filename)
                    #print(df)
                else:
                    df['Leeftijdsgroep'].replace({'70\+':'70 en ouder'}, regex=True, inplace=True)
                    df_temp = df.iloc[:,2].str.split(expand=True)
                    #print(df_temp)
                    df['Agegroup'] = df['Leeftijdsgroep']
                    df['Aantal positief'] = df_temp.iloc[:,-3]
                    df['Aantal getest'] = df_temp.iloc[:,-2]
                    df['Percentage positief'] = df_temp.iloc[:,-1]
                    df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
                    #print(filename)
                    #print(df)
            else:
                df['Leeftijdsgroep'].replace({'70\+':'70 en ouder'}, regex=True, inplace=True)
                df_temp = df['Leeftijdsgroep'].str.split(expand=True)
                df['Agegroup'] = df_temp.iloc[:,0]+' '+df_temp.iloc[:,1]+' '+df_temp.iloc[:,2]
                df['Aantal positief'] = df_temp.iloc[:,-2]
                df['Aantal getest'] = df_temp.iloc[:,-1]
                df['Percentage positief'] = df.iloc[:,1]
                df = df[['Agegroup', 'Aantal positief', 'Aantal getest', 'Percentage positief', 'Date']]
                #print(filename)
                #print(df)
        else:
            print(df.iloc[0,0][0:8])
            print('ELSE')
        
        df = df[df['Agegroup'] != 'Groep']
        df = df[df['Agegroup'] != 'Onderwijs- of kinderopvang personeel']
        df = df[df['Agegroup'] != 'Onderwijs- of kinderopvang']
        df = df[df['Agegroup'] != 'Groep Aantal Aantal']
        
        if first:
            #print(df)
            data = pd.concat([data, df])
            first = False
        elif str(df.columns) == str(data.columns):
            #print('concatenating')
            data = pd.concat([data, df])
        else:
            print('ERROR')
            print(data.columns)
            print(df.columns)
            #data = pd.merge(data, df, on='Groep', how='outer')
            break # for debug        

data['Date'] = pd.to_datetime(data['Date'])
data.set_index('Date', inplace=True)
data.index = pd.PeriodIndex(data.index, freq='D')
data['Aantal positief'] = data['Aantal positief'].astype(int)
data['Aantal getest'] = data['Aantal getest'].astype(float, errors='ignore')
data['Percentage positief'] = data['Percentage positief'].astype(float)
data

Unnamed: 0_level_0,Agegroup,Aantal positief,Aantal getest,Percentage positief
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-07-21,Kinderen 0-6 jaar,5,898.0,0.6
2020-07-21,Kinderen 7-12 jaar,23,6221.0,0.4
2020-07-21,Kinderen 13-18 jaar,61,4864.0,1.3
2020-08-11,Kinderen 0-6 jaar,5,443.0,1.1
2020-08-11,Kinderen 7-12 jaar,46,1712.0,2.7
...,...,...,...,...
2021-03-30,30 - 39,95266,6796.0,7.1
2021-03-30,40 - 49,78325,6735.0,8.6
2021-03-30,50 - 59,60882,6949.0,11.4
2021-03-30,60 - 69,45228,4261.0,9.4


In [186]:
for i,v in data['Aantal getest'].items():
    print(i)
    print(v)
    print(type(v))
    print()

2020-07-21
898.0
<class 'float'>

2020-07-21
6221.0
<class 'float'>

2020-07-21
4864.0
<class 'float'>

2020-08-11
443.0
<class 'float'>

2020-08-11
1712.0
<class 'float'>

2020-08-11
4135.0
<class 'float'>

2020-07-07
7033.0
<class 'float'>

2020-07-07
19446.0
<class 'float'>

2020-07-07
8661.0
<class 'float'>

2020-08-18
538.0
<class 'float'>

2020-08-18
2602.0
<class 'float'>

2020-08-18
7172.0
<class 'float'>

2020-07-28
936.0
<class 'float'>

2020-07-28
5099.0
<class 'float'>

2020-07-28
6320.0
<class 'float'>

2020-08-04
394.0
<class 'float'>

2020-08-04
1510.0
<class 'float'>

2020-08-04
3586.0
<class 'float'>

2020-09-29
72.0
<class 'float'>

2020-09-29
1151.0
<class 'float'>

2020-09-29
23414.0
<class 'float'>

2020-09-22
829.0
<class 'float'>

2020-09-22
10010.0
<class 'float'>

2020-09-22
22762.0
<class 'float'>

2020-09-01
947.0
<class 'float'>

2020-09-01
6621.0
<class 'float'>

2020-09-01
nan
<class 'float'>

2020-09-15
1214.0
<class 'float'>

2020-09-15
12720.0
<class 'f

In [190]:
%matplotlib widget
data.plot()

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<AxesSubplot:xlabel='Date'>

## Chapter 7 or 9

## What can be used from the reports

### Chapter 4 agegroups
Reported, Hospitalised, Deceased agegroups

https://data.rivm.nl/covid-19/COVID-19_casus_landelijk.json