Notebook to prototype/test update.py

- OPH page => PDF link
- PDF link => PDF file
- PDF file => text
- text => Table
- Table => CSV

In [1]:
import csv
import os
from html.parser import HTMLParser
from urllib.parse import urljoin

import requests as rq
from PyPDF4 import PdfFileReader as PDFR

In [2]:
# - Get page
# - Find PDF link (done by the parser in a following cell)

OPH_url = 'https://www.ottawapublichealth.ca/en/reports-research-and-statistics/la-maladie-coronavirus-covid-19.aspx'

# requests never times out by default; bound the wait so a stalled
# connection cannot hang the notebook, and fail on HTTP errors instead of
# parsing an error page as if it were the real one.
req = rq.get(OPH_url, timeout=30)
req.raise_for_status()

In [3]:
class PdfUrlParser(HTMLParser):
    '''
    HTML parser that records the first link whose target mentions a PDF.

    After feed(), ``pdf_url`` holds the absolute URL built from the first
    ``href`` attribute value containing 'pdf' (case-insensitive), or None
    when no such attribute was seen.
    '''

    def __init__(self, base_url=None):
        '''
        base_url -- URL of the page being parsed, used to resolve relative
                    links. When omitted, the module-level OPH_url is used.
        '''
        super().__init__()
        self.base_url = base_url
        self.pdf_url = None

    def handle_starttag(self, tag, attrs):
        '''
        Capture the first PDF-looking href among a start tag's attributes.
        '''
        if self.pdf_url is not None:
            # Only the first match is kept.
            return
        for name, value in attrs:
            # value is None for attributes written without a value
            # (e.g. <a href>); testing 'pdf' in None would raise TypeError.
            if name == 'href' and value and 'pdf' in value.lower():
                base = OPH_url if self.base_url is None else self.base_url
                # urljoin copes with absolute, root-relative and
                # page-relative links alike.
                self.pdf_url = urljoin(base, value)
                break
                
parser = PdfUrlParser()
parser.feed(req.text)
PDF_url = parser.pdf_url
# pdf_url stays None when the page has no matching link; stop here with a
# clear message rather than failing later inside the download request.
if PDF_url is None:
    raise RuntimeError('No PDF link found on ' + OPH_url)

In [4]:
# Fetch PDF
# The timeout bounds the wait on a stalled connection, and
# raise_for_status() keeps an HTTP error body from being saved as a PDF.
pdfreq = rq.get(PDF_url, timeout=60)
pdfreq.raise_for_status()

In [5]:
# Save the downloaded bytes under ./pdf/, named after the last URL segment.
PDF_file = './pdf/' + os.path.basename(PDF_url)
print(PDF_file)
# open() does not create missing directories, so make sure ./pdf/ exists.
os.makedirs(os.path.dirname(PDF_file), exist_ok=True)
with open(PDF_file, 'wb') as f:
    f.write(pdfreq.content)

./pdf/Web-PDF-COVID-2019-epi-update_20200501-y9i5.pdf


In [6]:
# Open the saved PDF and concatenate the extracted text of every page,
# in page order.
fr = PDFR(PDF_file)
text = ''.join(fr.getPage(pgn).extractText() for pgn in range(fr.numPages))

# Strip all newlines
# (random from one PDF to another)
text = text.replace('\n', '')


In [7]:

# Offsets of every case-insensitive occurrence of the table heading.
# The upper-cased copies are built once; redoing text.upper() for each
# index made the scan quadratic in the length of the text.
needle = 'Data Table for Figure'.upper()
haystack = text.upper()
res = [i for i in range(len(text)) if haystack.startswith(needle, i)]
# Show the first 30 characters at each hit to check the matches by eye.
[text[i:i+30] for i in res]

['Data Table for Figures 1 Earli',
 'Data table for Figure 2 Earlie',
 'Data Table for Figure 3  Week ']

In [8]:
# The first two headings open the two tables used below; the third heading
# marks where the second table stops.
start1, start2, end = res[0], res[1], res[2]
(start1, start2, end)

(7370, 8973, 10488)

In [9]:
# Need to get around some ugliness where the PDF reader ends up with dates
# split across lines: cut out each table's span of text and break it into
# whitespace-separated tokens.
snippet1, snippet2 = text[start1:start2].split(), text[start2:end].split()

In [10]:
# Display the tokens of the second table section for inspection.
snippet2

['Data',
 'table',
 'for',
 'Figure',
 '2',
 'Earliest',
 'of',
 'Onset,',
 'Test',
 'or',
 'Reported',
 'Date',
 'Daily',
 'Total',
 'of',
 'Ottawa',
 'Residents',
 'with',
 'Confirmed',
 'COVID-19',
 '2/10/2020',
 '2',
 '2/11/2020',
 '0',
 '2/12/2020',
 '0',
 '2/13/2020',
 '0',
 '2/14/2020',
 '0',
 '2/15/2020',
 '0',
 '2/16/2020',
 '0',
 '2/17/2020',
 '0',
 '2/18/2020',
 '0',
 '2/19/2020',
 '0',
 '2/20/2020',
 '1',
 '2/21/2020',
 '0',
 '2/22/2020',
 '1',
 '2/23/2020',
 '1',
 '2/24/2020',
 '1',
 '2/25/2020',
 '0',
 '2/26/2020',
 '0',
 '2/27/2020',
 '0',
 '2/28/2020',
 '0',
 '2/29/2020',
 '0',
 '3/01/2020',
 '2',
 '3/02/2020',
 '3',
 '3/03/2020',
 '1',
 '3/04/2020',
 '1',
 '3/05/2020',
 '6',
 '3/06/2020',
 '5',
 '3/07/2020',
 '3',
 '3/08/2020',
 '8',
 '3/09/2020',
 '8',
 '3/10/2020',
 '7',
 '3/11/2020',
 '8',
 'COVID-19',
 'Epidemiology',
 'Update',
 '|',
 'Ottawa',
 'Public',
 'Health',
 '11',
 'Earliest',
 'of',
 'Onset,',
 'Test',
 'or',
 'Reported',
 'Date',
 'Daily',
 'Total',
 'o

In [11]:
class DateNumTable(object):
    '''
    Table keyed by date string, filled from flat lists of text tokens.

    ``self.dict`` maps each date token (e.g. '2/19/2020') to a dict of
    {column name: value token}; values are stored as the original strings.
    '''
    def __init__(self):
        self.dict = {}

    def is_date(self, line):
        '''
        Return True when ``line`` looks like a date such as 2/19/2020:
        exactly three '/'-separated parts, each of them numeric.
        '''
        mdy = line.split('/')
        return len(mdy) == 3 and all(n.isnumeric() for n in mdy)

    def cell_to_col_dict(self, slice, col_name, col_idx):
        '''
        Scan ``slice`` for dates and store, under ``col_name``, the numeric
        token found ``col_idx`` positions after each date.

        slice    -- list of string tokens (the name shadows the builtin but
                    is kept so existing keyword callers still work)
        col_name -- column key the value is stored under
        col_idx  -- offset from a date token to its value token

        A date is skipped when its value position lies outside ``slice`` or
        the token there is not numeric. Columns already stored for a date
        are kept; the same column is overwritten.
        '''
        for i, token in enumerate(slice):
            if not self.is_date(token):
                continue
            j = i + col_idx
            # A date at the very end of the tokens has no value after it:
            # skip it instead of raising IndexError (or silently wrapping
            # around to the other end for a negative offset).
            if not 0 <= j < len(slice):
                continue
            value = slice[j]
            if value.isnumeric():
                self.dict.setdefault(token, {})[col_name] = value
    
    
dnt = DateNumTable()
# Each token list supplies one column; the value is the token that
# directly follows its date.
for tokens, column in ((snippet1, 'Total'), (snippet2, 'Daily')):
    dnt.cell_to_col_dict(tokens, column, 1)
dnt.dict


{'2/10/2020': {'Total': '2', 'Daily': '2'},
 '2/11/2020': {'Total': '2', 'Daily': '0'},
 '2/12/2020': {'Total': '2', 'Daily': '0'},
 '2/13/2020': {'Total': '2', 'Daily': '0'},
 '2/14/2020': {'Total': '2', 'Daily': '0'},
 '2/15/2020': {'Total': '2', 'Daily': '0'},
 '2/16/2020': {'Total': '2', 'Daily': '0'},
 '2/17/2020': {'Total': '2', 'Daily': '0'},
 '2/18/2020': {'Total': '2', 'Daily': '0'},
 '2/19/2020': {'Total': '2', 'Daily': '0'},
 '2/20/2020': {'Total': '3', 'Daily': '1'},
 '2/21/2020': {'Total': '3', 'Daily': '0'},
 '2/22/2020': {'Total': '4', 'Daily': '1'},
 '2/23/2020': {'Total': '5', 'Daily': '1'},
 '2/24/2020': {'Total': '6', 'Daily': '1'},
 '2/25/2020': {'Total': '6', 'Daily': '0'},
 '2/26/2020': {'Total': '6', 'Daily': '0'},
 '2/27/2020': {'Total': '6', 'Daily': '0'},
 '2/28/2020': {'Total': '6', 'Daily': '0'},
 '2/29/2020': {'Total': '6', 'Daily': '0'},
 '3/01/2020': {'Total': '8', 'Daily': '2'},
 '3/02/2020': {'Total': '11', 'Daily': '3'},
 '3/03/2020': {'Total': '12', '

In [12]:
# Spot-check a single entry of the merged table.
dnt.dict['4/25/2020']['Total']

'1214'

In [13]:
import csv

# Write the merged table as a CSV with one row per date.
# open() does not create missing directories, so make sure the output
# directory exists first.
os.makedirs('timeseries', exist_ok=True)
with open('timeseries/ottawa_cases.csv', 'w', newline='') as csvfile:
    fieldnames = ['Date', 'Total', 'Daily']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    for date, counts in dnt.dict.items():
        # A date may have been picked up from only one of the two tables;
        # write an empty cell for the missing column instead of raising
        # KeyError halfway through the file.
        writer.writerow({
            'Date': date,
            'Total': counts.get('Total', ''),
            'Daily': counts.get('Daily', ''),
        })
