Notebook to prototype/test update.py

- OPH page => PDF link
- PDF link => PDF file
- PDF file => text
- text => Table
- Table => CSV

In [1]:
import requests as rq

from html.parser import HTMLParser

from PyPDF4 import PdfFileReader as PDFR

import os

In [2]:
# - Get page
# - Find PDF link

OPH_url = 'https://www.ottawapublichealth.ca/en/reports-research-and-statistics/la-maladie-coronavirus-covid-19.aspx'

# The timeout keeps the cell from hanging indefinitely on a stalled
# connection; raise_for_status() turns an HTTP error response into an
# exception instead of letting an error page be scanned for links.
req = rq.get(OPH_url, timeout=30)
req.raise_for_status()

In [3]:
class PdfUrlParser(HTMLParser):
    """Find the first link in an HTML document whose ``href`` mentions 'pdf'.

    After ``feed()`` has been called, ``pdf_url`` holds the absolute URL of
    the first matching link, or ``None`` if no such link was seen.
    """

    def __init__(self, base_url=None):
        """Create the parser.

        base_url: URL of the page being parsed; relative links are resolved
            against it. When omitted, the notebook-level ``OPH_url`` is used.
        """
        super().__init__()
        self.base_url = OPH_url if base_url is None else base_url
        self.pdf_url = None

    def handle_starttag(self, tag, attrs):
        """Record the first ``href`` value that contains 'pdf'."""
        # Only the first match is kept; later tags are ignored.
        if self.pdf_url is not None:
            return

        from urllib.parse import urljoin

        for name, value in attrs:
            # A value-less attribute (e.g. ``<a href>``) is reported with
            # value None, which must not be searched as a string.
            if name == 'href' and value is not None and 'pdf' in value:
                # urljoin copes with root-relative, page-relative and
                # already-absolute links alike.
                self.pdf_url = urljoin(self.base_url, value)
                break
                
# Scan the downloaded page for the PDF link.
parser = PdfUrlParser()
parser.feed(req.text)
PDF_url = parser.pdf_url

# Fail here with a clear message rather than later, when a None URL would
# be handed to the download step.
if PDF_url is None:
    raise RuntimeError('No PDF link found on ' + OPH_url)

In [4]:
# Fetch PDF
# The timeout bounds the wait on a stalled connection; raise_for_status()
# prevents an HTTP error body from being saved as if it were the PDF.
pdfreq = rq.get(PDF_url, timeout=30)
pdfreq.raise_for_status()

In [5]:
# Save the downloaded bytes under ./pdf/, named after the last segment of
# the PDF URL.
PDF_file = './pdf/' + os.path.basename(PDF_url)
print(PDF_file)

# The target directory may not exist yet; create it instead of failing.
os.makedirs(os.path.dirname(PDF_file), exist_ok=True)
with open(PDF_file, 'wb') as f:
    f.write(pdfreq.content)

./pdf/Web-PDF-COVID-2019-epi-update_20200425-ri8ri.pdf


In [6]:
# Create a PDF reader object for the saved file.
fr = PDFR(PDF_file)

# Concatenate the extracted text of every page; join() avoids rebuilding
# the accumulated string once per page.
text = ''.join(fr.getPage(pgn).extractText() for pgn in range(fr.numPages))

# Strip all newlines
# (random from one PDF to another)
text = text.replace('\n', '')


In [7]:
# Locate the headings that bracket the data table for figures 1 and 2.
# str.index raises ValueError if a heading is missing.
start = text.index('Data Table for Figures 1 and 2')
# Search for the closing heading only after the opening one, so an earlier
# occurrence of the same phrase cannot produce end < start (which would
# silently yield an empty slice).
end = text.index('Data Table for Figure 3', start)
start, end

(9938, 11975)

In [8]:
# The PDF reader can leave dates split across lines, so the table is not
# read line by line: the text between the two headings is cut out and
# broken into whitespace-separated tokens instead.
table_text = text[start:end]
snippet = table_text.split()

In [9]:
# Number of whitespace-separated tokens in the extracted table text.
len(snippet)

334

In [10]:
class Table:
    """Collect CSV-formatted lines for (date, total, daily) triples.

    ``row`` is a list of strings; its first entry is the CSV header.
    """

    def __init__(self):
        self.row = ['Date,Total,Daily']

    def add_row(self, date, total, daily):
        """Append one comma-separated line built from the three values."""
        self.row.append('{},{},{}'.format(date, total, daily))

    def is_date(self, line):
        """Return True if *line* has the form 2/19/2020.

        That is: exactly three '/'-separated fields, each of them numeric.
        """
        fields = line.split('/')
        return len(fields) == 3 and all(field.isnumeric() for field in fields)

    def process(self, slice):
        """Add a row for each date token directly followed by two numeric tokens.

        slice: sequence of string tokens to scan.
        """
        # Slide a three-token window over the sequence; zip stops as soon as
        # fewer than three tokens remain.
        for date, total, daily in zip(slice, slice[1:], slice[2:]):
            if self.is_date(date) and total.isnumeric() and daily.isnumeric():
                self.add_row(date, total, daily)

# Build the table from the token list; each date token followed by two
# numeric tokens becomes one line in table.row.
table = Table()
table.process(snippet)

In [11]:
# Inspect the first five entries (the CSV header comes first).
table.row[:5]

['Date,Total,Daily',
 '2/19/2020,0,0',
 '2/20/2020,0,0',
 '2/21/2020,0,0',
 '2/22/2020,0,0']

In [12]:
# Inspect the last five entries.
table.row[-5:]

['4/20/2020,985,18',
 '4/21/2020,1002,48',
 '4/22/2020,1044,39',
 '4/23/2020,1060,48',
 '4/24/2020,1060,21']

In [13]:
# Ready to write the csv!
# The handle is deliberately not named `csv`, so it cannot shadow the
# standard-library module of that name; the encoding is explicit so the
# output does not depend on the platform default.
with open('timeseries/ottawa_cases.csv', 'w', encoding='utf-8') as out:
    out.writelines(line + '\n' for line in table.row)