In [9]:
# Import necessary libraries.
import requests
import camelot
import pandas as pd
from datetime import date, timedelta

In [5]:
# Define the variable for the date string.
yesterday = date.today() - timedelta(days=1)
yesterday_a = yesterday.strftime('%Y%m%d')
today = date.today()
today_a = today.strftime('%Y%m%d')
print(yesterday_a)
print(today_a)

20220220
20220221


In [6]:
# Download source latest PDF file "Arrangements for Compulsory Testing
# in respect of Buildings Resided by COVID-19 Cases with the N501Y/L452R variants
# in accordance with the Compulsory Testing Notice issued on 10 January 2022" from coronavirus.gov.hk.
url = 'https://www.coronavirus.gov.hk/pdf/CTN_Specified_premises_and_Dates_of_Testing.pdf'
r = requests.get(url, allow_redirects=True)

file_path = 'data/CTN_Specified_premises_and_Dates_of_Testing_' + today_a + '.pdf'

open(file_path, 'wb').write(r.content)

465798

In [7]:
# Define the source PDF file path.
source_file = file_path

In [8]:
# Extract all tables from all pages in a PDF file.
data_tables = camelot.read_pdf(source_file, pages='all')

In [10]:
# Print the number or tables are bthat eing extracted.
print("Total Number of Tables Being Extracted:", data_tables.n)

Total Number of Tables Being Extracted: 8


In [11]:
# Parse all extract tables into a single list.
data_list = []

for t in range(data_tables.n):
    data = data_tables[t].df
    data_list.append(data)

In [12]:
# Convert list into Pandas dataframe.
data_final_1 = pd.concat(data_list, ignore_index=False, sort=False)

In [13]:
# Change the dataframe column headers.
data_final_1.columns = ['Specified_Premises', 'Dates_of_Testing']

In [14]:
# Display header of dataframe.
data_final_1

Unnamed: 0,Specified_Premises,Dates_of_Testing
0,指明地方 \nSpecified Premises,檢測日期 \n(日/月/年) \nDates of Testing \n(DD/MM/YYYY)
1,沙田利安邨利盛樓（不包括位於地下 A 及 B 翼的香港神託會培\n真幼稚園） \nLee S...,20/02/2022 ~ 22/02/2022
2,"東涌滿東邨滿和樓 \nMun Wo House, Mun Tung Estate, Tung...",19/02/2022 ~ 21/02/2022
3,"慈雲山慈樂邨樂誠樓 \nLok Shing House, Tsz Lok Estate, T...",19/02/2022 ~ 21/02/2022
4,"鑽石山鳳德邨紫鳳樓 \nTsz Fung House, Fung Tak Estate, D...",19/02/2022 ~ 21/02/2022
...,...,...
14,"石硤尾石硤尾邨 21 座 \nBlock 21, Shek Kip Mei Estate, ...",21/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
15,"沙田禾輋邨景和樓 \nKing Wo House, Wo Che Estate, Sha Tin",21/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
16,"屯門育康街 8 號慧景閣 \nParkview Court, 8 Yuk Hong Stre...",20/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
17,"粉嶺牽晴間 10 座 \nBlock 10, Dawning Views, Fanling",20/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022


In [15]:
# Erase unnecessary rows from the dataframe.
data_final_2 = data_final_1[data_final_1.Specified_Premises != '指明地方 \nSpecified Premises']

In [16]:
# Display header of dataframe.
data_final_2

Unnamed: 0,Specified_Premises,Dates_of_Testing
1,沙田利安邨利盛樓（不包括位於地下 A 及 B 翼的香港神託會培\n真幼稚園） \nLee S...,20/02/2022 ~ 22/02/2022
2,"東涌滿東邨滿和樓 \nMun Wo House, Mun Tung Estate, Tung...",19/02/2022 ~ 21/02/2022
3,"慈雲山慈樂邨樂誠樓 \nLok Shing House, Tsz Lok Estate, T...",19/02/2022 ~ 21/02/2022
4,"鑽石山鳳德邨紫鳳樓 \nTsz Fung House, Fung Tak Estate, D...",19/02/2022 ~ 21/02/2022
5,"藍田安田邨安麗樓 \nOn Lai House, On Tin Estate, Lam Tin",19/02/2022 ~ 21/02/2022
...,...,...
14,"石硤尾石硤尾邨 21 座 \nBlock 21, Shek Kip Mei Estate, ...",21/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
15,"沙田禾輋邨景和樓 \nKing Wo House, Wo Che Estate, Sha Tin",21/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
16,"屯門育康街 8 號慧景閣 \nParkview Court, 8 Yuk Hong Stre...",20/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022
17,"粉嶺牽晴間 10 座 \nBlock 10, Dawning Views, Fanling",20/02/2022 ~ 22/02/2022 \n23/02/2022 ~ 25/02/2022


In [17]:
# Define the variable for the date string.
today = date.today()
today_a = today.strftime("%d/%m/%Y")
print(today_a)

21/02/2022


In [18]:
# Filter dataframe by the specific name of district (Tai Po) and date (today).
data_final_Tai_Po_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tai Po")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [19]:
# Display header of dataframe.
data_final_Tai_Po_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
8,"大埔富善邨善美樓 \nShin Mei House, Fu Shin Estate, Tai Po",19/02/2022 ~ 21/02/2022 \n22/02/2022 ~ 24/02/2022
3,"大埔富善邨善群樓 \nShin Kwan House, Fu Shin Estate, Ta...",19/02/2022 ~ 21/02/2022 \n22/02/2022 ~ 24/02/2022
8,"大埔富亨邨亨耀樓 \nHeng Yiu House, Fu Heng Estate, Tai Po",19/02/2022 ~ 21/02/2022 \n22/02/2022 ~ 24/02/2022


In [20]:
# Filter dataframe by the specific name of district (Tsuen Wan) and date (today).
data_final_Tsuen_Wan_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tsuen Wan")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [21]:
# Display header of dataframe.
data_final_Tsuen_Wan_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
2,"荃灣綠楊新邨 F 座 \nBlock F, Luk Yeung Sun Chuen, Tsu...",21/02/2022 ~ 23/02/2022
