In [16]:
# Import necessary libraries.
import requests
import camelot
import pandas as pd
from datetime import date

In [2]:
# Download source latest PDF file "Arrangements for Compulsory Testing
# in respect of Buildings Resided by COVID-19 Cases with the N501Y/L452R variants
# in accordance with the Compulsory Testing Notice issued on 10 January 2022" from coronavirus.gov.hk.
url = 'https://www.coronavirus.gov.hk/pdf/CTN_Specified_premises_and_Dates_of_Testing.pdf'
r = requests.get(url, allow_redirects=True)

open('data/CTN_Specified_premises_and_Dates_of_Testing.pdf', 'wb').write(r.content)

532457

In [3]:
# Define the source PDF file path.
source_file = 'data/CTN_Specified_premises_and_Dates_of_Testing.pdf'

In [4]:
# Extract all tables from all pages in a PDF file.
data_tables = camelot.read_pdf(source_file, pages='all')

In [5]:
# Print the number or tables are bthat eing extracted.
print("Total Number of Tables Being Extracted:", data_tables.n)

Total Number of Tables Being Extracted: 12


In [6]:
# Parse all extract tables into a single list.
data_list = []

for t in range(data_tables.n):
    data = data_tables[t].df
    data_list.append(data)

In [7]:
# Convert list into Pandas dataframe.
data_final_1 = pd.concat(data_list, ignore_index=False, sort=False)

In [8]:
# Change the dataframe column headers.
data_final_1.columns = ['Specified_Premises', 'Dates_of_Testing']

In [9]:
# Display header of dataframe.
data_final_1

Unnamed: 0,Specified_Premises,Dates_of_Testing
0,指明地方 \nSpecified Premises,檢測日期 \n(日/月/年) \nDates of Testing \n(DD/MM/YYYY)
1,"香港仔東勝道 2-4 號 \n2-4 Tung Sing Road, Aberdeen",04/02/2022
2,"長沙灣元州街 450-464 號寶華閣 \nPo Wah Court, 450-464 Un...",05/02/2022
3,"香港仔嘉隆苑嘉昇閣 \nKa Sing House, Ka Lung Court, Aber...",06/02/2022
4,"東涌映灣園 1 期賞濤軒 1 座 \nTower 1, Monterey Cove, Car...",06/02/2022
...,...,...
11,"沙田博康邨博智樓 \nPok Chi House, Pok Hong Estate, Sha...",06/02/2022
12,"西營盤怡豐閣 \nYee Fung Court, Sai Ying Pun",05/02/2022 \n06/02/2022 \n09/02/2022
13,"深水埗南昌街 137 號 \n137 Nam Cheong Street, Sham Shu...",05/02/2022 \n06/02/2022 \n07/02/2022 \n10/02/2022
0,指明地方 \nSpecified Premises,檢測日期 \n(日/月/年) \nDates of Testing \n(DD/MM/YYYY)


In [10]:
# Erase unnecessary rows from the dataframe.
data_final_2 = data_final_1[data_final_1.Specified_Premises != '指明地方 \nSpecified Premises']

In [11]:
# Display header of dataframe.
data_final_2

Unnamed: 0,Specified_Premises,Dates_of_Testing
1,"香港仔東勝道 2-4 號 \n2-4 Tung Sing Road, Aberdeen",04/02/2022
2,"長沙灣元州街 450-464 號寶華閣 \nPo Wah Court, 450-464 Un...",05/02/2022
3,"香港仔嘉隆苑嘉昇閣 \nKa Sing House, Ka Lung Court, Aber...",06/02/2022
4,"東涌映灣園 1 期賞濤軒 1 座 \nTower 1, Monterey Cove, Car...",06/02/2022
5,"觀塘協和街 33 號凱滙第 3 座 \nTower 3, Grand Central, 33...",06/02/2022
...,...,...
10,"葵涌葵翠邨碧翠樓 \nBik Tsui House, Kwai Tsui Estate, K...",06/02/2022
11,"沙田博康邨博智樓 \nPok Chi House, Pok Hong Estate, Sha...",06/02/2022
12,"西營盤怡豐閣 \nYee Fung Court, Sai Ying Pun",05/02/2022 \n06/02/2022 \n09/02/2022
13,"深水埗南昌街 137 號 \n137 Nam Cheong Street, Sham Shu...",05/02/2022 \n06/02/2022 \n07/02/2022 \n10/02/2022


In [21]:
# Define the variable for the date string.
today = date.today()
today_a = today.strftime("%d/%m/%Y")
print(today_a)

04/02/2022


In [34]:
# Filter dataframe by the specific name of district (Tai Po) and date (today).
data_final_Tai_Po_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tai Po")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [35]:
# Display header of dataframe.
data_final_Tai_Po_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
10,"大埔大明里 1-27 號寶華樓 C 座 \nBlock C, Po Wah Building...",04/02/2022
3,"大埔太和邨福和樓 \nFook Wo House, Tai Wo Estate, Tai Po",04/02/2022 \n05/02/2022 \n06/02/2022 \n09/02/2022
8,"深水埗大埔道 76-84 號華僑大廈 \nWah Kiu Mansion, 76-84 Ta...",04/02/2022 \n05/02/2022 \n06/02/2022 \n09/02/2022
5,"大埔太和邨翠和樓 \nTsui Wo House, Tai Wo Estate, Tai Po",04/02/2022 \n05/02/2022 \n06/02/2022 \n09/02/2022
7,"大埔海日灣 II \n 11 座 \nTower 11, Centra Horizon, ...",04/02/2022 \n05/02/2022 \n08/02/2022
9,"大埔富善邨善景樓 \nShin King House, Fu Shin Estate, Ta...",04/02/2022 \n05/02/2022 \n08/02/2022


In [36]:
# Filter dataframe by the specific name of district (Tsuen Wan) and date (today).
data_final_Tsuen_Wan_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tsuen Wan")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [37]:
# Display header of dataframe.
data_final_Tsuen_Wan_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
9,"荃灣石圍角邨石菊樓 \nShek Kuk House, Shek Wai Kok Estat...",04/02/2022
13,"荃灣翠豐臺 3 座 \nBlock 3, Summit Terrace, Tsuen Wan",04/02/2022
11,"荃灣綠楊新邨 F 座 \nBlock F, Luk Yeung Sun Chuen, Tsu...",04/02/2022 \n07/02/2022
12,"荃灣綠楊新邨 P 座 \nBlock P, Luk Yeung Sun Chuen, Tsu...",04/02/2022 \n07/02/2022
8,"荃灣海濱花園海銀閣 \nHoi Ngan Mansion, Riviera Gardens,...",04/02/2022 \n07/02/2022
