In [1]:
# Import necessary libraries.
import requests
import camelot
import pandas as pd
from datetime import date

In [2]:
# Download source latest PDF file "Arrangements for Compulsory Testing
# in respect of Buildings Resided by COVID-19 Cases with the N501Y/L452R variants
# in accordance with the Compulsory Testing Notice issued on 10 January 2022" from coronavirus.gov.hk.
url = 'https://www.coronavirus.gov.hk/pdf/CTN_Specified_premises_and_Dates_of_Testing.pdf'
r = requests.get(url, allow_redirects=True)

open('data/CTN_Specified_premises_and_Dates_of_Testing.pdf', 'wb').write(r.content)

609441

In [3]:
# Define the source PDF file path.
source_file = 'data/CTN_Specified_premises_and_Dates_of_Testing.pdf'

In [4]:
# Extract all tables from all pages in a PDF file.
data_tables = camelot.read_pdf(source_file, pages='all')

In [5]:
# Print the number or tables are bthat eing extracted.
print("Total Number of Tables Being Extracted:", data_tables.n)

Total Number of Tables Being Extracted: 17


In [6]:
# Parse all extract tables into a single list.
data_list = []

for t in range(data_tables.n):
    data = data_tables[t].df
    data_list.append(data)

In [7]:
# Convert list into Pandas dataframe.
data_final_1 = pd.concat(data_list, ignore_index=False, sort=False)

In [8]:
# Change the dataframe column headers.
data_final_1.columns = ['Specified_Premises', 'Dates_of_Testing']

In [9]:
# Display header of dataframe.
data_final_1

Unnamed: 0,Specified_Premises,Dates_of_Testing
0,指明地方 \nSpecified Premises,檢測日期 \n(日/月/年) \nDates of Testing \n(DD/MM/YYYY)
1,"香港仔嘉隆苑嘉昇閣 \nKa Sing House, Ka Lung Court, Aber...",06/02/2022
2,"東涌映灣園 1 期賞濤軒 1 座 \nTower 1, Monterey Cove, Car...",06/02/2022
3,"觀塘協和街 33 號凱滙第 3 座 \nTower 3, Grand Central, 33...",06/02/2022
4,"深水埗西邨路 19 號富昌邨富悅樓 \nFu Yuet House, Fu Cheong E...",07/02/2022
...,...,...
2,"深水埗荔枝角道 258 號 \n258 Lai Chi Kok Road, Sham Shu...",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
3,"天水圍天華路 39 號天悅邨悅泰樓 \nYuet Tai House, Tin Yuet E...",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
4,"元朗福德街 36A 號 \n36A Fook Tak Street, Yuen Long",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
5,"屯門友愛邨愛智樓 \nOi Chi House, Yau Oi Estate, Tuen Mun",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...


In [10]:
# Erase unnecessary rows from the dataframe.
data_final_2 = data_final_1[data_final_1.Specified_Premises != '指明地方 \nSpecified Premises']

In [11]:
# Display header of dataframe.
data_final_2

Unnamed: 0,Specified_Premises,Dates_of_Testing
1,"香港仔嘉隆苑嘉昇閣 \nKa Sing House, Ka Lung Court, Aber...",06/02/2022
2,"東涌映灣園 1 期賞濤軒 1 座 \nTower 1, Monterey Cove, Car...",06/02/2022
3,"觀塘協和街 33 號凱滙第 3 座 \nTower 3, Grand Central, 33...",06/02/2022
4,"深水埗西邨路 19 號富昌邨富悅樓 \nFu Yuet House, Fu Cheong E...",07/02/2022
5,大嶼山東涌健東路 1 號映灣園二期映濤軒第 7 座 \nAlbany Cove Tower...,07/02/2022
...,...,...
2,"深水埗荔枝角道 258 號 \n258 Lai Chi Kok Road, Sham Shu...",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
3,"天水圍天華路 39 號天悅邨悅泰樓 \nYuet Tai House, Tin Yuet E...",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
4,"元朗福德街 36A 號 \n36A Fook Tak Street, Yuen Long",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
5,"屯門友愛邨愛智樓 \nOi Chi House, Yau Oi Estate, Tuen Mun",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...


In [12]:
# Define the variable for the date string.
today = date.today()
today_a = today.strftime("%d/%m/%Y")
print(today_a)

06/02/2022


In [13]:
# Filter dataframe by the specific name of district (Tai Po) and date (today).
data_final_Tai_Po_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tai Po")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [14]:
# Display header of dataframe.
data_final_Tai_Po_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
7,"大埔天賦海灣二期-溋玥 16 座 \nTower 16, Phase 2 - Provide...",06/02/2022
2,"大埔太和邨福和樓 \nFook Wo House, Tai Wo Estate, Tai Po",06/02/2022 \n09/02/2022
4,"深水埗大埔道 76-84 號華僑大廈 \nWah Kiu Mansion, 76-84 Ta...",06/02/2022 \n09/02/2022
12,"大埔太和邨翠和樓 \nTsui Wo House, Tai Wo Estate, Tai Po",06/02/2022 \n09/02/2022
5,"大埔太湖花園 10 座 \nBlock 10, Serenity Park, Tai Po",04/02/2022 ~ 06/02/2022 \n07/02/2022 ~ 09/02/2...


In [15]:
# Filter dataframe by the specific name of district (Tsuen Wan) and date (today).
data_final_Tsuen_Wan_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tsuen Wan")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [16]:
# Display header of dataframe.
data_final_Tsuen_Wan_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
6,"荃灣梨木樹邨葵樹樓 \nKwai Shue House, Lei Muk Shue Esta...",04/02/2022 ~ 06/02/2022 \n07/02/2022 ~ 09/02/2...
