In [1]:
# Import necessary libraries.
import requests
import camelot
import pandas as pd
from datetime import date

In [2]:
# Download source latest PDF file "Arrangements for Compulsory Testing
# in respect of Buildings Resided by COVID-19 Cases with the N501Y/L452R variants
# in accordance with the Compulsory Testing Notice issued on 10 January 2022" from coronavirus.gov.hk.
url = 'https://www.coronavirus.gov.hk/pdf/CTN_Specified_premises_and_Dates_of_Testing.pdf'
r = requests.get(url, allow_redirects=True)

open('data/CTN_Specified_premises_and_Dates_of_Testing.pdf', 'wb').write(r.content)

663875

In [3]:
# Define the source PDF file path.
source_file = 'data/CTN_Specified_premises_and_Dates_of_Testing.pdf'

In [4]:
# Extract all tables from all pages in a PDF file.
data_tables = camelot.read_pdf(source_file, pages='all')

In [5]:
# Print the number or tables are bthat eing extracted.
print("Total Number of Tables Being Extracted:", data_tables.n)

Total Number of Tables Being Extracted: 21


In [6]:
# Parse all extract tables into a single list.
data_list = []

for t in range(data_tables.n):
    data = data_tables[t].df
    data_list.append(data)

In [7]:
# Convert list into Pandas dataframe.
data_final_1 = pd.concat(data_list, ignore_index=False, sort=False)

In [8]:
# Change the dataframe column headers.
data_final_1.columns = ['Specified_Premises', 'Dates_of_Testing']

In [9]:
# Display header of dataframe.
data_final_1

Unnamed: 0,Specified_Premises,Dates_of_Testing
0,指明地方 \nSpecified Premises,檢測日期 \n(日/月/年) \nDates of Testing \n(DD/MM/YYYY)
1,"筲箕灣明華大廈 1 座 \nTower 1, Ming Wah Dai Ha, Shau K...",08/02/2022
2,"荃灣名逸居 1 座 \nBlock 1, New Haven, Tsuen Wan",08/02/2022
3,"長沙灣長沙灣邨長欣樓 \nCheung Yan House, Cheung Sha Wan ...",08/02/2022
4,"大嶼山嶼南道貝澳老圍村 20C 號 \n20C Pui O Lo Wai Tsuen, So...",08/02/2022
...,...,...
1,"荃灣川龍街 66 號華景樓 \nWah King House, 66 Chuen Lung ...",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
2,"荃灣荃威花園 J 座 \nBlock J, Allway Gardens, Tsuen Wan",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
3,"荃灣海濱花園 22 座海裕閣 \nBlock 22, Hoi Yue Mansion, Ri...",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
4,"荃灣海灣花園 1 座 \nBlock 1, Waterside Plaza, Tsuen Wan",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022


In [10]:
# Erase unnecessary rows from the dataframe.
data_final_2 = data_final_1[data_final_1.Specified_Premises != '指明地方 \nSpecified Premises']

In [11]:
# Display header of dataframe.
data_final_2

Unnamed: 0,Specified_Premises,Dates_of_Testing
1,"筲箕灣明華大廈 1 座 \nTower 1, Ming Wah Dai Ha, Shau K...",08/02/2022
2,"荃灣名逸居 1 座 \nBlock 1, New Haven, Tsuen Wan",08/02/2022
3,"長沙灣長沙灣邨長欣樓 \nCheung Yan House, Cheung Sha Wan ...",08/02/2022
4,"大嶼山嶼南道貝澳老圍村 20C 號 \n20C Pui O Lo Wai Tsuen, So...",08/02/2022
5,香港仔香港仔中心港昌閣(G 座) \nKong Cheong Court (Block G)...,08/02/2022
...,...,...
1,"荃灣川龍街 66 號華景樓 \nWah King House, 66 Chuen Lung ...",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
2,"荃灣荃威花園 J 座 \nBlock J, Allway Gardens, Tsuen Wan",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
3,"荃灣海濱花園 22 座海裕閣 \nBlock 22, Hoi Yue Mansion, Ri...",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022
4,"荃灣海灣花園 1 座 \nBlock 1, Waterside Plaza, Tsuen Wan",07/02/2022 ~ 09/02/2022 \n10/02/2022 ~ 12/02/2022


In [12]:
# Define the variable for the date string.
today = date.today()
today_a = today.strftime("%d/%m/%Y")
print(today_a)

08/02/2022


In [13]:
# Filter dataframe by the specific name of district (Tai Po) and date (today).
data_final_Tai_Po_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tai Po")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [14]:
# Display header of dataframe.
data_final_Tai_Po_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
3,"大埔海日灣 II 11 座 \nTower 11, Centra Horizon, Tai Po",08/02/2022
4,"大埔富善邨善景樓 \nShin King House, Fu Shin Estate, Ta...",08/02/2022
12,"大埔怡雅苑怡達閣 \nYee Dat House, Yee Nga Court, Tai Po",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
11,"大埔安埔路 15 號怡雅苑怡厚閣 \nYee Hau House, Yee Nga Cour...",06/02/2022 ~ 08/02/2022 \n09/02/2022 ~ 11/02/2...
4,"大埔太湖山莊 5 座 \nBlock 5, Serenity Villa, Tai Po",06/02/2022 ~ 08/02/2022 \n09/02/2022 ~ 11/02/2...
6,"大埔太和邨麗和樓 \nLai Wo House, Tai Wo Estate, Tai Po",06/02/2022 ~ 08/02/2022 \n09/02/2022 ~ 11/02/2...


In [15]:
# Filter dataframe by the specific name of district (Tsuen Wan) and date (today).
data_final_Tsuen_Wan_Today = data_final_2[(data_final_2['Specified_Premises'].str.contains("Tsuen Wan")) & 
                                        data_final_2['Dates_of_Testing'].str.contains(str(today_a))]

In [16]:
# Display header of dataframe.
data_final_Tsuen_Wan_Today

Unnamed: 0,Specified_Premises,Dates_of_Testing
2,"荃灣名逸居 1 座 \nBlock 1, New Haven, Tsuen Wan",08/02/2022
2,"荃灣愉景新城 7 座 \nBlock 7, Discovery Park, Tsuen Wan",08/02/2022
3,"荃灣麗城花園 3 期 7 座 \nBlock 7, Phase 3, Belvedere G...",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
2,"荃灣荃威花園 N 座 \nBlock N, Allway Gardens, Tsuen Wan",05/02/2022 ~ 07/02/2022 \n08/02/2022 ~ 10/02/2...
