#**Introduction**

A Python project that scrapes the school list from the Hong Kong Education Bureau (EDB) website and exports the data to an Excel file.


In [None]:
"""
 * File Name: HK_EDB_Master_School_List_Generator.ipynb
 * Author:    Donald Hung
 * Created:   03.May.2022
 *
 * Description:
 *  - A Python project that scrapes the school list from the Hong Kong Education Bureau (EDB) website and exports the data to an Excel file.
"""

#Step 1: Import required packages / libraries

In [None]:
#!/usr/bin/env python3

import requests
import pandas as pd, numpy as np
from pandas import ExcelWriter
from bs4 import BeautifulSoup

from google.colab import files

#Step 2: Set variable for EDB website link & create dictionary for the name of districts

In [None]:
# URL template of a district's school-list page; '{}' is filled with a district alias.
edbSchListURL = (
    'https://www.edb.gov.hk/en/student-parents/sch-info/sch-search/'
    'schlist-by-district/school-list-{}.html'
)

# District alias (as used in the page URL) -> district display name.
# The districts are scraped in the order listed here.
district18 = dict([
    ('cw', 'Central & Western'),
    ('hke', 'Hong Kong East'),
    ('i', 'Islands'),
    ('sou', 'Southern'),
    ('wch', 'Wan Chai'),
    ('kc', 'Kowloon City'),
    ('kt', 'Kwun Tong'),
    ('sk', 'Sai Kung'),
    ('ssp', 'Sham Shui Po'),
    ('wts', 'Wong Tai Sin'),
    ('ytm', 'Yau Tsim & Mong Kok'),
    ('n', 'North'),
    ('st', 'Sha Tin'),
    ('tp', 'Tai Po'),
    ('kwt', 'Kwai Chung & Tsing Yi'),
    ('tw', 'Tsuen Wan'),
    ('tm', 'Tuen Mun'),
    ('yl', 'Yuen Long'),
])

#Step 3: Prepare dataframe with header for target scraping data

In [None]:
# Column headers of the result table, in the order they are exported.
dfCols = [
    'Sch ID',
    'Sch Name (EN)',
    'Sch Name (ZH)',
    'Address (EN)',
    'Website',
    'Tel.',
    'Principal (EN)',
    'Principal (ZH)',
    'Sch Type',
    'District',
]

# Empty result table carrying only the headers; the scraping step fills it.
df = pd.DataFrame(dtype=str, columns=dfCols)

In [18]:
#@title #Step 3.1: Select target export school type
#@markdown ##Please select school type:
slt_schType = "SECONDARY" #@param ["ALL", "PRIMARY", "SECONDARY", "KINDERGARTEN", "SPECIAL", "OTHERS"] {allow-input: false}
bool_skipPrivate = True #@param {type:"boolean"}

# Echo the chosen options so they are recorded in the cell output.
print(f"Selected school type: {slt_schType}")
print(f"Skip PRIVATE SCHOOL? {bool_skipPrivate}")

Selected school type: SECONDARY
Skip PRIVATE SCHOOL? True


#Step 4: Loop over the school-list pages of the 18 districts and scrape each one

In [19]:
# Scrape the school-list page of every district and add the schools that pass
# the selected filters (slt_schType / bool_skipPrivate) to `df`.
#
# Rows are collected in a plain list and merged into `df` once at the end:
# DataFrame.append() was removed in pandas 2.0, and calling it per row copies
# the whole dataframe on every school.
schRows = []

for districtAlias in district18:
    tarURL = edbSchListURL.format(districtAlias)
    # The timeout keeps a stalled connection from hanging the notebook forever.
    htmlContent = requests.get(tarURL, timeout=30)
    if not htmlContent.ok:
        # Report the failure instead of silently treating an error page as
        # a district without any school.
        print('Skipped {} (HTTP {})'.format(tarURL, htmlContent.status_code))
        continue
    htmlContent.encoding = 'utf-8'

    # -- Step 4.1: Get all school data tables from the returned HTML content --
    soup = BeautifulSoup(htmlContent.text, 'html.parser')
    tblContent = soup.find_all('table', {'class': 'tablestyleA'})

    for schTypeTblContent in tblContent:
        district = ''
        schType = ''

        # -- Step 4.2: Break down the table rows into a useful dataset --
        # Only the 2nd row (district / school type) and the 3rd row (school
        # records) of each table are used; every other row is ignored.
        for count, trContent in enumerate(schTypeTblContent.findChildren('tr'), start=1):
            if count == 2:
                # -- Get district name & school type --
                tdContent = trContent.findChildren('td')
                district = tdContent[0].contents[1].strip()
                schType = tdContent[1].contents[1].strip()
            elif count == 3:
                # -- Keep only tables of the selected school type --
                if slt_schType != 'ALL' and slt_schType not in schType:
                    continue

                if bool_skipPrivate and 'PRIVATE' in schType:
                    continue

                # -- Data cleaning: one entry per text line, without tabs,
                #    carriage returns or non-breaking spaces --
                dataList = trContent.contents[0].contents[1].text.split('\n')
                dataList = [x.replace('\t', '').replace('\r', '').replace('\xa0', ' ').strip() for x in dataList]

                # -- Step 4.3: Create final school list --
                # The text is one flat run of lines. Each time the
                # 'School No./Location ID' line shows up, it and the 12 lines
                # before it are moved out of the current record to open the
                # next one, so that line ends up at index 12 of its record.
                schInfoList = [[]]
                for data in dataList[17:]:
                    schInfoList[-1].append(data)

                    if 'School No./Location ID' in data:
                        schInfoList.append(schInfoList[-1][-13:])
                        schInfoList[-2] = schInfoList[-2][:-13]

                # The first chunk only holds the lines preceding the first school.
                schInfoList = schInfoList[1:]

                # -- Step 4.4: Reformat the return data into row dictionaries --
                for schInfo in schInfoList:
                    # The website line is only read when the record is long enough.
                    url = ''
                    if len(schInfo) > 57:
                        url = schInfo[57].strip()

                    # -- Replace all incomplete name to complete format --
                    # The second replace of each pair undoes the double
                    # expansion that happens when the name already contained
                    # the full word (e.g. ' SCHOOL' -> ' SCHOOLOOL' -> ' SCHOOL').
                    schName = schInfo[3].strip().replace(' SEC ', ' SECONDARY ').replace(' PRI ', ' PRIMARY ')
                    schName = schName.replace(' SCH', ' SCHOOL').replace(' SCHOOLOOL', ' SCHOOL')
                    schName = schName.replace(' COLL', ' COLLEGE').replace(' COLLEGEEGE', ' COLLEGE')
                    schName = schName.replace('(SEC SECT', '(SEC SECTION').replace(' SECTIONION', ' SECTION')
                    schName = schName.replace('(SEC ', '(SECONDARY ')
                    schName = schName.replace(' SECONDARY SC', ' SECONDARY SCHOOL').replace(' SCHOOLHOOL', ' SCHOOL')
                    schName = schName.replace('CO-EDU ENG', 'CO-EDUCATIONAL ENGLISH')

                    principalName = schInfo[31].strip()
                    # The ID is the text between ':' and '/' on the school-number line.
                    schID = schInfo[12].split(':')[1].split('/')[0].strip()
                    # Only the first 8 characters after ':' are kept as the phone number.
                    schTel = schInfo[17].split(':')[1].strip()[0:8]

                    # -- Skip those schools without Principal --
                    if principalName == 'PENDING':
                        continue

                    schRows.append({
                        'Sch ID' : schID,
                        'Sch Name (EN)' : schName,
                        'Sch Name (ZH)' : schInfo[8].strip(),
                        'Address (EN)' : schInfo[6].strip(),
                        'Website' : url,
                        'Tel.' : schTel,
                        'Principal (EN)' : principalName,
                        'Principal (ZH)' : schInfo[33].strip(),
                        'Sch Type' : schType,
                        'District' : district
                    })

# Merge the collected rows into df in one step; rows already in df (e.g. from
# an earlier run of this cell) are kept in front, as df.append() used to do.
if schRows:
    newRows = pd.DataFrame(schRows, columns=dfCols)
    df = newRows if df.empty else pd.concat([df, newRows], ignore_index=True)

#Step 5: Export the result dataframe into an Excel file

In [20]:
# Name the workbook after the selected school type,
# e.g. 'EDB Master Secondary School List.xlsx' for "SECONDARY".
xlsFileName = 'EDB Master {} School List.xlsx'.format(slt_schType.capitalize())

# sheet_name is passed by keyword: pandas 2.1 deprecated every positional
# argument of to_excel() except the target file, and pandas 3.0 rejects them.
df.to_excel(xlsFileName, sheet_name='School List', index=True)

# Offer the generated workbook for download through google.colab's files helper.
files.download(xlsFileName)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>