#**Introduction**

A Python project that scrapes the school list from the Hong Kong Education Bureau (EDB) website and exports the data to an Excel file.


In [86]:
"""
 * File Name: HK_EDB_Master_School_List_Generator.ipynb
 * Author:    Donald Hung
 * Created:   03.May.2022
 * 
 * Description:
 *  - A python project to scrape school list from Hong Kong Education Bureau (EDB) website and generating the data file into Excel format.
"""

'\n * File Name: HK_EDB_Master_School_List_Generator.ipynb\n * Author:    Donald Hung\n * Created:   03.May.2022\n * \n * Description:\n *  - A python project to scrape school list from Hong Kong Education Bureau (EDB) website and generating the data file into Excel format.\n'

#Step 1: Import required packages / libraries

In [87]:
#!/usr/bin/env python3

import requests
import pandas as pd, numpy as np
from pandas import ExcelWriter
from bs4 import BeautifulSoup
from datetime import datetime

from google.colab import files

#Step 2: Set variable for EDB website link & create dictionary for the name of districts

In [88]:
# URL template of a per-district school list page; `{}` is filled with a key of `district18`.
edbSchListURL = (
  'https://www.edb.gov.hk/en/student-parents/sch-info/sch-search/'
  'schlist-by-district/school-list-{}.html'
)

# District alias (the part substituted into the URL) -> district display name.
# The scraper walks the keys in the order they are listed here.
district18 = {
  'cw': 'Central & Western',
  'hke': 'Hong Kong East',
  'i': 'Islands',
  'sou': 'Southern',
  'wch': 'Wan Chai',
  'kc': 'Kowloon City',
  'kt': 'Kwun Tong',
  'sk': 'Sai Kung',
  'ssp': 'Sham Shui Po',
  'wts': 'Wong Tai Sin',
  'ytm': 'Yau Tsim & Mong Kok',
  'n': 'North',
  'st': 'Sha Tin',
  'tp': 'Tai Po',
  'kwt': 'Kwai Chung & Tsing Yi',
  'tw': 'Tsuen Wan',
  'tm': 'Tuen Mun',
  'yl': 'Yuen Long',
}

#Step 3: Select target school type for export

In [92]:
#@markdown ##Please select school type:
slt_schType = "PRIMARY" #@param ["ALL", "PRIMARY", "SECONDARY", "KINDERGARTEN", "SPECIAL", "OTHERS"] {allow-input: false}
bool_skipPrivate = True #@param {type:"boolean"}

# Echo the chosen options so they appear in the cell output.
print("Selected school type: {}".format(slt_schType))
print("Skip PRIVATE SCHOOL? {}".format(bool_skipPrivate))

Selected school type: PRIMARY
Skip PRIVATE SCHOOL? True


#Step 4: Scrape the school list page of each of the 18 districts

In [107]:
# When True, the scraper prints every URL it fetches and each school name
# whose abbreviations were expanded (original name, expanded name, separator).
boolDebug = True
import sys, re

# -- Function for data cleaning -- 
def dataCleaning(dataList):
  """Return a new list with whitespace noise removed from every string in `dataList`.

  Each string has double newlines collapsed to one, tabs and carriage returns
  dropped, non-breaking spaces turned into plain spaces, and is then stripped.
  The substitutions are applied in that order.
  """
  substitutions = (
    ('\n\n', '\n'),
    ('\t', ''),
    ('\r', ''),
    ('\xa0', ' '),
  )
  cleanedList = []
  for rawText in dataList:
    text = rawText
    for oldText, newText in substitutions:
      text = text.replace(oldText, newText)
    cleanedList.append(text.strip())
  return cleanedList

# -- Helper: expand abbreviations in a school name -- 
def _expandSchoolName(rawName):
  """Spell out the abbreviations found in an EDB school name.

  For example 'BONHAM RD GOVERNMENT PRI SCH' becomes
  'BONHAM RD GOVERNMENT PRIMARY SCHOOL'. The input is stripped first and the
  substitutions are applied in the order written below.
  """
  schName = rawName.strip()
  schName = re.sub(r'([ (])KG(?: |)', r'\1KINDERGARTEN ', schName)
  schName = re.sub(r'([ (])PR(?: |I )', r'\1PRIMARY ', schName)
  schName = re.sub(r'([ (])SE(?: |C )', r'\1SECONDARY ', schName)
  schName = schName.replace('  ', ' ')
  schName = schName.replace(' DIST ', ' DISTRICT ')
  schName = schName.replace(' MEM ', ' MEMORIAL ').replace(' ASSN ', ' ASSOCIATION ')
  schName = schName.replace(' FDN ', ' FOUNDATION ').replace(' GOVT ', ' GOVERNMENT ')
  schName = schName.replace(' SCH ', ' SCHOOL ').replace(' COLL ', ' COLLEGE ')
  # Abbreviations at the very end of the name have no trailing space to match on.
  schName = re.sub(r' (?:S|SC|SCH)$', ' SCHOOL', schName)
  schName = re.sub(r' (?:CT|CTR)$', ' CENTER', schName)
  schName = re.sub(r' (?:CO|COL|COLL)$', ' COLLEGE', schName)
  schName = schName.replace(' ENG ', ' ENGLISH ').replace(' CO-EDU ', ' CO-EDUCATIONAL ')
  schName = re.sub(r' SECT([)])$', r' SECTION\1', schName)
  return schName


# -- Helper: pick the labelled fields out of one school record -- 
def _extractSchoolDetails(schInfo):
  """Return the labelled fields of one school as a dict keyed by output column name.

  `schInfo` is the list of cleaned cell texts of a single school. Fields whose
  label is not present stay as empty strings.
  """
  details = {
    'Sch ID' : '',
    'Website' : '',
    'Tel.' : '',
    'Principal (EN)' : '',
    'Principal (ZH)' : ''
  }

  for data in schInfo:
    if 'Head of School' in data:
      # English name on the first line, Chinese name (if any) on the second.
      principalNameInfo = data.split(':')[1].strip().split('\n')
      details['Principal (EN)'] = principalNameInfo[0].strip()
      details['Principal (ZH)'] = principalNameInfo[1] if len(principalNameInfo) > 1 else ''
    elif 'School No./Location ID' in data:
      # Keep only the part before the '/'.
      details['Sch ID'] = data.split(':')[1].split('/')[0].strip()
    elif 'Tel.' in data:
      # Keep only the first 8 characters of the value.
      details['Tel.'] = data.split(':')[1].strip()[0:8]
    elif 'Website' in data:
      # Split on ':\n' rather than ':' so the colon inside the URL survives.
      details['Website'] = data.split(':\n')[1].strip()

  return details


# -- Function for getting scraping school list from EDB website -- 
def getScrapingSchoolList(slt_schType, bool_bkipPrivate):
  """Scrape the EDB school list page of every district in `district18`.

  Args:
    slt_schType: School type to keep. 'ALL' keeps every table; otherwise a
      table is kept only when this text occurs in its school-type label.
    bool_bkipPrivate: When truthy, tables whose school-type label contains
      'PRIVATE' are skipped. The misspelt name is kept so that existing
      keyword callers keep working; the body now honours this argument
      instead of silently reading the notebook-level `bool_skipPrivate`.

  Returns:
    pandas.DataFrame with one row per school and the columns listed in
    `dfCols`. Schools whose English principal name is 'PENDING' or '-' are
    left out.

  Raises:
    requests.exceptions.RequestException: if a page cannot be fetched
      (including when the server does not answer within the timeout).
  """
  # -- Prepare header for target scraping data -- 
  dfCols = ['Sch ID', 'Sch Name (EN)', 'Sch Name (ZH)', 'Address (EN)', 'Address (ZH)', 'Website', 'Tel.', 'Principal (EN)', 'Principal (ZH)', 'Sch Type', 'District']
  # Rows are collected here and turned into a DataFrame once at the end;
  # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
  records = []
  # Seconds to wait for the server so a stalled connection cannot hang the notebook.
  requestTimeout = 30

  for districtAlias in district18:
    tarURL = edbSchListURL.format(districtAlias)
    if boolDebug:
      print('Scrapping data from: ' + tarURL)
    htmlContent = requests.get(tarURL, timeout=requestTimeout)
    htmlContent.encoding = 'utf-8'

    # -- Step 4.1: Get all school data tables from return HTML content
    soup = BeautifulSoup(htmlContent.text, 'html.parser')
    tblContent = soup.find_all('table', {'class': 'tablestyleA'})

    for schTypeTblContent in tblContent:
      # Only the 2nd <tr> (district / school type) and the 3rd <tr> (school
      # records) of each table are used, so there is no need to walk the rest.
      trList = schTypeTblContent.findChildren('tr')
      if len(trList) < 3:
        continue

      # -- Get district name & school type -- 
      tdContent = trList[1].findChildren('td')
      district = tdContent[0].contents[1].strip()
      schType = tdContent[1].contents[1].strip()

      # -- List out those schools with corresponding school type -- 
      if slt_schType != 'ALL' and slt_schType not in schType:
        continue

      if bool_bkipPrivate and 'PRIVATE' in schType:
        continue

      # -- Step 4.2: Break down all single row record into useful dataset -- 
      trContent = trList[2]
      schInfoList = []
      dataList = []

      # -- Skip 1st row of data as it is table header -- 
      dataRow = trContent.contents[0].contents[1].find_all('tr', recursive=False)[1:]

      for row in dataRow:
        dataCol = row.find_all('td', recursive=False)
        recIdx = dataCol[0].text.split(' ')[0]

        # -- Check whether it is a new school record, if yes, store the previous record in list first -- 
        if recIdx.isnumeric():
          if len(dataList) > 0:
            schInfoList.append(dataCleaning(dataList))

          dataList = [recIdx]
        else:
          dataList.append(dataCol[0].text.strip())

        # -- Append column data into data list -- 
        for cols in dataCol[1:]:
          for col in cols.contents[1].contents:
            if col != '\n':
              dataset = col.find_all('td', recursive=True)
              for data in dataset:
                dataList.append(data.text.strip())

      # -- Append the last record to the list -- 
      if len(dataList) > 0:
        schInfoList.append(dataCleaning(dataList))

      # -- Step 4.4: Reformat the return data into records -- 
      for schInfo in schInfoList:
        # -- Replace all incomplete school name to complete format -- 
        schName = _expandSchoolName(schInfo[1])

        if boolDebug:
          orgSchName = schInfo[1].strip()
          if orgSchName != schName:
            print(orgSchName)
            print(schName)
            print('='*50)

        # -- Reformating useful information -- 
        details = _extractSchoolDetails(schInfo)

        # -- Skip those schools without Principal -- 
        if details['Principal (EN)'] in ['PENDING', '-']:
          continue

        records.append({
            'Sch ID' : details['Sch ID'],
            'Sch Name (EN)' : schName.strip(),
            'Sch Name (ZH)' : schInfo[4].strip(),
            'Address (EN)' : schInfo[3].strip(),
            'Address (ZH)' : schInfo[6].strip(),
            'Website' : details['Website'],
            'Tel.' : details['Tel.'],
            'Principal (EN)' : details['Principal (EN)'],
            'Principal (ZH)' : details['Principal (ZH)'],
            'Sch Type' : schType,
            'District' : district
        })

  # `columns=` fixes the column order and keeps the header even when nothing was scraped.
  return pd.DataFrame(records, columns=dfCols, dtype=str)

# Run the scraper with the options chosen in Step 3; `df` is exported in Step 5.
df = getScrapingSchoolList(slt_schType, bool_skipPrivate)

Scrapping data from: https://www.edb.gov.hk/en/student-parents/sch-info/sch-search/schlist-by-district/school-list-cw.html
BONHAM RD GOVERNMENT PRI SCH
BONHAM RD GOVERNMENT PRIMARY SCHOOL
CENTRAL & WESTERN DIST ST ANTHONY'S SCH
CENTRAL & WESTERN DISTRICT ST ANTHONY'S SCHOOL
KING'S COLL OLD BOYS' ASSN PRI SCH
KING'S COLLEGE OLD BOYS' ASSOCIATION PRIMARY SCHOOL
KING'S COLL OLD BOYS' ASSN PRI SCH NO. 2
KING'S COLLEGE OLD BOYS' ASSOCIATION PRIMARY SCHOOL NO. 2
SKH LUI MING CHOI MEMORIAL PRI SCH
SKH LUI MING CHOI MEMORIAL PRIMARY SCHOOL
Scrapping data from: https://www.edb.gov.hk/en/student-parents/sch-info/sch-search/schlist-by-district/school-list-hke.html
ALDRICH BAY GOVERNMENT PRI SCH
ALDRICH BAY GOVERNMENT PRIMARY SCHOOL
BUDDHIST CHUNG WAH KORNHILL PRI SCH
BUDDHIST CHUNG WAH KORNHILL PRIMARY SCHOOL
ENDEAVR LEUNG LEE SAU YU MEM PRI SCH
ENDEAVR LEUNG LEE SAU YU MEMORIAL PRIMARY SCHOOL
HKCWC HIOE TJO YOENG PRI SCH
HKCWC HIOE TJO YOENG PRIMARY SCHOOL
S.K.H. CHAI WAN ST. MICHAEL'S PRI SCH
S

#Step 5: Export the result dataframe into an Excel file

In [108]:
# print(df.head(0))
# %pip install xlsxwriter
# import xlsxwriter

# File name carries the selected school type and a timestamp of the export.
xlsFileName = 'HK_EDB_Master_{}_School_List_{}.xlsx'.format(slt_schType.capitalize(), datetime.now().strftime("%Y%m%d_%H%M%S"))
# `sheet_name` is passed by keyword: pandas 2.x deprecates positional arguments
# after the file name in `to_excel`, and pandas 3.0 makes them keyword-only.
# (Optional: add engine='xlsxwriter' after installing that package.)
df.to_excel(xlsFileName, sheet_name='School List', index=True)
# Hand the generated file to Colab for download.
files.download(xlsFileName)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>