This notebook was created by Donna Faith Go.

In [1]:
import sys
!{sys.executable} -m pip install -qq -r requirements.txt

In [2]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# beaultifulsoup
import requests
from bs4 import BeautifulSoup

# selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time

## List of Indices

In [3]:
# setting up driver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# access website
url = "https://stockanalysis.com/list/hong-kong-stock-exchange/?__v=1771418419981"
driver.get(url)

# get table
wait = WebDriverWait(driver, 10)
table = wait.until(
    EC.presence_of_element_located(
        (By.CSS_SELECTOR, ".symbol-table.svelte-1ro3niy")
    )
)

In [4]:
# get companies and their stock numbers
companies = []
page_num = 1

while True:
    try:
        print(f"Scraping page {page_num}...")
        
        # wait for table to load
        wait.until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, ".symbol-table.svelte-1ro3niy")
            )
        )
        
        # beautiful soup for current page
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        rows = soup.find('tbody').find_all('tr')
        
        # data from current page
        for row in rows:
            cols = row.find_all('td')
            if len(cols) >= 2:
                company = {
                    "No.": cols[0].get_text(strip=True),
                    "Symbol": cols[1].get_text(strip=True),
                    "Company Name": cols[2].get_text(strip=True),
                    "Market Cap": cols[3].get_text(strip=True),
                    "Stock Price": cols[4].get_text(strip=True),
                    "% Change": cols[5].get_text(strip=True),
                    "Revenue": cols[6].get_text(strip=True),
                }
                companies.append(company)
        
        # next button
        try:
            next_button = driver.find_element(By.XPATH, "//a[.//span[contains(text(),'Next')]]")            
            if "disabled" in next_button.get_attribute("class") or not next_button.is_enabled():
                print("Reached the last page. Stopping.")
                break
                
            # click button
            driver.execute_script("arguments[0].click();", next_button)            
            time.sleep(2)
            
            page_num += 1
            
        except NoSuchElementException:
            print("No more pages available. Scraping complete.")
            break
            
    except Exception as e:
        print(f"Error on page {page_num}: {e}")
        break

driver.quit()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Error on page 6: name 'NoSuchElementException' is not defined


The code above takes around 3-5 minutes to run.

In [5]:
# Save the data
companies_df = pd.DataFrame(companies)
companies_df.to_pickle('HKEX companies list.pkl')
print(f"Scraped {len(companies)} companies from {page_num} pages")

Scraped 2712 companies from 6 pages


In [6]:
filepath = 'HKEX companies list.pkl'
with open(filepath, 'rb') as f:
    companies_df = pickle.load(f)
companies_df.head()

Unnamed: 0,No.,Symbol,Company Name,Market Cap,Stock Price,% Change,Revenue
0,1,700,Tencent Holdings Limited,4.85T,533.0,0.19%,797.65B
1,2,1398,Industrial and Commercial Bank of China Limited,2.72T,6.4,-0.16%,737.77B
2,3,9988,Alibaba Group Holding Limited,2.72T,154.7,-0.45%,1.11T
3,4,1288,Agricultural Bank of China Limited,2.52T,5.39,-0.37%,649.55B
4,5,4333,"Cisco Systems, Inc.",2.37T,580.0,45.00%,460.51B
