## Import Relevant Modules

In [None]:
import pandas as pd
import numpy as np
import os
import json
import re
import time
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options

## Frameworks 

For web scraping, we will mainly use ```selenium``` and ```beautifulsoup4``` to scrape the start-up information from the [website](https://www.cityu.edu.hk/hktech300/start-ups/all-start-ups)

Then, we will save the information into a single ```Comma-separated values (CSV) file```

## Setting up Selenium Chrome Webdriver

In [2]:
# Setting up the URL constants first, so they are defined even if the browser fails to launch
# Listing pages are addressed by appending a zero-based page index to this URL
BASE_PAGE_URL = "https://www.cityu.edu.hk/hktech300/start-ups/all-start-ups?page="
# Number of listing pages the full scrape iterates over
MAX_PAGES = 40
# Prefix added to the site-relative profile links found on the listing pages
BASE_DOMAIN = "https://www.cityu.edu.hk"

# Launch and immediately close Chrome to confirm the WebDriver works on this machine.
# To run without a visible browser window, build an Options() object, call
# add_argument("--headless") on it and pass it as webdriver.Chrome(options=options).
driver = webdriver.Chrome()
driver.quit()

## Scraping data for a Single Page

The following section prototypes the scraping logic on a single listing page and its individual start-up profile pages, before the same logic is used to scrape all of the start-up pages

In [None]:
# Fetch the first listing page (page index 0) in a short-lived browser session
driver = webdriver.Chrome()
try:
    driver.get(BASE_PAGE_URL + "0")

    # Fixed pause to give the page time to load before its HTML is read
    time.sleep(5)

    # Get the page source
    page_source = driver.page_source
finally:
    # Always close the browser, even when navigation or the read above fails
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')

# Locate the <div class="item-list"> container on the listing page
content_container = soup.find('div', class_='item-list')

In [24]:
start_ups_list = []

# Select all <li> elements under the target <ul>
li_elements = soup.select('ul.list-unstyled.row > li')

# Loop through each startup block
for li in tqdm(li_elements, leave=False):
    card = li.select_one("div.card.fund.team")
    if not card:
        continue

    # The name link is the first <a> that has both an href and visible text.
    # Anchors without an href are skipped so the URL concatenation below
    # can never be attempted with None.
    for a in card.find_all("a", href=True):
        name = a.text.strip()
        if not name:
            continue
        profile_url = BASE_DOMAIN + a["href"]
        print(f"Name: {name}, Profile URL: {profile_url}")
        start_ups_list.append({
            "name": name,
            "profile_url": profile_url
        })
        break  # Found the name-containing <a>

  0%|          | 0/18 [00:00<?, ?it/s]

Name: 3D printing low-carbon ecological concrete, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/3d-printing-low-carbon-ecological-concrete
Name: 5th Immersiv, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/5th-immersiv
Name: 6J Biotechnology (Hong Kong) Limited, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/angel-fund-teams/6j-biotechnology
Name: A-TCMS (Autonomous Tire Condition Monitoring System), Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/a-tcms
Name: A.I. Ambassador (Virtual Receptionist) kiosk solution, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/ai-ambassador
Name: A3-ml Ltd, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/a3-ml
Name: ABCDYi, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/abcdyi
Name: Accessible Vision, Profile URL: https://www.cityu.edu.hk/hktech300/start-ups/seed-fund-teams/accessibl

In [24]:
# Example Use Case: fetch one profile page to try the extraction logic on
driver = webdriver.Chrome()
try:
    driver.get('https://www.cityu.edu.hk/hktech300/start-ups/angel-fund-teams/starkid-limited')
    time.sleep(2)  # Fixed pause to give the page time to load
    profile_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always close the browser, even when the page fails to load
    driver.quit()

In [None]:
def extract_profile_details(profile_soup):
    """
    Pull the company website and contact email out of a parsed profile page.

    Each value is located through the first <div> whose ``data-block-plugin-id``
    attribute ends with a known field suffix; the first <a> carrying an href
    inside that <div> supplies the value.

    Args:
        profile_soup (BeautifulSoup): The BeautifulSoup-parsed HTML content of a startup's profile page.

    Returns:
        tuple: A tuple containing two strings:
            - website (str): The stripped href of the website link, or
              "No Info Found" when the block or link is missing or the href is empty.
            - email (str): The stripped text of the email link, or
              "No Info Found" when the block or link is missing or the text is empty.
    """
    not_found = "No Info Found"

    def first_link(suffixes):
        # First <a href=...> inside the first <div> whose plugin id ends with
        # one of `suffixes`; None when either the <div> or the link is absent.
        block = profile_soup.find(
            'div',
            attrs={"data-block-plugin-id": lambda plugin_id: plugin_id and plugin_id.endswith(suffixes)}
        )
        return block.find('a', href=True) if block else None

    # Company website: stored under either :field_client or :field_company_website
    website_link = first_link((":field_client", ":field_company_website"))
    website = website_link['href'].strip() if website_link else None

    # Company email: the link text under :field_team_members_email
    email_link = first_link(":field_team_members_email")
    email = email_link.text.strip() if email_link else None

    return website or not_found, email or not_found



# Try the extractor on the example profile page and show both fields
details = extract_profile_details(profile_soup)
website, email = details
print(f"Website: {website}")
print(f"Email: {email}")

Website: https://www.starkid.com.hk/
Email: admin@starkid.com.hk


In [37]:
all_data = []

# Entire code to extract all profiles in one page.
# NOTE: this reuses `soup`, the listing page parsed in an earlier cell.
li_elements = soup.select("ul.list-unstyled.row > li")

# Setting up the Chrome WebDriver
driver = webdriver.Chrome()
try:
    for li in tqdm(li_elements, leave=False, desc="Scraping profiles"):
        card = li.select_one("div.card.fund.team")
        if not card:
            continue

        # Find the correct <a> with company name and link: the first anchor that
        # has both an href and visible text. Anchors without an href are skipped
        # so the URL concatenation is never attempted with None.
        name, profile_url = None, None
        for a in card.find_all("a", href=True):
            if a.text.strip():
                name = a.text.strip()
                profile_url = BASE_DOMAIN + a["href"]
                break

        if not name or not profile_url:
            continue  # Card without a usable name link

        # Load the company profile page
        driver.get(profile_url)
        time.sleep(2)  # Fixed pause to give the page time to load
        profile_soup = BeautifulSoup(driver.page_source, "html.parser")

        # Extract website and email from profile
        website, email = extract_profile_details(profile_soup)

        all_data.append({
            "Company Name": name,
            "CityU URL": profile_url,
            "Company Website": website,
            "Email Address": email
        })
finally:
    # Quit the driver even if a page load or parse step raises part-way through
    driver.quit()

Scraping profiles:   0%|          | 0/18 [00:00<?, ?it/s]

In [38]:
# Tabulate the single-page results; the bare last expression renders the frame
df = pd.DataFrame(data=all_data)
df

Unnamed: 0,Company Name,CityU URL,Company Website,Email Address
0,Vision AI Engineering Limited,https://www.cityu.edu.hk/hktech300/start-ups/s...,No Info Found,info@visionaiengineering.com
1,Vision Carbon Limited,https://www.cityu.edu.hk/hktech300/start-ups/a...,https://www.visioncarbon.net/,kyrie.luo@visioncarbon.net
2,Vismed Training Limited,https://www.cityu.edu.hk/hktech300/start-ups/v...,https://www.vismed.tech/,No Info Found
3,Vismed Training Limited,https://www.cityu.edu.hk/hktech300/start-ups/s...,No Info Found,vismed.hk@gmail.com
4,Vista Innotech Ltd,https://www.cityu.edu.hk/hktech300/start-ups/v...,https://www.vit.com.hk/,info@vit.com.hk
5,Vivablee,https://www.cityu.edu.hk/hktech300/start-ups/s...,https://www.vivablee.com/,info@vivablee.com
6,Volar Air Mobility,https://www.cityu.edu.hk/hktech300/start-ups/s...,No Info Found,No Info Found
7,VR Community Tour,https://www.cityu.edu.hk/hktech300/start-ups/s...,https://kaifongtour.com/,info@kaifongtour.com
8,Vuzec Limited,https://www.cityu.edu.hk/hktech300/start-ups/a...,https://vuzec.com/,contact@vuzec.com
9,Wanmeng Tech Limited,https://www.cityu.edu.hk/hktech300/start-ups/w...,No Info Found,No Info Found


## Scraping all information

Using the logic derived from extracting one single page, we will now attempt to scrape all the start-up information sequentially

### Helper Function to extract start-up information

In [None]:
def extract_profile_details(profile_soup):
    """
    Read the company website and email address from a startup's profile page.

    Args:
        profile_soup (BeautifulSoup): The BeautifulSoup-parsed HTML content of a startup's profile page.

    Returns:
        tuple: A tuple containing two strings:
            - website (str): href of the first link in the website block, stripped of
              surrounding whitespace; "No Info Found" if nothing usable is present.
            - email (str): text of the first link in the email block, stripped of
              surrounding whitespace; "No Info Found" if nothing usable is present.
    """
    plugin_attr = "data-block-plugin-id"
    fallback = "No Info Found"

    def is_website_field(value):
        # Plugin ids ending with :field_client or :field_company_website hold the company website
        return value and (
            value.endswith(":field_client") or value.endswith(":field_company_website")
        )

    def is_email_field(value):
        # Plugin ids ending with :field_team_members_email hold the company email
        return value and value.endswith(":field_team_members_email")

    # Start from the fallback and overwrite only when a non-empty value is found
    website = fallback
    website_div = profile_soup.find('div', attrs={plugin_attr: is_website_field})
    if website_div:
        website_anchor = website_div.find('a', href=True)
        if website_anchor:
            website = website_anchor['href'].strip() or fallback

    email = fallback
    email_div = profile_soup.find('div', attrs={plugin_attr: is_email_field})
    if email_div:
        email_anchor = email_div.find('a', href=True)
        if email_anchor:
            email = email_anchor.text.strip() or fallback

    return website, email

In [27]:
# Accumulator for the scraped records (one dict per startup)
all_startup_data = list()

In [30]:
# NOTE: records are appended to `all_startup_data`, which is initialised in a separate
# cell; re-running this cell without re-running that one appends to the existing list.

# Initialise the WebDriver
driver = webdriver.Chrome()
try:
    # Loop through all pages (listing pages are addressed by a zero-based index)
    for page in tqdm(range(MAX_PAGES), desc="Scraping Startup Information from CityU"):
        page_url = f"{BASE_PAGE_URL}{page}"
        driver.get(page_url)
        time.sleep(2)  # Wait for the page to load

        # Parse the listing before navigating away to the individual profiles
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        li_elements = soup.select("ul.list-unstyled.row > li")

        for li in tqdm(li_elements, desc=f"Page {page + 1}"):
            card = li.select_one("div.card.fund.team")
            if not card:
                continue

            # Extract company name + profile URL from the first anchor that has both
            # an href and visible text. Anchors without an href are skipped so the
            # URL concatenation is never attempted with None.
            name, profile_url = None, None
            for a in card.find_all("a", href=True):
                if a.text.strip():
                    name = a.text.strip()
                    profile_url = BASE_DOMAIN + a["href"]
                    break

            if not name or not profile_url:
                continue  # Card without a usable name link

            # Load the company profile page
            driver.get(profile_url)
            time.sleep(2)  # Wait for the page to load
            profile_soup = BeautifulSoup(driver.page_source, "html.parser")

            # Extract website and email from profile
            website, email = extract_profile_details(profile_soup)

            all_startup_data.append({
                "Company Name": name,
                "CityU URL": profile_url,
                "Company Website": website,
                "Email": email
            })
finally:
    # Close the WebDriver even if a page load or parse step raises part-way through
    driver.quit()

Scraping Startup Information from CityU:   0%|          | 0/40 [00:00<?, ?it/s]

Page 1:   0%|          | 0/18 [00:00<?, ?it/s]

Page 2:   0%|          | 0/18 [00:00<?, ?it/s]

Page 3:   0%|          | 0/18 [00:00<?, ?it/s]

Page 4:   0%|          | 0/18 [00:00<?, ?it/s]

Page 5:   0%|          | 0/18 [00:00<?, ?it/s]

Page 6:   0%|          | 0/18 [00:00<?, ?it/s]

Page 7:   0%|          | 0/18 [00:00<?, ?it/s]

Page 8:   0%|          | 0/18 [00:00<?, ?it/s]

Page 9:   0%|          | 0/18 [00:00<?, ?it/s]

Page 10:   0%|          | 0/18 [00:00<?, ?it/s]

Page 11:   0%|          | 0/18 [00:00<?, ?it/s]

Page 12:   0%|          | 0/18 [00:00<?, ?it/s]

Page 13:   0%|          | 0/18 [00:00<?, ?it/s]

Page 14:   0%|          | 0/18 [00:00<?, ?it/s]

Page 15:   0%|          | 0/18 [00:00<?, ?it/s]

Page 16:   0%|          | 0/18 [00:00<?, ?it/s]

Page 17:   0%|          | 0/18 [00:00<?, ?it/s]

Page 18:   0%|          | 0/18 [00:00<?, ?it/s]

Page 19:   0%|          | 0/18 [00:00<?, ?it/s]

Page 20:   0%|          | 0/18 [00:00<?, ?it/s]

Page 21:   0%|          | 0/18 [00:00<?, ?it/s]

Page 22:   0%|          | 0/18 [00:00<?, ?it/s]

Page 23:   0%|          | 0/18 [00:00<?, ?it/s]

Page 24:   0%|          | 0/18 [00:00<?, ?it/s]

Page 25:   0%|          | 0/18 [00:00<?, ?it/s]

Page 26:   0%|          | 0/18 [00:00<?, ?it/s]

Page 27:   0%|          | 0/18 [00:00<?, ?it/s]

Page 28:   0%|          | 0/18 [00:00<?, ?it/s]

Page 29:   0%|          | 0/18 [00:00<?, ?it/s]

Page 30:   0%|          | 0/18 [00:00<?, ?it/s]

Page 31:   0%|          | 0/18 [00:00<?, ?it/s]

Page 32:   0%|          | 0/18 [00:00<?, ?it/s]

Page 33:   0%|          | 0/18 [00:00<?, ?it/s]

Page 34:   0%|          | 0/18 [00:00<?, ?it/s]

Page 35:   0%|          | 0/18 [00:00<?, ?it/s]

Page 36:   0%|          | 0/18 [00:00<?, ?it/s]

Page 37:   0%|          | 0/18 [00:00<?, ?it/s]

Page 38:   0%|          | 0/18 [00:00<?, ?it/s]

Page 39:   0%|          | 0/18 [00:00<?, ?it/s]

Page 40:   0%|          | 0/18 [00:00<?, ?it/s]

## Saving the information into a CSV file

In [31]:
# Build one table from every scraped record
df = pd.DataFrame(data=all_startup_data)

# Preview the first rows
df.head()

Unnamed: 0,Company Name,CityU URL,Company Website,Email
0,3D printing low-carbon ecological concrete,https://www.cityu.edu.hk/hktech300/start-ups/s...,No Info Found,No Info Found
1,5th Immersiv,https://www.cityu.edu.hk/hktech300/start-ups/s...,https://brideunionhk.com/,martin@bride-union.com
2,6J Biotechnology (Hong Kong) Limited,https://www.cityu.edu.hk/hktech300/start-ups/a...,http://www.6jbiotech.com/,qzhang@6jbiotech.com
3,A-TCMS (Autonomous Tire Condition Monitoring S...,https://www.cityu.edu.hk/hktech300/start-ups/s...,No Info Found,No Info Found
4,A.I. Ambassador (Virtual Receptionist) kiosk s...,https://www.cityu.edu.hk/hktech300/start-ups/s...,https://www.innocorn.com/,info@innocorn.com


In [33]:
# Write the table to disk as CSV, leaving out the index column
output_csv = "cityu_tech300_startups.csv"
df.to_csv(output_csv, index=False)