In [1]:
import csv
import os
import re
from playwright.sync_api import sync_playwright
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully.")


All libraries imported successfully.


In [4]:
BASE_URL = "https://hk.centanet.com/findproperty/en/list/buy"
OUTPUT_CSV = "/Users/clarencemarvin/Downloads/regularized/property_data.csv"


def parse_title_sm(title_sm_text: str):
    """
    Example: "High Floor・FLAT E・2 Rooms"
    Returns floor (e.g. "High Floor"), floor_level ("High"),
    unit ("FLAT E"), bedroom_count ("2").
    """
    floor = ""
    floor_level = ""
    unit = ""
    bedroom_count = ""

    if not title_sm_text:
        return floor, floor_level, unit, bedroom_count

    parts = [p.strip() for p in title_sm_text.split("・") if p.strip()]

    if parts:
        floor = parts[0]                 # "High Floor"
        floor_level = floor.split()[0]   # "High"

    for p in parts:
        if "FLAT" in p.upper():
            unit = p
        if "Room" in p or "Rooms" in p:
            m = re.search(r"(\d+)", p)
            if m:
                bedroom_count = m.group(1)

    return floor, floor_level, unit, bedroom_count


def extract_number(text: str):
    """Extract first integer-like number (ignore commas)."""
    if not text:
        return ""
    m = re.search(r"(\d[\d,]*)", text)
    return m.group(1).replace(",", "") if m else ""


async def safe_text(locator):
    """Return stripped inner_text of first match or empty string."""
    try:
        if await locator.count() == 0:
            return ""
        return (await locator.first.inner_text()).strip()
    except Exception:
        return ""


async def extract_floor_info(card):
    """
    From div.floor-info:
    e.g. "44 years · 81% Efficiency(%)"
    Returns (property_age, efficiency_text)
    """
    floor_info_loc = card.locator("div.floor-info")
    text = await safe_text(floor_info_loc)
    if not text:
        return "", ""
    parts = [p.strip() for p in text.split("·") if p.strip()]
    property_age = ""
    efficiency = ""
    for p in parts:
        if "year" in p:
            property_age = p
        if "Efficiency" in p:
            efficiency = p
    return property_age, efficiency


async def extract_orientation(card):
    """
    Orientation: look for a small span whose text looks like
    'South East', 'South West', 'North', etc.
    """
    orientation = ""
    span_loc = card.locator("span")
    count = await span_loc.count()
    for i in range(count):
        txt = (await span_loc.nth(i).inner_text()).strip()
        if 0 < len(txt) <= 20 and re.search(r"\b(North|South|East|West)\b", txt):
            orientation = txt
            break
    return orientation


async def scrape_current_page(page, page_index):
    """
    Scrape all listings on the current result page.
    """
    listings = page.locator("div.list a.property-text")
    n = await listings.count()
    print(f"[Page {page_index}] Found {n} listings")

    results = []

    for i in range(n):
        card = listings.nth(i)

        property_name = await safe_text(card.locator("span.title-lg"))
        title_sm = await safe_text(card.locator("span.title-sm"))
        floor, floor_level, unit, bedroom_count = parse_title_sm(title_sm)

        # district
        district = await safe_text(card.locator("span.adress.tag-adress"))
        if not district:
            district = await safe_text(card.locator("div.title + div.area"))

        price = await safe_text(card.locator("span.price-info"))
        property_age, efficiency = await extract_floor_info(card)

        # saleable_area
        sa_text = await safe_text(
            card.locator("div.area-block.usable-area div.num span.hidden-xs-only")
        )
        saleable_area = extract_number(sa_text)

        # gross_floor_area
        gfa_text = await safe_text(
            card.locator("div.area-block.construction-area div.num span.hidden-xs-only")
        )
        gross_floor_area = extract_number(gfa_text)

        # pet_policy
        pet_policy = "yes" if await card.locator("text=Pet friendly").count() > 0 else "no"

        # attribute tags
        attr_loc = card.locator("div.tag.hidden-sm-and-down span")
        attributes = []
        if await attr_loc.count() > 0:
            texts = await attr_loc.all_inner_texts()
            attributes = [t.strip() for t in texts if t.strip()]
        attribute = " | ".join(attributes)

        # orientation
        orientation = await extract_orientation(card)

        results.append({
            "property_name": property_name,
            "district": district,
            "bedroom_count": bedroom_count,
            "price": price,
            "unit": unit,
            "property_age": property_age,
            "floor": floor,
            "efficiency": efficiency,
            "gross_floor_area": gross_floor_area,
            "saleable_area": saleable_area,
            "orientation": orientation,
            "pet_policy": pet_policy,
            "attribute": attribute,
            "floor_level": floor_level,
        })

    return results


async def main():
    all_rows = []

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(BASE_URL, wait_until="networkidle")

        await page.wait_for_timeout(3000)

        for page_index in range(1, 417):
            all_rows.extend(await scrape_current_page(page, page_index))

            if page_index < 5:
                next_btn = page.locator("button.btn-next")
                if await next_btn.is_disabled():
                    print("Next button disabled; stopping early.")
                    break
                await next_btn.click()
                await page.wait_for_timeout(3000)

        await browser.close()

    # write CSV
    os.makedirs(os.path.dirname(OUTPUT_CSV), exist_ok=True)

    fieldnames = [
        "property_name",
        "district",
        "bedroom_count",
        "price",
        "unit",
        "property_age",
        "floor",
        "efficiency",
        "gross_floor_area",
        "saleable_area",
        "orientation",
        "pet_policy",
        "attribute",
        "floor_level",
    ]

    with open(OUTPUT_CSV, "w", newline="", encoding="utf-8-sig") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_rows)

    print(f"Saved {len(all_rows)} rows to {OUTPUT_CSV}")


await main()


[Page 1] Found 24 listings
[Page 2] Found 24 listings
[Page 3] Found 24 listings
[Page 4] Found 24 listings
[Page 5] Found 24 listings
Saved 120 rows to /Users/clarencemarvin/Downloads/regularized/centanet_trial.csv
