# Community Clubs Scraper

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
import time
import csv

def scrape_onepa(csv_path=r"C:\Users\irvin\Downloads\community_clubs_next_span.csv"):
    """Scrape the community club listing on onepa.gov.sg and save it as CSV.

    Opens https://www.onepa.gov.sg/cc in Chrome, parses each results page
    with BeautifulSoup, and keeps clicking the "Next" control until a page
    contributes no new entries or the control cannot be found.  The unique
    entries are written, in the order they were first seen, to ``csv_path``
    with the columns ``Name``, ``Address`` and ``Tel``.

    Args:
        csv_path: Destination of the CSV file.  Defaults to the path the
            script was originally written for; change it to a location
            that exists on your machine.

    Raises:
        Exception: Anything raised by Selenium outside the "click Next"
            step (e.g. the browser being closed mid-run) propagates to the
            caller, but only after the browser has been shut down.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # run headless if desired
    driver = webdriver.Chrome(options=options)

    # Tuples already seen, for O(1) duplicate detection.
    unique_set = set()
    # Row dicts in first-seen order, ready for csv.DictWriter.
    all_results = []

    # try/finally guarantees the Chrome process is released even when the
    # scrape fails part-way through.
    try:
        driver.get("https://www.onepa.gov.sg/cc")
        time.sleep(5)  # let page load

        while True:
            # 1) Parse current page
            soup = BeautifulSoup(driver.page_source, "html.parser")
            cc_divs = soup.find_all("div", class_="CCLocatorItem__inner__details")

            # Number of previously unseen items contributed by this page.
            items_added_this_page = 0

            for cc_div in cc_divs:
                name_tag = cc_div.find("a", class_="CCLocatorItem__inner__details--heading")
                name = name_tag.get_text(strip=True) if name_tag else ""

                # First label paragraph is read as the address, second as
                # the telephone line.
                p_tags = cc_div.find_all("p", class_="CCLocatorItem__inner__details--label")
                address = p_tags[0].get_text(strip=True) if len(p_tags) > 0 else ""
                tel = p_tags[1].get_text(strip=True) if len(p_tags) > 1 else ""
                if tel.startswith("Tel:"):
                    # Strip only the leading label, not later occurrences.
                    tel = tel[len("Tel:"):].strip()

                item_tuple = (name, address, tel)
                if item_tuple in unique_set:
                    continue

                unique_set.add(item_tuple)
                all_results.append({
                    "Name": name,
                    "Address": address,
                    "Tel": tel
                })
                items_added_this_page += 1

            # A page with nothing new means we are on the last page (or the
            # site is repeating data), so stop.
            if items_added_this_page == 0:
                print("No new results found on this page. Likely the last page or repeating data.")
                break

            # 2) Try to click Next
            try:
                next_btn = driver.find_element(By.CSS_SELECTOR, "span.btnNext")

                # Scroll into view
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_btn)
                time.sleep(1)

                # JS click to avoid intercept issues
                driver.execute_script("arguments[0].click();", next_btn)

                # Wait for next page to load
                time.sleep(3)
            except NoSuchElementException:
                print("No next button found. Probably the last page.")
                break
            except Exception as e:
                # Best effort: keep what has been collected so far.
                print("Error clicking Next:", e)
                break
    finally:
        driver.quit()

    # 3) Save to CSV
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        fieldnames = ["Name", "Address", "Tel"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)

    print(f"Scraped a total of {len(all_results)} **unique** items.")
    print(f"Saved results to {csv_path}")

# Run the community club scraper when this cell/script is executed directly.
if __name__ == "__main__":
    scrape_onepa()


InvalidSessionIdException: Message: invalid session id: session deleted as the browser has closed the connection
from disconnected: not connected to DevTools
  (Session info: chrome=134.0.6998.178)
Stacktrace:
	GetHandleVerifier [0x00007FF733B04C25+3179557]
	(No symbol) [0x00007FF7337688A0]
	(No symbol) [0x00007FF7335F91CA]
	(No symbol) [0x00007FF7335E4F85]
	(No symbol) [0x00007FF733609F94]
	(No symbol) [0x00007FF73367F9DF]
	(No symbol) [0x00007FF73369FBE2]
	(No symbol) [0x00007FF733677A03]
	(No symbol) [0x00007FF7336406D0]
	(No symbol) [0x00007FF733641983]
	GetHandleVerifier [0x00007FF733B667CD+3579853]
	GetHandleVerifier [0x00007FF733B7D1D2+3672530]
	GetHandleVerifier [0x00007FF733B72153+3627347]
	GetHandleVerifier [0x00007FF7338D092A+868650]
	(No symbol) [0x00007FF733772FFF]
	(No symbol) [0x00007FF73376F4A4]
	(No symbol) [0x00007FF73376F646]
	(No symbol) [0x00007FF73375EAA9]
	BaseThreadInitThunk [0x00007FFE0AC8E8D7+23]
	RtlUserThreadStart [0x00007FFE0BB9BF6C+44]


# Primary School Scraper

In [None]:
def scrape_moe_schools(csv_path=r"C:\Users\irvin\Downloads\schools.csv"):
    """Scrape the MOE SchoolFinder primary school listing and save it as CSV.

    Opens the SchoolFinder page in Chrome, parses each results page with
    BeautifulSoup, and keeps clicking the "next page" button until a page
    contributes no new entries or the button cannot be found.  The unique
    entries are written, in the order they were first seen, to ``csv_path``
    with the columns ``SchoolName`` and ``Address``.

    Args:
        csv_path: Destination of the CSV file.  Defaults to the path the
            script was originally written for; change it to a location
            that exists on your machine.

    Raises:
        Exception: Anything raised by Selenium outside the "click Next"
            step propagates to the caller, but only after the browser has
            been shut down.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Uncomment to run headless if desired
    driver = webdriver.Chrome(options=options)

    # Tuples already seen, for O(1) duplicate detection.
    unique_set = set()
    # Row dicts in first-seen order, ready for csv.DictWriter.
    all_results = []

    # try/finally guarantees the Chrome process is released even when the
    # scrape fails part-way through.
    try:
        driver.get("https://www.moe.gov.sg/schoolfinder?journey=Primary%20school")
        time.sleep(5)  # Allow time for page to load fully

        while True:
            # 1) Parse the current page with BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Each school card is a <div> whose class attribute is
            # "d:f fld:r jc:sb m-b:xs ai:c".  Adjust if the site's markup
            # changes.
            school_cards = soup.find_all("div", class_="d:f fld:r jc:sb m-b:xs ai:c")

            # Number of previously unseen items contributed by this page.
            items_added_this_page = 0

            for card in school_cards:
                # School name: <p class="ff:heading ts:l fw:6 c:grey-1 m-b:s">
                name_tag = card.find("p", class_="ff:heading ts:l fw:6 c:grey-1 m-b:s")
                school_name = name_tag.get_text(strip=True) if name_tag else ""

                # Address: <p class="ts:s c:grey-2 m-t:0 m-l:s p-l:m">
                address_tag = card.find("p", class_="ts:s c:grey-2 m-t:0 m-l:s p-l:m")
                address = address_tag.get_text(strip=True) if address_tag else ""

                item_tuple = (school_name, address)
                if item_tuple in unique_set:
                    continue

                unique_set.add(item_tuple)
                all_results.append({
                    "SchoolName": school_name,
                    "Address": address
                })
                items_added_this_page += 1

            # If no new items were added, we can assume we've reached the end
            if items_added_this_page == 0:
                print("No new results found on this page. Stopping.")
                break

            # 2) Click the "Next" button to go to the next page
            try:
                # The pagination control is selected as
                #   button.moe-pagination__btn.dir--right
                # i.e. a <button> carrying both of those classes.
                next_btn = driver.find_element(By.CSS_SELECTOR, "button.moe-pagination__btn.dir--right")

                # Scroll into view (sometimes helpful)
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", next_btn)
                time.sleep(1)

                # Click via JavaScript
                driver.execute_script("arguments[0].click();", next_btn)

                # Wait a bit for the next page to load
                time.sleep(3)

            except NoSuchElementException:
                print("No next button found. Probably the last page.")
                break
            except Exception as e:
                # Best effort: keep what has been collected so far.
                print("Error clicking Next:", e)
                break
    finally:
        # 3) Done scraping (or failed): close the browser
        driver.quit()

    # 4) Write results to CSV
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        fieldnames = ["SchoolName", "Address"]
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_results)

    print(f"Scraped a total of {len(all_results)} unique items.")
    print(f"Saved results to {csv_path}")

# Run the primary school scraper when this cell/script is executed directly.
if __name__ == "__main__":
    scrape_moe_schools()

No new results found on this page. Stopping.
Scraped a total of 182 unique items.
Saved results to C:\Users\irvin\Downloads\schools.csv


# Initial PropertyGuru Scraper

In [6]:
import re
import time
from selenium import webdriver
from bs4 import BeautifulSoup, Comment

def property_guru(input_link):
    """Scrape key facts from a single PropertyGuru listing page.

    Loads ``input_link`` in Chrome, waits for the page to render, then
    extracts a handful of fields from the resulting HTML.

    Args:
        input_link: URL of the PropertyGuru listing to scrape.

    Returns:
        dict with the string-valued keys ``address``, ``listed_price``,
        ``hdb_type``, ``lease_year`` and ``sqft``.  A field that cannot be
        located on the page is returned as an empty string.

    Raises:
        Exception: Anything raised by Selenium while loading the page
            propagates to the caller, but only after the browser has been
            shut down.
    """
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless")  # Uncomment to run headless if desired
    driver = webdriver.Chrome(options=options)

    # Only the rendered HTML is needed, so grab it and release the browser
    # right away; try/finally makes sure Chrome is closed even on failure.
    try:
        driver.get(input_link)
        time.sleep(5)  # Give the page time to load
        page_html = driver.page_source
    finally:
        driver.quit()

    soup = BeautifulSoup(page_html, "html.parser")

    # 1) Address
    address_tag = soup.find("span", class_="full-address__address")
    address = address_tag.get_text(strip=True) if address_tag else ""

    # 2) Listed price
    price_tag = soup.find("h2", class_="amount", attrs={"data-automation-id": "overview-price-txt"})
    listed_price = price_tag.get_text(strip=True) if price_tag else ""

    # 3) HDB type: the meta-table value whose text contains "HDB for sale"
    hdb_type_tag = soup.find("div", class_="meta-table__item__wrapper__value", string=re.compile("HDB for sale"))
    hdb_type = hdb_type_tag.get_text(strip=True) if hdb_type_tag else ""

    # 4) Lease year: first four-digit number in the "TOP in ..." value
    lease_year_tag = soup.find("div", class_="meta-table__item__wrapper__value", string=re.compile("^TOP in"))
    lease_year = ""
    if lease_year_tag:
        year_match = re.search(r"\b\d{4}\b", lease_year_tag.get_text(strip=True))
        if year_match:
            lease_year = year_match.group(0)

    # 5) Square feet, taken from the third <h4 class="amenity__text">
    amenity_tags = soup.find_all("h4", class_="amenity__text")

    sqft = ""
    if len(amenity_tags) >= 3:
        sqft_tag = amenity_tags[2]

        # Collect the non-empty text fragments, ignoring HTML comment nodes.
        lines = []
        for element in sqft_tag.descendants:
            if isinstance(element, Comment):
                continue
            if element.string and element.string.strip():
                lines.append(element.string.strip())

        # Expecting something like: ['1,109', 'sqft']
        if len(lines) >= 2:
            number_match = re.search(r'([\d,]+)', lines[0])
            if number_match and "sqft" in lines[1].lower():
                sqft = f"{number_match.group(1)} sqft"

        # Fallback on the whole text content.  This runs whenever the
        # structured attempt above produced nothing, not only when fewer
        # than two fragments were found, so an unexpected fragment layout
        # no longer silently yields an empty value.
        if not sqft:
            text_val = sqft_tag.get_text(separator=" ", strip=True)
            text_match = re.search(r'([\d,]+).*?sqft', text_val, re.IGNORECASE)
            if text_match:
                sqft = f"{text_match.group(1)} sqft"

    data = {
        "address": address,
        "listed_price": listed_price,
        "hdb_type": hdb_type,
        "lease_year": lease_year,
        "sqft": sqft
    }
    return data

# Example usage: scrape one sample listing and print the extracted fields
# when this cell/script is executed directly.
if __name__ == "__main__":
    link = "https://www.propertyguru.com.sg/listing/hdb-for-sale-658c-jurong-west-street-65-25503413"
    results = property_guru(link)
    print(results)


{'address': '658C Jurong West Street 65', 'listed_price': 'S$ 650,000', 'hdb_type': '5I HDB for sale', 'lease_year': '2000', 'sqft': '1,206 sqft'}
