In [1]:
import os
import csv
import time
from urllib.parse import urlparse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

In [2]:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

In [3]:
from selenium.common.exceptions import TimeoutException

In [4]:
def create_directory_structure(base_dir, url, start_from):
    """
    Create (if needed) the local directory that mirrors part of a URL's path.

    The URL path is cut at the first occurrence of ``start_from``; everything
    from there up to, but not including, the last path segment is joined onto
    ``base_dir`` and created on disk.

    Args:
        base_dir: Directory under which the mirrored structure is created.
        url: Absolute URL; both a scheme and a network location are required.
        start_from: Substring of the URL path at which the mirrored structure
            begins (matched with ``str.find``, so the first occurrence wins).

    Returns:
        The directory path (``base_dir`` joined with the parent of the last
        URL segment), or ``None`` when the URL is invalid, ``start_from`` does
        not occur in the path, or the directory cannot be created. Failures
        are printed rather than raised.
    """
    try:
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            raise ValueError("Invalid URL")

        # Find the starting index of the desired directory structure
        start_index = parsed_url.path.find(start_from)
        if start_index == -1:
            raise ValueError(f"The start_from segment '{start_from}' not found in the URL path")

        # Extract the relevant path starting from the specified part
        relevant_path = parsed_url.path[start_index:].lstrip('/')

        # The last segment names the page itself, so only its parent is a directory
        parent_dir = os.path.dirname(relevant_path)
        full_path = os.path.join(base_dir, parent_dir)

        # exist_ok=True removes the check-then-create race of a separate
        # os.path.exists() test, and still raises FileExistsError when the
        # path is occupied by a regular file instead of silently returning
        # a "directory" that is not one.
        os.makedirs(full_path, exist_ok=True)

        return full_path
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [12]:
import json

In [13]:
# CSS selector of the column that holds a page's content.
_CONTENT_COLUMN = '#pnlMain > div.fullbody-wrapper.no-t-padding > div > div.row > div.col-sm-9.col-md-9.col-xs-12'

# (label, selector) pairs for the sections looked up after the header.
_EXTRA_SECTIONS = (
    ("Main body", _CONTENT_COLUMN + ' > div.fullbody-wrapper.no-t-padding > div > div > div'),
    ("Date", _CONTENT_COLUMN + ' > div:nth-child(5)'),
)


def _section_html(driver, label, selector, errors, wait_selector=None, timeout=10):
    """Return the outerHTML of the element at ``selector``; on failure log it in ``errors`` and return ""."""
    try:
        if wait_selector is not None:
            # Block until the container exists so dynamically rendered pages get a chance to load.
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, wait_selector))
            )
        element = driver.find_element(By.CSS_SELECTOR, selector)
        # get_attribute may return None; normalise so the caller can concatenate safely.
        return element.get_attribute("outerHTML") or ""
    except Exception:
        message = f"{label} not found"
        errors.append(message)
        print(message)
        return ""


def scrape_and_save_csv(csv_file, base_dir, start_from, errors_file="../data/errors.json"):
    """
    Scrape every URL listed in a CSV file with Selenium and save each page's
    content as an HTML file in the directory tree built by
    ``create_directory_structure``.

    For each URL (first column of each row) the header, body and date sections
    are looked up by CSS selector; whatever is found is concatenated and
    written to ``<directory>/<last URL segment>.html``. Per-URL problems are
    collected and dumped as JSON once the run ends, whether it completed or
    was aborted by an error.

    Args:
        csv_file: Path of the CSV file; the URL is read from column 0.
        base_dir: Root directory for the saved HTML files.
        start_from: URL path substring where the mirrored directory tree starts.
        errors_file: Where the ``{url: [messages]}`` error report is written.
            Defaults to the original hard-coded location.

    Returns:
        None. Errors are printed, not raised; the browser is always closed.
    """
    errors_dict = {}
    driver = None
    try:
        # Set up Selenium WebDriver (Chrome). The browser window is visible;
        # add the "--headless" argument here to hide it.
        chrome_options = Options()
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        driver = webdriver.Chrome(service=Service(), options=chrome_options)

        with open(csv_file, newline='') as csvfile:
            for row in csv.reader(csvfile):
                # Blank lines yield an empty row; skip them instead of crashing on row[0].
                if not row or not row[0].strip():
                    continue
                url = row[0].strip()

                if "https://www.ura.gov.sg/-/media/Corporate/Guidelines/Development-control" in url:
                    print(f"Media file: {url} - skipping...")
                    continue
                print(f"Processing URL: {url}")

                # Create the directory structure
                directory = create_directory_structure(base_dir, url, start_from)
                if directory is None:
                    print(f"Failed to create directory for URL: {url}")
                    errors_dict[url] = ["Directory creation failed"]
                    continue

                # A single unreachable page must not abort the remaining URLs.
                try:
                    driver.get(url)
                except Exception as e:
                    print(f"Failed to load URL: {url} ({e})")
                    errors_dict[url] = [f"Page load failed: {e}"]
                    continue

                errors = []
                # Only the header lookup waits; once the content column is
                # present the other sections are either there or absent.
                html_parts = [
                    _section_html(
                        driver,
                        "Main header",
                        _CONTENT_COLUMN + ' > div',
                        errors,
                        wait_selector=_CONTENT_COLUMN,
                    )
                ]
                for label, selector in _EXTRA_SECTIONS:
                    html_parts.append(_section_html(driver, label, selector, errors))

                html_content = "".join(html_parts)
                if html_content == "":
                    errors.append("Nothing found")
                    print("Nothing found")
                    # Record the failure before skipping so it reaches the error report.
                    errors_dict[url] = errors
                    continue

                if errors:
                    errors_dict[url] = errors

                # Save the HTML content
                parsed_url = urlparse(url)
                # A trailing slash leaves an empty basename; avoid writing a hidden ".html" file.
                filename = os.path.basename(parsed_url.path) or "index"
                if not filename.endswith('.html'):
                    filename += '.html'
                file_path = os.path.join(directory, filename)

                with open(file_path, 'w', encoding='utf-8') as file:
                    file.write(html_content)

                print(f"Saved content of URL: {url} to {file_path}")

                # Rate-limit the scraping
                time.sleep(0.25)

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Always release the browser, even when the run was aborted.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"An error occurred: {e}")
        # Persist whatever was collected so a partial run still leaves a report.
        try:
            with open(errors_file, "w") as outfile:
                json.dump(errors_dict, outfile)
        except OSError as e:
            print(f"An error occurred: {e}")



In [14]:
# Run the scraper over the URLs listed in dc_links.csv, saving pages under
# ../data in folders that mirror the URL path from 'Development-Control' onwards.
csv_file = '../data/dc_links.csv'
base_dir = '../data'
start_from = 'Development-Control'
scrape_and_save_csv(csv_file=csv_file, base_dir=base_dir, start_from=start_from)

Processing URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control
Main header not found
Main body not found
Date not found
Nothing found
Processing URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential
Main header not found
Main body not found
Date not found
Nothing found
Processing URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Flats-Condominiums
Main body not found
Date not found
Saved content of URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Flats-Condominiums to ../data\Development-Control/Residential\Flats-Condominiums.html
Processing URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Bungalows
Main body not found
Date not found
Saved content of URL: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Bungalows to ../data\Development-Control/Residential\Bungalows.html
Processing URL: https://www.ura.gov.sg/Corporate/Guid