In [1]:
import time
from collections import deque

from unstructured.partition.html import partition_html

In [2]:
def get_body(elements):
    '''
    This function slices out the elements that make up the main body of the webpage, in the absence of usage of proper body tags

    The body starts at the first element whose category is 'Title' and stops just before the first
    later element whose text is 'Urban Redevelopment Authority'. The first element whose text is
    'Search' is skipped outright and is never considered as a start or end marker.

    Parameters:
        elements (list): A list of Unstructured document objects (each must expose `.text` and `.category`)

    Returns:
        list: The sliced elements list containing the document objects between the two markers.
              If no 'Title' element precedes the end marker, the slice starts at index 0.
              If the end marker is never found, the slice is empty.
    '''

    # None (not 0) means "not found yet": 0 is a valid index, and using it as the
    # sentinel let a Title at index 0 be overwritten by the next Title.
    start = None
    end = None
    # NOTE(review): this flag only causes the first 'Search' element to be skipped;
    # it does not gate Title detection. Possibly the start marker was meant to be
    # searched for only after 'Search' was seen - confirm against the page layout.
    search_seen = False

    for i, element in enumerate(elements):
        if not search_seen and element.text == 'Search':
            search_seen = True
            continue
        if start is None and element.category == 'Title':
            start = i
            continue
        if element.text == 'Urban Redevelopment Authority':
            end = i
            break

    if end is None:
        # Preserved behaviour: without the end marker nothing is treated as body.
        return elements[:0]
    return elements[(0 if start is None else start):end]

In [3]:
def scrape_links(url, visited, queue):
    '''
    This function downloads one page and collects the Development Control links found in its main body

    Links that contain neither 'https://' nor 'http://' are prefixed with the site root.
    Links are kept only if they point at www.ura.gov.sg (excluding /maps) and contain one of the
    two Development Control prefixes.

    Parameters:
        url (str): Address of the page to download and parse
        visited (set): URLs that have already been crawled; links found in it are skipped
        queue (list): URLs already waiting to be crawled; links found in it are skipped

    Returns:
        list: The new links in order of first appearance, without duplicates. If fetching or
              parsing fails, a message is printed and whatever was collected so far is returned
              (an empty list when the failure happens before any link is found).
    '''
    site_root = 'https://www.ura.gov.sg'
    wanted_prefixes = (
        "https://www.ura.gov.sg/-/media/Corporate/Guidelines/Development-control",
        "https://www.ura.gov.sg/Corporate/Guidelines/Development-Control",
    )
    unique_links = []
    try:
        elements = partition_html(url=url,
                                    headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'})
        body_elements = get_body(elements)
        for element in body_elements:
            links = element.metadata.link_urls
            if links is None:
                continue
            for link in links:
                if 'https://' not in link and 'http://' not in link:
                    link = site_root + link
                if "https://www.ura.gov.sg/" not in link or "https://www.ura.gov.sg/maps" in link:
                    continue
                if not any(prefix in link for prefix in wanted_prefixes):
                    continue
                # Also check the links gathered from this page so the same URL
                # is not returned twice when a page repeats it.
                if link in visited or link in queue or link in unique_links:
                    continue
                unique_links.append(link)
    except Exception as exc:
        # Best effort: a page that cannot be fetched or parsed is reported and skipped.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the crawl.
        print(f"Skipping {url}: {exc!r}")
    return unique_links
    

In [4]:
def all_links_in_page(url, delay=0.5):
    '''
    This function crawls breadth-first from a starting page, following the links returned by scrape_links

    Parameters:
        url (str): Page to start crawling from
        delay (float): Seconds to pause after each page is processed (defaults to 0.5)

    Returns:
        list: Every URL that was crawled, in the order it was processed, starting with `url`
    '''
    links = []
    visited = set()
    # deque gives O(1) removal from the front; list.pop(0) shifts every remaining item.
    queue = deque([url])
    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        print(f"Current Page: {current_url}")
        visited.add(current_url)
        links.append(current_url)
        # scrape_links receives the live visited set and queue so it can skip known URLs.
        queue.extend(scrape_links(current_url, visited, queue))
        time.sleep(delay)
    return links

In [5]:
# Run the crawl starting from this URL; `test` holds the returned list of crawled URLs.
test = all_links_in_page("https://www.ura.gov.sg/Corporate/Guidelines/Development-Control")

Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Flats-Condominiums
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Bungalows
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Semi-Detached-Houses
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Terrace
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Strata-Landed-Housing
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/Commercial
Current Page: https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/Hotel
Cur

In [7]:
# Tally the crawled URLs that do not contain the media prefix, then print the total.
count = sum(
    1
    for each in test
    if "https://www.ura.gov.sg/-/media/Corporate/Guidelines/Development-control" not in each
)
print(count)

448


In [9]:
# Show the list of crawled URLs as the cell output.
test

['https://www.ura.gov.sg/Corporate/Guidelines/Development-Control',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Flats-Condominiums',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Bungalows',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Semi-Detached-Houses',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Terrace',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Residential/Strata-Landed-Housing',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/Commercial',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/Hotel',
 'https://www.ura.gov.sg/Corporate/Guidelines/Development-Control/Non-Residential/B1',
 'https://www.ur

In [11]:
import csv
# Save the crawled links to CSV, one link per row.
with open("../data/dc_links.csv", "w", newline="") as csvfile:
    writer = csv.writer(csvfile)
    # writerows() expects each row to be a sequence of fields. Passing the bare
    # strings would iterate each URL and write one character per column, so wrap
    # every link in a one-item row.
    writer.writerows([link] for link in test)


In [12]:
import csv

def write_links_to_csv(links, filename):
    """Save links to a CSV file, one link per row in a single column.

    Args:
        links: List of strings containing the links.
        filename: Path of the CSV file to create (opened in write mode, so an
            existing file is overwritten).
    """
    with open(filename, 'w', newline='') as out_file:
        # Wrap each link in a one-item row so it is written as a single field.
        csv.writer(out_file).writerows([link] for link in links)

# Write the crawled links in `test` to the CSV file at `filename`
# (replace these with your own list of links and output path as needed).
filename = "../data/dc_links.csv"
write_links_to_csv(test, filename)
print(f"Links written to CSV: {filename}")



Links written to CSV: ../data/dc_links.csv
