In [94]:
import csv
import time  # Add this import for sleep handling
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Start the shared Chrome session that scrape_page() drives.
# NOTE(review): newer Selenium releases may no longer honour the `headless`
# attribute — confirm against the installed version.
options = Options()
options.headless = False  # leave the browser window visible; use True for no GUI
chromedriver_path = ChromeDriverManager().install()  # value handed to Service below
driver = webdriver.Chrome(
    service=Service(chromedriver_path),
    options=options,
)

def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or 'N/A' when the tag is None."""
    return tag.text.strip() if tag is not None else 'N/A'


def scrape_page(url):
    """Scrape course listings from a single search-results page.

    Loads ``url`` in the module-level ``driver``, waits up to 15 seconds for
    elements with class ``card-body`` to be present, then parses the rendered
    page source with BeautifulSoup.

    Args:
        url: Address of the page to load.

    Returns:
        A list of dicts with the keys 'Title', 'Provider', 'Upcoming Date'
        and 'Course Fee'. A field whose element is missing is 'N/A'. Cards
        without an ``h5.card-title`` are skipped. An empty list is returned
        when the wait fails.
    """
    driver.get(url)

    # Ensure page has loaded by waiting for a key element to be present
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'card-body'))
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller treat it as an empty page.
        print(f"Error loading page: {e}")
        return []

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    courses = []
    for card in soup.find_all('div', class_='card-body'):
        # Not every `card-body` div is a course card; those have no title.
        title_tag = card.find('h5', class_='card-title')
        if title_tag is None:
            continue

        # Provider name is the link text inside the provider container.
        provider_tag = card.find('div', class_='course-provider')
        provider_link = provider_tag.find('a') if provider_tag is not None else None

        # Upcoming date is the <strong> inside the first list item.
        date_list = card.find('ul', class_='list-group-flush')
        date_item = date_list.find('li', class_='list-group-item') if date_list is not None else None
        date_tag = date_item.find('strong') if date_item is not None else None

        # Fee lives in the sibling footer; each step may be absent, so every
        # lookup is guarded instead of chaining .find() calls on a possible None.
        footer = card.find_next_sibling('div', class_='card-footer')
        fee_paragraph = footer.find('p') if footer is not None else None
        fee_tag = fee_paragraph.find('strong') if fee_paragraph is not None else None

        courses.append({
            'Title': _text_or_na(title_tag),
            'Provider': _text_or_na(provider_link),
            'Upcoming Date': _text_or_na(date_tag),
            'Course Fee': _text_or_na(fee_tag),
        })

    return courses

# URL of the first results page (requested without a `start` parameter)
first_page_url = (
    "https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
    "?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D"
    "&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)"
    "&cattext=Marine%20%26%20Port%20Services"
)

# URL template for subsequent pages; `{start_num}` fills the `start` parameter
base_url_with_start = (
    "https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
    "?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q="
    "*%3A*&start={start_num}&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services"
)

PAGE_SIZE = 24          # step applied to the `start` parameter between pages
MAX_COURSES = 95        # keep paging while the running count is at or below this
PAGE_DELAY_SECONDS = 6  # pause before each follow-up request

all_courses = []
count = 0  # Number of courses actually collected so far

try:
    # Scrape the first page (without `start` parameter)
    print(f"Scraping first page: {first_page_url}")
    courses = scrape_page(first_page_url)
    all_courses.extend(courses)
    count += len(courses)  # count what was really scraped, not an assumed page size

    print(all_courses)

    # Scrape subsequent pages (with `start` parameter). Each offset is requested
    # exactly once so no page ends up duplicated in the CSV.
    start_num = PAGE_SIZE
    while count <= MAX_COURSES:
        time.sleep(PAGE_DELAY_SECONDS)  # only wait when another request follows

        page_url = base_url_with_start.format(start_num=start_num)
        print(f"Scraping {page_url}")

        courses = scrape_page(page_url)
        if not courses:
            # An empty page would never advance `count`; stop instead of looping forever.
            print("No more courses found. Exiting.")
            break

        all_courses.extend(courses)
        count += len(courses)
        start_num += PAGE_SIZE

    if count > MAX_COURSES:
        print(f"Course count exceeded {MAX_COURSES}. Exiting loop.")
finally:
    # Close the browser session even if scraping raised
    driver.quit()

# Write data to a CSV file
csv_file = "courses.csv"
csv_columns = ['Title', 'Provider', 'Upcoming Date', 'Course Fee']

try:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_columns)
        writer.writeheader()  # Write header row
        writer.writerows(all_courses)  # Write course data
    print(f"Data successfully written to {csv_file}")
except Exception as e:
    # Top-level boundary: report the failure rather than raising at the end of the run.
    print(f"Error writing to CSV: {e}")


Scraping first page: https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services
[{'Title': 'DIPLOMA IN MARITIME AND OFFSHORE MANAGEMENT', 'Provider': 'SINGAPORE POLYTECHNIC', 'Upcoming Date': '14 Oct 24', 'Course Fee': '$4,200.00'}, {'Title': 'Modular Certificate in Naval Architecture - Part of Diploma in Engineering (Marine) (DEMOT)', 'Provider': 'NGEE ANN POLYTECHNIC', 'Upcoming Date': '14 Oct 24', 'Course Fee': '$3,456.00'}, {'Title': 'Introduction to Eco-Clean Marine Energy', 'Provider': 'NGEE ANN POLYTECHNIC', 'Upcoming Date': '14 Oct 24', 'Course Fee': '$600.00'}, {'Title': 'Proficiency in Designated Security Duties', 'Provider': 'SINGAPORE POLYTECHNIC', 'Upcoming Date': '14 Oct 24', 'Course Fee': '$300.00'}, {'Title': 'Operational Use of Electro

In [87]:
import csv
import time  # For sleep handling
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Start the shared Chrome session that scrape_page() drives.
# NOTE(review): newer Selenium releases may no longer honour the `headless`
# attribute — confirm against the installed version.
options = Options()
options.headless = False  # leave the browser window visible; use True for no GUI
chromedriver_path = ChromeDriverManager().install()  # value handed to Service below
driver = webdriver.Chrome(
    service=Service(chromedriver_path),
    options=options,
)

def _text_or_na(tag):
    """Return the stripped text of a parsed tag, or 'N/A' when the tag is None."""
    return tag.text.strip() if tag is not None else 'N/A'


def scrape_page(url):
    """Scrape course listings from a single search-results page, with debug output.

    Loads ``url`` in the module-level ``driver``, waits up to 10 seconds for
    elements with class ``card-body`` to be present, then parses the rendered
    page source with BeautifulSoup. Progress is printed at each step.

    Args:
        url: Address of the page to load.

    Returns:
        A list of dicts with the keys 'Title', 'Provider', 'Upcoming Date'
        and 'Course Fee'. A field whose element is missing is 'N/A'. Cards
        without an ``h5.card-title`` are skipped. An empty list is returned
        when the wait fails.
    """
    print(f"Accessing: {url}")  # Debugging statement
    driver.get(url)

    # Ensure page has loaded by waiting for a key element to be present
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, 'card-body'))
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller treat it as an empty page.
        print(f"Error loading page: {e}")
        return []
    print("Page loaded successfully.")  # Debugging statement

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    courses = []
    for card in soup.find_all('div', class_='card-body'):
        # Not every `card-body` div is a course card; those have no title.
        title_tag = card.find('h5', class_='card-title')
        if title_tag is None:
            print("Skipping course with no title.")  # Debugging statement
            continue

        # Provider name is the link text inside the provider container.
        provider_tag = card.find('div', class_='course-provider')
        provider_link = provider_tag.find('a') if provider_tag is not None else None

        # Upcoming date is the <strong> inside the first list item.
        date_list = card.find('ul', class_='list-group-flush')
        date_item = date_list.find('li', class_='list-group-item') if date_list is not None else None
        date_tag = date_item.find('strong') if date_item is not None else None

        # Fee lives in the sibling footer; each step may be absent, so every
        # lookup is guarded instead of chaining .find() calls on a possible None.
        footer = card.find_next_sibling('div', class_='card-footer')
        fee_paragraph = footer.find('p') if footer is not None else None
        fee_tag = fee_paragraph.find('strong') if fee_paragraph is not None else None

        course_data = {
            'Title': _text_or_na(title_tag),
            'Provider': _text_or_na(provider_link),
            'Upcoming Date': _text_or_na(date_tag),
            'Course Fee': _text_or_na(fee_tag),
        }
        print(f"Scraped course: {course_data}")  # Debugging statement
        courses.append(course_data)

    return courses

# URL of the first results page (requested without a `start` parameter)
first_page_url = (
    "https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
    "?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D"
    "&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)"
    "&cattext=Marine%20%26%20Port%20Services"
)

# URL template for subsequent pages; `{start_num}` fills the `start` parameter
base_url_with_start = (
    "https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
    "?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q="
    "*%3A*&start={start_num}&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services"
)

PAGE_SIZE = 24          # step applied to the `start` parameter between pages
MAX_COURSES = 95        # keep paging while the running count is at or below this
PAGE_DELAY_SECONDS = 5  # pause before each follow-up request

all_courses = []
count = 0  # Number of courses actually collected so far

try:
    # Scrape the first page (without `start` parameter)
    print(f"Scraping first page: {first_page_url}")
    courses = scrape_page(first_page_url)
    all_courses.extend(courses)  # extending with an empty list is a no-op
    count += len(courses)
    print(f"Total courses scraped: {count}")  # Debugging statement

    # Scrape subsequent pages (with `start` parameter)
    start_num = PAGE_SIZE
    while count <= MAX_COURSES:
        time.sleep(PAGE_DELAY_SECONDS)  # only wait when another request follows

        page_url = base_url_with_start.format(start_num=start_num)
        print(f"Scraping {page_url}")

        courses = scrape_page(page_url)
        if not courses:
            # An empty page would never advance `count`; stop instead of looping forever.
            print("No more courses found. Exiting.")
            break

        all_courses.extend(courses)
        count += len(courses)
        print(f"Total courses scraped so far: {count}")  # Debugging statement
        start_num += PAGE_SIZE

    if count > MAX_COURSES:
        print(f"Course count exceeded {MAX_COURSES}. Exiting loop.")
finally:
    # Close the browser session even if scraping raised
    driver.quit()

# Write data to a CSV file
csv_file = "courses.csv"
csv_columns = ['Title', 'Provider', 'Upcoming Date', 'Course Fee']

try:
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_columns)
        writer.writeheader()  # Write header row
        writer.writerows(all_courses)  # Write course data
    print(f"Data successfully written to {csv_file}")
except Exception as e:
    # Top-level boundary: report the failure rather than raising at the end of the run.
    print(f"Error writing to CSV: {e}")


Scraping first page: https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services
Accessing: https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services
Page loaded successfully.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skippin