In [None]:
# # consistent scraping using randomized delay

# import csv
# import time
# import random  # For randomized delays
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from bs4 import BeautifulSoup
# from datetime import datetime

# # Get today's date in the required format (YYYY-MM-DD)
# current_date = datetime.today().strftime('%Y-%m-%d')

# def setup_driver():
#     """Set up the Selenium Chrome driver."""
#     options = Options()
#     options.headless = False  # Set to True if GUI isn't needed
#     return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# def scrape_page(url, retries=3):
#     """Scrape data from a single page with retry logic."""
#     for attempt in range(retries):
#         try:
#             driver = setup_driver()
#             print(f"Accessing: {url} (Attempt {attempt + 1}/{retries})")
#             driver.get(url)

#             # Wait for elements to load
#             WebDriverWait(driver, 15).until(
#                 EC.presence_of_all_elements_located((By.CLASS_NAME, 'card-body'))
#             )
#             print("Page loaded successfully.")
#             soup = BeautifulSoup(driver.page_source, 'html.parser')
#             driver.quit()  # Close the browser
#             return extract_courses(soup)

#         except Exception as e:
#             print(f"Error loading page: {e}. Retrying...")
#             driver.quit()  # Ensure the browser is closed

#     print(f"Failed to load {url} after {retries} attempts.")
#     return []  # Return an empty list if all retries fail

# def extract_courses(soup):
#     """Extract course data from the page's HTML soup."""
#     courses = []

#     for card in soup.find_all('div', class_='card-body'):
#         title_tag = card.find('h5', class_='card-title')
#         course_title = title_tag.text.strip() if title_tag else 'N/A'

#         if course_title == 'N/A':
#             print("Skipping course with no title.")
#             continue

#         provider_tag = card.find('div', class_='course-provider')
#         provider = provider_tag.find('a').text.strip() if provider_tag and provider_tag.find('a') else 'N/A'

#         upcoming_date_tag = card.find('ul', class_='list-group-flush')
#         date_item = upcoming_date_tag.find('li', class_='list-group-item') if upcoming_date_tag else None
#         upcoming_date = date_item.find('strong').text.strip() if date_item and date_item.find('strong') else 'N/A'

#         footer = card.find_next_sibling('div', class_='card-footer')
#         full_fee = footer.find('p').find('strong').text.strip() if footer else 'N/A'

#         course_data = {
#             'Title': course_title,
#             'Provider': provider,
#             'Upcoming Date': upcoming_date,
#             'Course Fee': full_fee
#         }
#         print(f"Scraped course: {course_data}")
#         courses.append(course_data)

#     return courses

# # Base URL for the first page
# base_url = (
#     f"https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
#     f"?fq=Course_Supp_Period_To_1%3A%5B{current_date}T00%3A00%3A00Z%20TO%20*%5D"
#     "&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)"
#     "&cattext=Marine%20%26%20Port%20Services"
# )

# all_courses = []
# start = 24  # Initial start value
# count = 0

# # Scrape the first page
# print(f"Scraping first page: {base_url}")
# courses = scrape_page(base_url)
# if courses:
#     all_courses.extend(courses)
# count += len(courses)
# print(f"Total courses scraped: {count}")

# # Loop through subsequent pages
# while count < 95:
#     page_url = (
#         f"https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
#         f"?fq=Course_Supp_Period_To_1%3A%5B{current_date}T00%3A00%3A00Z%20TO%20*%5D"
#         f"&fq=IsValid%3Atrue&q=*%3A*&start={start}&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)"
#         "&cattext=Marine%20%26%20Port%20Services"
#     )
#     print(f"Scraping {page_url}")

#     courses = scrape_page(page_url)

#     if not courses:
#         print("No more courses found. Exiting.")
#         break

#     all_courses.extend(courses)
#     count += len(courses)
#     print(f"Total courses scraped so far: {count}")

#     start += 24
#     time.sleep(random.uniform(3, 7))  # Randomized delay

# # Write data to a CSV file
# csv_file = "courses.csv"
# csv_columns = ['Title', 'Provider', 'Upcoming Date', 'Course Fee']

# try:
#     with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.DictWriter(file, fieldnames=csv_columns)
#         writer.writeheader()
#         writer.writerows(all_courses)
#     print(f"Data successfully written to {csv_file}")
# except Exception as e:
#     print(f"Error writing to CSV: {e}")


In [116]:
# Randomized delay between requests plus a fresh incognito session per request (this combination managed to scrape all pages)

import csv
import time
import random
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
from datetime import datetime

# Get today's date in the required format (YYYY-MM-DD)
current_date = datetime.today().strftime('%Y-%m-%d')

def setup_driver():
    """Create a Chrome WebDriver that opens a visible incognito window.

    Returns:
        A new ``webdriver.Chrome`` instance, using the driver binary path
        returned by ``ChromeDriverManager().install()``.
    """
    options = webdriver.ChromeOptions()
    # No headless switch is added, so Chrome runs with its normal GUI.
    # The old `options.headless = False` assignment was dropped: that property
    # is deprecated/removed in recent Selenium 4 releases and setting it to
    # False never changed anything. For a headless run, add the
    # "--headless=new" argument instead.
    options.add_argument("--incognito")  # Open in incognito mode
    return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

def scrape_page(url, retries=3):
    """Scrape one search-results page, retrying when an attempt fails.

    Every attempt starts its own browser session via ``setup_driver`` and
    that session is always closed again, whether the attempt succeeded or not.

    Args:
        url: Address of the results page to load.
        retries: Maximum number of attempts before giving up.

    Returns:
        The list returned by ``extract_courses`` for the first successful
        attempt, or an empty list when every attempt fails.
    """
    for attempt in range(retries):
        # Reset per attempt: if setup_driver() itself fails there is no
        # session to close, and a session from an earlier attempt must not
        # be touched again.
        driver = None
        try:
            driver = setup_driver()  # New incognito session for each request
            print(f"Accessing: {url} (Attempt {attempt + 1}/{retries})")
            driver.get(url)

            # Wait (up to 15 s) until at least one result card is present.
            WebDriverWait(driver, 15).until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, 'card-body'))
            )
            print("Page loaded successfully.")
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            return extract_courses(soup)

        except Exception as e:
            # Any failure (driver start-up, navigation, timeout, parsing)
            # is reported and the next attempt is made.
            print(f"Error loading page: {e}. Retrying...")
        finally:
            # Close the browser exactly once per attempt, on success and failure.
            if driver is not None:
                driver.quit()

    print(f"Failed to load {url} after {retries} attempts.")
    return []

def extract_courses(soup):
    """Extract course records from a parsed search-results page.

    Each ``div.card-body`` is treated as one course card. Cards without an
    ``h5.card-title`` are skipped. Any other field that cannot be found is
    reported as ``'N/A'`` rather than raising.

    Args:
        soup: Parsed page exposing the BeautifulSoup ``find_all`` / ``find`` /
            ``find_next_sibling`` API.

    Returns:
        A list of dicts with the keys ``'Title'``, ``'Provider'``,
        ``'Upcoming Date'`` and ``'Course Fee'``, in page order.
    """
    courses = []

    for card in soup.find_all('div', class_='card-body'):
        title_tag = card.find('h5', class_='card-title')
        course_title = title_tag.text.strip() if title_tag else 'N/A'

        if course_title == 'N/A':
            print("Skipping course with no title.")
            continue

        # Provider name is the text of the link inside div.course-provider.
        provider_tag = card.find('div', class_='course-provider')
        provider_link = provider_tag.find('a') if provider_tag else None
        provider = provider_link.text.strip() if provider_link else 'N/A'

        # Upcoming date is the <strong> inside the first list item of the
        # card's flush list group.
        date_list = card.find('ul', class_='list-group-flush')
        date_item = date_list.find('li', class_='list-group-item') if date_list else None
        date_tag = date_item.find('strong') if date_item else None
        upcoming_date = date_tag.text.strip() if date_tag else 'N/A'

        # The fee lives in the card-footer that follows the card body. Each
        # level is checked so a footer without <p>/<strong> yields 'N/A'
        # instead of an AttributeError that would discard the whole page.
        footer = card.find_next_sibling('div', class_='card-footer')
        fee_paragraph = footer.find('p') if footer else None
        fee_tag = fee_paragraph.find('strong') if fee_paragraph else None
        full_fee = fee_tag.text.strip() if fee_tag else 'N/A'

        course_data = {
            'Title': course_title,
            'Provider': provider,
            'Upcoming Date': upcoming_date,
            'Course Fee': full_fee
        }
        print(f"Scraped course: {course_data}")
        courses.append(course_data)

    return courses

# Step by which the `start` offset advances between result pages.
# NOTE(review): presumably the portal's page size — confirm against the site.
PAGE_SIZE = 24

# Scraping stops once this many courses have been collected.
# NOTE(review): presumably the result total shown by the portal when this was
# written — confirm, or replace with a value read from the page.
TARGET_COURSE_COUNT = 95


def build_search_url(start=None):
    """Build the course-search URL for the "Marine & Port Services" category.

    Args:
        start: Result offset for the page. ``None`` omits the ``start``
            parameter entirely, which is how the first page is requested.

    Returns:
        The search URL, filtered to courses whose support period ends on or
        after ``current_date``.
    """
    start_param = "" if start is None else f"&start={start}"
    return (
        "https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html"
        f"?fq=Course_Supp_Period_To_1%3A%5B{current_date}T00%3A00%3A00Z%20TO%20*%5D"
        f"&fq=IsValid%3Atrue&q=*%3A*{start_param}"
        "&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)"
        "&cattext=Marine%20%26%20Port%20Services"
    )


# Base URL for the first page (no `start` offset)
base_url = build_search_url()

all_courses = []
start = PAGE_SIZE  # Offset of the second page
count = 0

# Scrape the first page
print(f"Scraping first page: {base_url}")
courses = scrape_page(base_url)
if courses:
    all_courses.extend(courses)
count += len(courses)
print(f"Total courses scraped: {count}")

# Loop through subsequent pages until the target is reached or a page
# yields no courses.
while count < TARGET_COURSE_COUNT:
    page_url = build_search_url(start)
    print(f"Scraping {page_url}")

    courses = scrape_page(page_url)

    if not courses:
        print("No more courses found. Exiting.")
        break

    all_courses.extend(courses)
    count += len(courses)
    print(f"Total courses scraped so far: {count}")

    start += PAGE_SIZE
    time.sleep(random.uniform(3, 8))  # Randomized delay between page requests

# Export every scraped course to a CSV file, one row per course.
csv_columns = ['Title', 'Provider', 'Upcoming Date', 'Course Fee']
csv_file = "courses.csv"

try:
    # newline='' lets the csv module control line endings itself.
    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_columns)
        writer.writeheader()
        for course_row in all_courses:
            writer.writerow(course_row)
    print(f"Data successfully written to {csv_file}")
except Exception as e:
    # Best-effort export: report the problem instead of raising.
    print(f"Error writing to CSV: {e}")


Scraping first page: https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services
Accessing: https://www.myskillsfuture.gov.sg/content/portal/en/portal-search/portal-search.html?fq=Course_Supp_Period_To_1%3A%5B2024-10-13T00%3A00%3A00Z%20TO%20*%5D&fq=IsValid%3Atrue&q=*%3A*&cat=fq%3DArea_of_Training_text_exact%3A(%22Marine%20%26%20Port%20Services%22)&cattext=Marine%20%26%20Port%20Services (Attempt 1/3)
Page loaded successfully.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no title.
Skipping course with no 