# How to use:
1. Fill in your search queries in the first cell.
2. Set the path to your ChromeDriver executable in the second cell.
3. If you have not installed ChromeDriver, follow the instructions in the third cell.
4. Run the remaining cells in sequence.

In [None]:
# Search terms to scrape, one entry per job role.
# The double quotes are part of each query string and are sent to the site as-is.
queries = [
    '"data analyst"',
    '"data engineer"',
    '"data science"',
    '"machine learning engineer"',
]

In [None]:
# Path to the ChromeDriver executable; it is handed to Service() when the driver is set up.
# Change this to wherever chromedriver lives on your machine.
CHROME_DRIVER_PATH = "/usr/local/bin/chromedriver_mac_arm64/chromedriver"

In [None]:
# !pip install selenium
# !pip install webdriver-manager
# View your Google Chrome browser version: chrome://settings/help
# Download ChromeDriver that corresponds to your Google Chrome browser version: https://sites.google.com/chromium.org/driver/

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import pandas as pd
import tqdm

### Setting up

In [None]:
# Path to the directory to save the output files (URL .txt lists and CSV files).
# It is used as a plain string prefix when file names are built, so keep the trailing slash.
DATA_WRITE_PATH = "./../../Data/jobs/"

In [None]:
# Initialising parameters for Chrome Driver
# Service object built from the chromedriver executable at CHROME_DRIVER_PATH
ser = Service(CHROME_DRIVER_PATH)
# Chrome options object; no options are customised here
op = webdriver.ChromeOptions()

### Retrieving URLs to all search results from a particular query
Input: Query to input into search engine on https://www.mycareersfuture.gov.sg/ 

Output: List of the URLs of all job listings found for the query

In [None]:
def get_urls(query, max_pages=1000):
    """Collect the URLs of all job listings returned for a search query.

    Walks the paginated search results on mycareersfuture.gov.sg starting at
    page 0 and stops at the first page on which no job cards are found.

    Relies on the module-level ``driver`` (a Selenium WebDriver) having been
    created before the call.

    Args:
        query: Search string, inserted as-is into the ``search`` URL parameter.
        max_pages: Upper bound on the number of result pages to visit.

    Returns:
        List of job listing URLs in the order they were found. If a page
        fails, the URLs gathered up to that point are returned.
    """
    start_time = time.time()
    # List to store the URLs of job listings
    all_jobs_links = []

    # Loop over at most `max_pages` pages of search results
    for page_num in range(max_pages):
        try:
            # Navigate to the page for the current search query and page number
            driver.get(f'https://www.mycareersfuture.gov.sg/search?search={query}&sortBy=relevancy&page={page_num}')
            # Find all the job listing links on the page
            links = driver.find_elements(By.XPATH, "//a[@data-testid='job-card-link']")

            # An empty page means we have run past the last page of results
            if not links:
                break

            for link in links:
                href = link.get_attribute("href")
                # Skip anchors without an href: a None entry cannot be written to file later
                if href:
                    all_jobs_links.append(href)

        # Catch Exception rather than using a bare except, so that Ctrl-C
        # (KeyboardInterrupt) still interrupts a long-running scrape.
        except Exception as e:
            print("FAILED")
            print(f"Page {page_num}: {e!r}")
            # The browser is closed on failure (as before); the caller needs a
            # fresh driver before doing any further scraping.
            driver.quit()
            break

    print(f"Job completed in {round(time.time() - start_time)} seconds")
    print(f"{len(all_jobs_links)} job listings found")

    # Return the list of all job listing URLs
    return all_jobs_links

### Scraping Individual Job Postings from each URL
Input: List of URLs

Output: Dataframe with extracted information:  

- job titles, companies, descriptions and URLs of job postings

In [None]:
def scraper(list_of_urls, timeout=10):
    """Scrape title, company and description from each job posting URL.

    Relies on the module-level ``driver`` (a Selenium WebDriver) having been
    created before the call. A URL that fails for any reason is reported
    (error, URL and running failure count are printed) and skipped, so one bad
    page does not abort the whole run.

    Args:
        list_of_urls: Iterable of job posting URLs to visit.
        timeout: Seconds to wait for the job title element to appear on each
            page before the URL is treated as failed.

    Returns:
        pandas DataFrame with the columns "Title", "Company", "Description"
        and "URL", one row per successfully scraped posting.
    """
    # One (title, company, description, url) tuple per successfully scraped posting
    records = []
    # Counter to keep track of any exceptions that occur during scraping
    count = 0

    for url in tqdm.tqdm(list_of_urls):
        try:
            # Load the URL and wait for the job title element to appear
            driver.get(url)
            job_title_element = WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.ID, "job_title"))
            )
            # Extract the job title, employer, and description
            job_title = job_title_element.text
            company = driver.find_element(By.CSS_SELECTOR, "p[data-cy='company-hire-info__company']").text
            job_description_element = driver.find_element(By.ID, "job_description")
            job_description = job_description_element.find_element(By.ID, "description-content").text
        except Exception as e:
            # Best-effort scraping: report the failure and move on to the next URL
            count += 1
            print(e)
            print(url)
            print(count)
            continue

        # Only record a posting once every field has been extracted
        records.append((job_title, company, job_description, url))

    # Convert the scraped job information into a pandas dataframe and return it
    return pd.DataFrame(records, columns=["Title", "Company", "Description", "URL"])
    
# scraper(all_jobs_links)

In [None]:
def write_to_file(save_file_name, list_of_urls):
    """Save a list of URLs to a text file, one URL per line.

    The file is written to
    ``DATA_WRITE_PATH + 'mycareersfuture_job_url_query-<save_file_name>.txt'``
    and overwrites any existing file of that name. The target directory is
    created if it does not exist, so a missing output folder does not discard
    the URLs that were just collected.

    Args:
        save_file_name: Identifier inserted into the output file name.
        list_of_urls: Iterable of URL strings to write.
    """
    import os  # local import so this function does not depend on the notebook's import cell

    path = DATA_WRITE_PATH + f'mycareersfuture_job_url_query-{save_file_name}.txt'

    # dirname is empty when the path has no directory part; makedirs("") would raise
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w') as f:
        f.writelines(url + "\n" for url in list_of_urls)


def read_file(file_name):
    """Load a saved URL list back from disk.

    Reads ``DATA_WRITE_PATH + 'mycareersfuture_job_url_query-<file_name>.txt'``
    and returns its lines with trailing whitespace (including the newline)
    stripped.

    Args:
        file_name: Identifier that was used in the file name when saving.

    Returns:
        List of strings, one per line of the file.
    """
    path = DATA_WRITE_PATH + f'mycareersfuture_job_url_query-{file_name}.txt'
    urls = []
    with open(path) as handle:
        for raw_line in handle:
            urls.append(raw_line.rstrip())
    return urls

In [None]:
# File-name-friendly version of each query: quotes removed, spaces replaced by underscores
file_names = [query.replace('"', '').replace(' ', '_') for query in queries]

for query, file_name in zip(queries, file_names):
    # `driver` has to be a module-level name: get_urls() and scraper() read it as a global.
    # A fresh browser is started for every query.
    driver = webdriver.Chrome(service=ser, options=op)
    try:
        print(f"Scraping website for {query} roles.")

        # 1. Get list of urls
        list_of_urls = get_urls(query)

        # 2. Save result as txt file
        write_to_file(file_name, list_of_urls)
        print(f".txt file created with list of URLS for {query}")

        # 3. Read URL text
        formatted_urls = read_file(file_name)

        # 4. Scrape job information from URLs and create a DataFrame
        df = scraper(formatted_urls)

        # 5. Save DataFrame as a csv file
        df.to_csv(DATA_WRITE_PATH + f"mycareersfuture_query-{file_name}.csv")
        print(f".csv file created for {query}")
    finally:
        # Always close the browser, even if a step above raised, so Chrome
        # processes are not left running.
        driver.quit()