In [None]:
# !pip install selenium
# !pip install webdriver-manager
# View your Google Chrome browser version: chrome://settings/help
# Download ChromeDriver that corresponds to your Google Chrome browser version: https://sites.google.com/chromium.org/driver/

In [None]:
import re
import time
from urllib.parse import quote

import pandas as pd
import tqdm
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [None]:
# Location of the ChromeDriver executable; it is handed to selenium's Service below.
# NOTE(review): machine-specific absolute path — adjust to your own installation.
CHROME_DRIVER_PATH = "/usr/local/bin/chromedriver_mac_arm64/chromedriver"
# Directory prefix for every .txt/.csv file this notebook writes or reads. File names
# are appended with plain string concatenation, so the trailing "/" is required.
# NOTE(review): machine-specific absolute path — adjust to your own checkout.
DATA_WRITE_PATH = "/Users/micolechan/Desktop/dsa3101/dsa3101-2220-12-ds/Backend/Data/jobs/"

In [None]:
# Open a Chrome session through the ChromeDriver binary at CHROME_DRIVER_PATH.
# `ser` and `op` are reused by the batch-scraping loop further down, which creates a
# new browser per query and rebinds the global `driver` that get_urls() and scraper() read.
ser = Service(CHROME_DRIVER_PATH)
op = webdriver.ChromeOptions()  # default options; no extra flags are set here
driver = webdriver.Chrome(service=ser, options=op)

In [None]:
def get_urls(query, max_pages=100, wait_seconds=10):
    """Collect job-posting links from MyCareersFuture search results for ``query``.

    Result pages are requested in order, starting at page 0, through the
    module-level ``driver``. Paging stops at the first page on which no job
    card appears within ``wait_seconds``, after ``max_pages`` pages, or when a
    page fails to load. Links gathered before a failure are still returned.

    The browser is never closed here: whoever created ``driver`` owns its
    lifetime, and closing it inside this function would make every later
    ``driver.get`` call fail.

    Args:
        query: Search text. It is percent-encoded before being put in the URL.
        max_pages: Upper bound on the number of result pages to request.
        wait_seconds: Seconds to wait for job cards to show up on each page.

    Returns:
        list[str]: ``href`` values of the job cards, in the order encountered.
    """
    start_time = time.time()
    all_jobs_links = []
    # Encode quotes, spaces, '&', '#', ... so the query cannot corrupt the URL.
    encoded_query = quote(query, safe="")

    for page_num in range(max_pages):
        search_url = (
            "https://www.mycareersfuture.gov.sg/search"
            f"?search={encoded_query}&sortBy=relevancy&page={page_num}"
        )
        try:
            driver.get(search_url)
            try:
                # Wait explicitly (as scraper() does) so a page that has not
                # rendered its cards yet is not mistaken for the end of the results.
                cards = WebDriverWait(driver, wait_seconds).until(
                    EC.presence_of_all_elements_located(
                        (By.XPATH, "//a[@data-testid='job-card-link']")
                    )
                )
            except TimeoutException:
                # No job card appeared in time: treat this as the end of the search.
                break
            hrefs = [card.get_attribute("href") for card in cards]
        except Exception as error:
            # Best effort: report what went wrong and keep what was collected.
            # Catching Exception (not a bare except) lets Ctrl-C / kernel
            # interrupts propagate.
            print(f"FAILED on page {page_num}: {error!r}")
            break

        # get_attribute may return None for an anchor without an href; skip those.
        all_jobs_links.extend(href for href in hrefs if href)

    print(f"Job completed in {round(time.time() - start_time)} seconds")
    print(f"{len(all_jobs_links)} job listings found")

    return all_jobs_links

In [None]:
def scraper(list_of_urls):
    """Visit each job posting and tabulate its title, employer, description and URL.

    Pages are loaded through the module-level ``driver``. When a page fails to
    load or an expected element is missing, that posting is left out and the
    exception, the URL and the running failure total are printed.

    Args:
        list_of_urls: Job-posting URLs to visit.

    Returns:
        pandas.DataFrame with the columns "Title", "Company", "Description"
        and "URL", holding one row per posting that was read successfully.
    """
    titles, employers, descriptions, visited = [], [], [], []
    failures = 0

    for url in tqdm.tqdm(list_of_urls):
        try:
            driver.get(url)
            # Wait up to 10 s for the title node to be present before reading the page.
            title_node = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.ID, "job_title"))
            )
            title_text = title_node.text
            employer_text = driver.find_element(
                By.CSS_SELECTOR, "p[data-cy='company-hire-info__company']"
            ).text
            description_text = (
                driver.find_element(By.ID, "job_description")
                .find_element(By.ID, "description-content")
                .text
            )
        except Exception as e:
            failures += 1
            print(e)
            print(url)
            print(failures)
        else:
            # Record a posting only once all of its fields were read, so the
            # four lists always stay the same length.
            titles.append(title_text)
            employers.append(employer_text)
            descriptions.append(description_text)
            visited.append(url)

    # One list per field -> transpose so that each posting becomes a row.
    column_names = dict(enumerate(("Title", "Company", "Description", "URL")))
    jobs_df = pd.DataFrame([titles, employers, descriptions, visited]).T.rename(
        columns=column_names
    )
    return jobs_df
    
# scraper(all_jobs_links)

In [None]:
def write_to_file(save_file_name, list_of_urls, data_dir=None):
    """Write URLs, one per line, to ``mycareersfuture_job_url_query-<save_file_name>.txt``.

    An existing file with the same name is overwritten.

    Args:
        save_file_name: Query slug inserted into the file name.
        list_of_urls: URL strings to store.
        data_dir: Directory prefix for the file. It is concatenated with the
            file name as-is, so it must end with a path separator. Defaults to
            the module-level ``DATA_WRITE_PATH``.
    """
    base_dir = DATA_WRITE_PATH if data_dir is None else data_dir
    target = base_dir + f'mycareersfuture_job_url_query-{save_file_name}.txt'
    # Explicit encoding keeps the file identical across platforms/locales.
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(url + "\n" for url in list_of_urls)


def read_file(file_name, data_dir=None):
    """Read back the URL list stored in ``mycareersfuture_job_url_query-<file_name>.txt``.

    Args:
        file_name: Query slug that was used when the file was written.
        data_dir: Directory prefix for the file. It is concatenated with the
            file name as-is, so it must end with a path separator. Defaults to
            the module-level ``DATA_WRITE_PATH``.

    Returns:
        list[str]: One entry per non-blank line, with trailing whitespace
        (including the newline) removed.
    """
    base_dir = DATA_WRITE_PATH if data_dir is None else data_dir
    # Build the path from the `file_name` parameter; the previous code referenced
    # an undefined `save_file_name` and raised NameError on a fresh kernel.
    source = base_dir + f'mycareersfuture_job_url_query-{file_name}.txt'
    with open(source, encoding='utf-8') as file:
        stripped_lines = (line.rstrip() for line in file)
        # Drop blank lines so they do not turn into empty URLs downstream.
        urls = [url for url in stripped_lines if url]
    return urls

In [None]:
# Search phrases to scrape; the double quotes are part of the query text.
queries = ['"data analyst"','"data engineer"', '"data science"', '"machine learning engineer"']

# File-name slug for each query: quotes removed, spaces replaced by underscores.
file_names = [query.replace('"', '').replace(' ', '_') for query in queries]

# The setup cell may already have opened a browser bound to `driver`. The loop
# below rebinds that name, so close the old session first instead of orphaning it.
try:
    driver.quit()
except Exception:
    # `driver` is not defined or its session is already gone: nothing to clean up.
    pass

for index, query in enumerate(queries):
    file_name = file_names[index]
    # Fresh browser per query; get_urls() and scraper() read this global `driver`.
    driver = webdriver.Chrome(service=ser, options=op)
    try:
        print(f"Scraping website for {query} roles.")

        # 1. Get list of urls
        list_of_urls = get_urls(query)

        # 2. Save result as txt file
        write_to_file(file_name, list_of_urls)
        print(f".txt file created with list of URLS for {query}")

        # 3. Read the URLs back from the txt file and scrape every posting
        formatted_urls = read_file(file_name)
        df = scraper(formatted_urls)
        df.to_csv(DATA_WRITE_PATH + f"mycareersfuture_query-{file_name}.csv")
        print(f".csv file created for {query}")
    finally:
        # Always release the browser, even when one of the steps above raised.
        driver.quit()

In [None]:
# TEST FOR ONE QUERY 
# query = '"data analyst"'
# save_file_name = "data_analyst"

# list_of_urls = get_urls(query)
# write_to_file(save_file_name, list_of_urls)
# print(f".txt file created with list of URLS for {query}")
# formatted_urls = read_file(save_file_name)
# df = scraper(formatted_urls)
# df.to_csv(DATA_WRITE_PATH + f"mycareersfuture_query-{save_file_name}.csv")
# print(f".csv file created for {query}")