In [1]:
# !pip install selenium
# !pip install webdriver-manager
# View your Google Chrome browser version: chrome://settings/help
# Download ChromeDriver that corresponds to your Google Chrome browser version: https://sites.google.com/chromium.org/driver/

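Useful links:
- Selenium 4 deprecates the `find_element_by_*` commands in favour of `find_element(By.<strategy>, value)`: https://stackoverflow.com/questions/69875125/find-element-by-commands-are-deprecated-in-selenium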

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import re
import pandas as pd
import tqdm

In [3]:
# Local filesystem locations used by the notebook. Both are absolute,
# machine-specific paths and must be edited before running on another computer.
# Location of the ChromeDriver executable; handed to Service(...) when the browser is started.
CHROME_DRIVER_PATH = "C:/Users/ernes/Downloads/chromedriver_win32/chromedriver.exe"
# Output directory. File names are appended with `+`, so the trailing slash is required.
DATA_WRITE_PATH = "C:/Users/ernes/Git/dsa3101-2220-12-ds/Backend/Data/jobs/"

In [4]:
# Start a Chrome session driven by the ChromeDriver binary at CHROME_DRIVER_PATH.
ser = Service(CHROME_DRIVER_PATH)
# Default Chrome options; no arguments are added here.
op = webdriver.ChromeOptions()
# `driver` is the browser handle shared by the scraping cells below.
driver = webdriver.Chrome(service=ser, options=op)

In [None]:
# Search phrase typed into the site's search bar. The inner double quotes are sent as
# part of the text (presumably to request an exact-phrase match — confirm against the site).
query = '"data analyst"'
# Tag embedded in the names of the .txt and .csv files written by this notebook.
save_file_name = "data_analyst"

# Scraping JobStreet for Job-Posting URLs

In [None]:
# Landing page of the job board; the search is submitted from here in the next cell.
website = "https://www.jobstreet.com.sg/"
# Navigate the shared browser session to the landing page.
driver.get(website)

In [None]:
# Locate the keyword search box on the landing page.
search = driver.find_element(By.ID, "searchKeywordsField")

# Clear any text already in the search box (select all, then delete).
search.send_keys(Keys.CONTROL + "a")
search.send_keys(Keys.DELETE)

# Type the query and submit the search.
search.send_keys(query)
search.send_keys(Keys.RETURN)

# Wait up to 10 seconds for the results container to appear. It is located through
# its `data-automation` attribute because the div class names are dynamic.
try:
    jobs = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, "//*[@data-automation='jobListing']"))
    )
except Exception:
    # Close the browser, then re-raise so the failure is reported here instead of
    # surfacing later as an unrelated error when a following cell touches the
    # already-closed driver.
    driver.quit()
    raise

In [None]:
# Crawl state for the pagination loop in the next cell.
# Pagination links of the page being processed (reassigned for every page the loop visits).
next_page_url = []
# URLs that have already been opened.
visited = []
# URLs still to open, seeded with the page the browser is currently showing.
queue = [driver.current_url]

In [None]:
# Crawl every results page reachable through the pagination links and collect
# the URL of each job posting into `all_jobs_links`.
start_time = time.time()
all_jobs_links = []
failed_urls = []  # result pages that could not be scraped

# Every URL that has been visited or is already waiting in the queue. Checking
# only `visited` would let a page be queued again by each page that links to
# it, so it would be opened repeatedly and its job links collected more than once.
seen_urls = set(visited) | set(queue)

while queue:
    # Take the most recently queued page (last in, first out).
    next_url = queue.pop()
    visited.append(next_url)
    print(f"Visiting {next_url}")

    try:
        driver.get(next_url)

        # Wait up to 10 seconds for the results container. It is located through
        # its `data-automation` attribute because the div class names are dynamic.
        jobs = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//*[@data-automation='jobListing']"))
        )

        # Direct children of the container are the job postings on the page.
        jobs_list = jobs.find_elements(By.XPATH, "./*")[:-2]  # The last two results are invalid

        # Each posting's link is the <a href> directly inside its <h1>.
        jobs_links_found = [
            job.find_element(By.TAG_NAME, "h1").find_element(By.XPATH, "a[@href]").get_attribute("href")
            for job in jobs_list
        ]
    except Exception as error:
        # Skip this page but keep the browser session alive so the remaining
        # queued pages can still be processed; the driver is closed in a later cell.
        print(f"FAILED: {next_url} ({type(error).__name__})")
        failed_urls.append(next_url)
        continue

    all_jobs_links.extend(jobs_links_found)

    # Finding new URLs. find_elements returns an empty list when the page has
    # no pagination block (e.g. a single page of results) instead of raising.
    pagination_links = driver.find_elements(By.XPATH, "//*[@data-automation='pagination']/a[@href]")
    next_page_url = [page.get_attribute("href") for page in pagination_links]
    for url in next_page_url:
        if url not in seen_urls:
            seen_urls.add(url)
            queue.append(url)

print(f"Job completed in {round(time.time() - start_time)} seconds")
if failed_urls:
    print(f"{len(failed_urls)} page(s) failed: {failed_urls}")

In [None]:
# Close the browser now that the job-posting URLs have been collected.
# NOTE(review): the job-detail loop further down calls driver.get(...) on this same
# `driver`; on a top-to-bottom run the session is already closed at that point, so a
# new driver has to be started (re-run the driver set-up cell) before that loop.
driver.quit()

In [None]:
# Quick sanity check: how many job-posting URLs were collected, plus the first five of them.
print(f"{len(all_jobs_links)} job sites found")
print(all_jobs_links[:5])

In [None]:
# Persist the collected job-posting URLs, one per line, so the detail-scraping
# stage can be run later from this file.
url_list_path = DATA_WRITE_PATH + f'jobstreet_job_url_query-{save_file_name}.txt'
with open(url_list_path, 'w') as f:
    f.writelines(url + "\n" for url in all_jobs_links)

## Scraping Job Information

In [7]:
# Load the saved job-posting URLs (one per line), dropping trailing whitespace
# such as the newline at the end of each line.
url_list_path = DATA_WRITE_PATH + f'jobstreet_job_url_query-{save_file_name}.txt'
with open(url_list_path) as file:
    all_jobs_links = list(map(str.rstrip, file))

In [8]:
# Matches a single HTML tag (non-greedy, so adjacent tags are matched one at a time).
CLEANR = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Reduce an HTML fragment to plain, single-spaced text.

    Every tag and every ``&nbsp;`` entity is replaced by a space, runs of
    whitespace are collapsed to one space, and the leading/trailing space is
    removed.

    Args:
        raw_html: HTML markup as a string (may be empty).

    Returns:
        The cleaned text. An empty string is returned when the input is empty
        or consists only of tags/whitespace.
    """
    clean_text = re.sub(CLEANR, ' ', raw_html)
    clean_text = re.sub("&nbsp;", " ", clean_text)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    clean_text = re.sub(r'\s+', " ", clean_text)
    # After collapsing, at most one space remains at each end. strip() also
    # copes with an empty result, where indexing [0] / [-1] raised IndexError.
    return clean_text.strip(" ")

In [9]:
# Accumulators filled by the job-detail loop: one entry per scraped posting.
job_titles, job_employers, job_descriptions = [], [], []

In [10]:
# Visit every job posting and record its title, employer and description.
for i in tqdm.trange(len(all_jobs_links)):
    # Defaults used when a page cannot be scraped, so the three result lists
    # always stay aligned row-for-row with all_jobs_links.
    job_title = job_employer = job_description = None

    try:
        driver.get(all_jobs_links[i])

        # Getting job title & employer details.
        # Wait up to 10 seconds for the title block, as the listing cells do.
        title_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, "//*[@data-automation='detailsTitle']"))
        )

        # Descend through wrappers that have exactly one child until reaching
        # the element whose children are the individual detail fields.
        title_children = title_element.find_elements(By.XPATH, "./*")
        while len(title_children) == 1:
            title_element = title_children[0]
            title_children = title_element.find_elements(By.XPATH, "./*")

        # Exactly two children are expected: title first, employer second.
        # Any other count raises ValueError and the posting is recorded as failed.
        job_title, job_employer = [
            cleanhtml(child.get_attribute('outerHTML')) for child in title_children
        ]

        # Getting job description
        description_element = driver.find_element(By.XPATH, "//*[@data-automation='jobDescription']")
        html = description_element.get_attribute("outerHTML")
        job_description = cleanhtml(html)
    except Exception as error:
        # One bad posting must not abort the whole (long) run. Report it
        # without breaking the progress bar and keep its row as None values.
        tqdm.tqdm.write(f"FAILED: {all_jobs_links[i]} ({type(error).__name__})")

    # Recording results
    job_titles.append(job_title)
    job_employers.append(job_employer)
    job_descriptions.append(job_description)

100%|████████████████████████████████████████████████████████████████████████████████| 540/540 [11:03<00:00,  1.23s/it]


In [11]:
# All job pages have been scraped; close the browser session.
driver.quit()

In [12]:
# Assemble the scraped fields into one table: each list is first laid out as a
# labelled row, then the frame is transposed so the labels become the columns
# and every job posting becomes one row.
column_names = ["Title", "Company", "Description", "URL"]
scraped_fields = [job_titles, job_employers, job_descriptions, all_jobs_links]
jobs_df = pd.DataFrame(scraped_fields, index=column_names).T

In [13]:
# Save the scraped table as a CSV file in DATA_WRITE_PATH. `index=False` is not passed,
# so the DataFrame's row index is written as the first (unnamed) column.
jobs_df.to_csv(DATA_WRITE_PATH + f"jobstreet_query-{save_file_name}.csv")