In [1]:
# library here
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [2]:
# --- search parameters ---
# occupation to search for; words are joined with "+" for the query string
# (values used so far: data+engineer, data+analyst, data+scientist)
occupation = "data+scientist"

# location to search in, written with the same "+" convention
locate = "united+states"

# full SimplyHired search URL built from the two parameters above
url = f'https://www.simplyhired.com/search?q={occupation}&l={locate}'

# --- browser setup ---
# Selenium drives Chrome through a ChromeDriver executable that must be downloaded beforehand.
# The path below is machine-specific: replace it with the location of chromedriver.exe on your machine.
service = Service(r"C:\Users\Owner\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe")  # r"" keeps the backslashes literal

# start a Chrome session controlled by Selenium
driver = webdriver.Chrome(service=service)

# load the search results page in that browser session
driver.get(url)

In [3]:
# module-level accumulator: get_jobs() appends one dict per job post to this list
# (re-running this cell empties it again)
job_list = []

# function to collect information in each job post
# keys that are filled from the job's own detail page (the other keys come from the result card)
_DETAIL_FIELDS = ("work_style", "benefit", "qualification", "full description")


def _find_text(root, *steps):
    """Follow a chain of ``find(tag, attrs)`` lookups and return the stripped text.

    Each step is a ``(tag_name, attrs)`` pair applied to the result of the
    previous step. Returns None as soon as one lookup finds nothing.
    """
    node = root
    for tag_name, attrs in steps:
        node = node.find(tag_name, attrs)
        if node is None:
            return None
    return node.text.strip()


def _join_item_texts(root, section_step, item_step):
    """Return the comma-separated texts of all items inside a section, or None if the section is absent."""
    section = root.find(*section_step)
    if section is None:
        return None
    return ", ".join(item.text.strip() for item in section.find_all(*item_step))


def _back_to_listing():
    """Go back to the job listing page and wait until its job list is present again."""
    driver.back()
    WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))


def get_jobs():
    """Collect every job post on the currently loaded listing page into ``job_list``.

    For each ``<li class="css-0">`` in the job list, the card fields
    (company_name, title, location, summary, salary, star) are read from the
    listing HTML. The browser then opens the job's detail page to read
    work_style, benefit, qualification and the full description, and goes back
    to the listing page afterwards. Any field that cannot be found is None.

    Side effects: appends one dict per list item to the module-level
    ``job_list`` and navigates the module-level ``driver``.

    Raises:
        AttributeError: the job list container is not in the current page.
        TimeoutException: the listing page does not reappear within 15 seconds
            after going back from a detail page.
    """
    # parse the HTML of the page currently shown in the browser
    listing_soup = soup(driver.page_source, 'html.parser')

    # fail with a readable message instead of "'NoneType' object has no attribute 'find_all'"
    job_container = listing_soup.find("ul", {'class': 'css-13ia03s'})
    if job_container is None:
        raise AttributeError("job list <ul class='css-13ia03s'> was not found in the current page")

    base_url = "https://www.simplyhired.com"

    for job in job_container.find_all('li', {'class': 'css-0'}):
        job_post = {}

        # company name: drop the separator dash, then strip again so whitespace
        # that sat next to the dash does not stay at the edges
        company_name = _find_text(job, ('span', {'class': "css-lvyu5j"}))
        if company_name is not None:
            company_name = re.sub(r'&nbsp;|—', '', company_name).strip()
        job_post["company_name"] = company_name

        # remaining fields shown on the result card
        job_post["title"] = _find_text(job, ('a', {'class': "chakra-button css-1djbb1k"}))
        job_post["location"] = _find_text(job, ('span', {'class': "css-1t92pv"}))
        job_post["summary"] = _find_text(job, ('p', {'class': 'chakra-text css-jhqp7z'}))
        job_post["salary"] = _find_text(job, ('p', {'class': 'chakra-text css-1g1y608'}))
        job_post["star"] = _find_text(job, ('span', {'class': 'css-epvm6'}), ("span", {"class": "css-0"}))

        # a list item without a title link has no detail page to visit:
        # keep what the card provided and leave the detail fields empty
        title_link = job.find("a", {"class": "chakra-button css-1djbb1k"})
        detail_url = title_link.get("href") if title_link is not None else None
        if not detail_url:
            for field in _DETAIL_FIELDS:
                job_post[field] = None
            job_list.append(job_post)
            continue

        # move to the job's own page using Selenium
        driver.get(base_url + detail_url)

        # wait for the detail content; a page that never shows it (for example an
        # unavailable posting) must not abort the whole run, so parse whatever loaded
        try:
            WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "span[data-testid='detailText']")))
        except TimeoutException:
            print(f"Timed out waiting for job details: {base_url + detail_url}")

        detail_soup = soup(driver.page_source, 'html.parser')

        # check if the detail job post is unavailable
        unavailable_message = _find_text(
            detail_soup,
            ("div", {"class": "css-1uus8yr"}),
            ("p", {"class": "chakra-text css-c883ag"}),
        )
        if unavailable_message and "unavailable" in unavailable_message.lower():
            # blank every column so the row can be dropped later
            for field in job_post:
                job_post[field] = None
            for field in _DETAIL_FIELDS:
                job_post[field] = None

            _back_to_listing()
            job_list.append(job_post)
            continue  # move to next job post

        # work style (job type) shown in the job details section
        job_post["work_style"] = _find_text(
            detail_soup,
            ('span', {'data-testid': 'viewJobBodyJobDetailsJobType'}),
            ("span", {"data-testid": "detailText"}),
        )

        # benefits and qualifications are lists of items joined into one string
        job_post["benefit"] = _join_item_texts(
            detail_soup,
            ("div", {'data-testid': 'viewJobBodyJobBenefits'}),
            ("span", {"data-testid": "viewJobBenefitItem"}),
        )
        job_post["qualification"] = _join_item_texts(
            detail_soup,
            ("div", {'data-testid': 'viewJobQualificationsContainer'}),
            ("span", {"data-testid": "viewJobQualificationItem"}),
        )

        # full job description
        job_post["full description"] = _find_text(detail_soup, ('div', {'class': 'css-cxpe4v'}))

        # return to the job listing page before handling the next job
        _back_to_listing()

        # extra pause after the listing page is back
        time.sleep(5)

        # store the finished record
        job_list.append(job_post)

In [None]:
# collect the job posts of the first results page, which is the page the browser currently shows
get_jobs()

# report progress for page 1; `url` is the search URL that was opened for this page
print(f"Collecting data from page 1, Current URL: {url}")

# get current page number from the pagination
def get_current_page_number():
    """Return, as an int, the text of the first <span aria-current='true'> on the page.

    The notebook treats that element as the highlighted entry of the pagination.
    """
    active_marker = driver.find_element(By.XPATH, "//span[@aria-current='true']")
    return int(active_marker.text)

# pagination state: URL of the last page handled, consecutive same-URL hits, retries used
previous_url = driver.current_url
same_url_counter = 0
retry_counter = 0

while True:  # keep moving through the result pages until no further page can be opened
    try:
        # the page to open is the one after the page currently highlighted
        current_page = get_current_page_number()
        page_number = current_page + 1

        # from page 10 onward click the numbered link; before that use the "Next page" link
        if page_number >= 10:
            next_button = driver.find_element(By.XPATH, f"//a[@aria-label='page {page_number}']")
        else:
            next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")

        # click the link once it is clickable
        WebDriverWait(driver, 15).until(EC.element_to_be_clickable(next_button))
        next_button.click()

        # wait for the job list of the new page, then pause a little longer
        WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))
        time.sleep(5)

        # if the URL did not change, do not scrape the same page again;
        # give up after three such hits in a row
        current_url = driver.current_url
        if current_url == previous_url:
            print(f"Page {page_number}, Same URL: {current_url}. Skipping to next page")
            same_url_counter += 1
            if same_url_counter >= 3:
                print("Reached the threshold of same URLs, exiting loop")
                break
            continue

        # collect job information from this page and report progress
        get_jobs()
        print(f"Collecting data from page {page_number}, Current URL: {driver.current_url}")

        # a page was scraped successfully: reset both counters and remember its URL
        same_url_counter = 0
        previous_url = current_url
        retry_counter = 0

        # pause before moving on to the next page
        time.sleep(5)

    except Exception as e:
        print(f"Error occurred: {e}")

        # a second error in a row is treated as the end of the results
        if retry_counter != 0:
            print("No More Additional Pages")
            break

        # first error: go back one page and try once more
        # (added because errors occurred when reaching page 50)
        print("Back to the previous page and retrying!")
        driver.back()
        time.sleep(5)
        retry_counter += 1



Collecting data from page 1, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states
Collecting data from page 2, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states&cursor=ABQAAQAUAAAAAAAAAAAAAAACQIpgMwEAEbdnqdQMGGIJD8bMmyGUR3Gkn%2BV3Et35PJ9MeiWh68HYU0yFMp533InHuk4KKTJmeouTWgdvaMQ%3D
Collecting data from page 3, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states&cursor=ABQAAgAoAAAAAAAAAAAAAAACQIpgMwEBAQsAk1EQs6CUjQP%2FF6%2FiLxU19%2F1h0qmMlqruElNQvIbTrXFlXEVanCSh%2FaCPGutCSSUG4bSeMYQIqo85ET8xItMW7XC49TpW3%2Fkb7DnFK2S9mXgtbLsSKyalWRBeTQ%3D%3D
Collecting data from page 4, Current URL: https://www.simplyhired.com/search?q=data+scientist&l=united+states&cursor=ABQAAwA8AAAAAAAAAAAAAAACQIpgMwEBARcArMs%2FH3wfTm8IWh2FJ83lc5RXGdx6I3alWJqEhEGHuzGfsoPhWfgWpGkf880pLhg2HTG19CguuU0ob8XgtFJY9yrD7yZFDHQ%2FmSCKHUgZW7t1NyBwNrVhSSVf7UGsyFAmr3CUZ0NKXJKEX%2Fc8cG0WQN0YtU7LoTb2RyjeXgM%3D
Collecting data from page 5, Current U

In [None]:
# # get the job post in the first page
# get_jobs()

# # display the current page number and URL
# print(f"Collecting data from page 1, Current URL: {url}")

# i=2

# previous_url = driver.current_url
# same_url_counter = 0

# while True: # collect all job information in all pages
#     try:
#         # move to next page
#         #this line searches for the anchor (<a>) tag that has an aria-label attribute set to 'Next page'
#         # the "next page" button is identified as [aria-label='Next page'] and is placed within [a] tag
#         # next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")  # Bug found: on reaching page 10, clicking "Next page" goes back to page 1.
#         # Instead of the "Next page" link, click the numbered page link from page 10 onward.
#         if 10 <= i:
#             next_button = driver.find_element(By.XPATH, f"//a[@aria-label='page {i}']")
#         else:
#             next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")
        
#         # wait until the button is clickable
#         WebDriverWait(driver, 3).until(EC.element_to_be_clickable(next_button))
        
#         # click the next button
#         next_button.click()
            
        
#         # wait until the next page is completely loaded
#         # this condition checks if an element, identified by the specified XPath ("//ul[@class='css-13ia03s']"), is present on the page
#         # What I want to collect is job posts and they are identified as [class='css-13ia03s'] in ul tag
#         WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))

#         time.sleep(3)

#         # skip if visited the same url
#         current_url = driver.current_url
#         if current_url == previous_url:
#             print(f"Page {i}, Same URL: {current_url}. Skipping to next page")
#             same_url_counter += 1
#             if same_url_counter >= 3:
#                 print("Reached the threshold of same URLs")
#                 break
#             previous_url = current_url
#             i += 1
#             continue

        
#         # get all job information in the this page
#         get_jobs()
            
        
#         # wait for a little bit
#         time.sleep(1)

#         # display the current page number and URL
#         print(f"Collecting data from page {i}, Current URL: {driver.current_url}")

#         i+=1
#         same_url_counter = 0
#         previous_url = current_url

#         # # If we reach the 3rd page, stop the loop
#         # if i > 3: # adjust by yourself
#         #     break
        
#     except Exception as e:
#         print("No More Additional Pages", e)
#         break # if there is no next page, we get outside of the loop 

In [None]:
# # get the job post in the first page
# get_jobs()

# # display the current page number and URL
# print(f"Collecting data from page 1, Current URL: {url}")

# i = 2

# last_url = driver.current_url

# while True:  # collect all job information in all pages
#     try:
#         # move to next page
#         # this line searches for the anchor (<a>) tag that has an aria-label attribute set to 'Next page'
#         # the "next page" button is identified as [aria-label='Next page'] and is placed within [a] tag
#         if 10 <= i:
#             next_button = driver.find_element(By.XPATH, f"//a[@aria-label='page {i}']")
#         else:
#             next_button = driver.find_element(By.XPATH, "//a[@aria-label='Next page']")
        
#         # wait until the button is clickable
#         WebDriverWait(driver, 3).until(EC.element_to_be_clickable(next_button))
        
#         # click the next button
#         next_button.click()

#         # wait until the next page is completely loaded
#         # this condition checks if an element, identified by the specified XPath ("//ul[@class='css-13ia03s']"), is present on the page
#         # What I want to collect is job posts and they are identified as [class='css-13ia03s'] in ul tag
#         WebDriverWait(driver, 3).until(EC.presence_of_element_located((By.XPATH, "//ul[@class='css-13ia03s']")))

#         # get all job information on this page
#         get_jobs()

#         # check if the current URL is the same as the last URL
#         current_url = driver.current_url
#         if current_url == last_url:
#             print(f"Same URL found. Skipping to next page.")
#             i += 1  # If the URL is the same, increment i to move to the next page
        
#         # update last_url with the current URL for next comparison
#         last_url = current_url

#         # wait for a little bit
#         time.sleep(1)

#         # display the current page number and URL
#         print(f"Collecting data from page {i}, Current URL: {driver.current_url}")

#         # increment the page counter
#         i += 1

#     except Exception as e:
#         print("No More Additional Pages", e)
#         break  # if there is no next page, we get outside of the loop

In [None]:
# build the table from the scraped records and keep only the rows that have a company name
# (get_jobs stores unavailable postings with every field set to None, so they are removed here)
df = pd.DataFrame(job_list).dropna(subset=['company_name'])
df.head()

In [None]:
# summarize the columns of the scraped table (dtypes and non-null counts)
df.info()

In [None]:
# show rows that repeat an earlier row in every column (the first occurrence is not flagged)
df[df.duplicated()]

In [None]:
# save the table to a CSV file in the working directory, without the index column
df.to_csv("SimplyHired_webscraping_data_scientist_3_12.csv", index=False)