# Scrape full job details for each job

For each job (you can go by URL), please try to get the following pieces of information:
1. Job Description
2. Do they ask additional questions?
3. Company Information (May not exist for some firms)

• Company name

• Industry

• Firm size

• Company description

• Perks and benefits

• Average rating

• Number of reviews

• Any other information

Please get this information for the 1,000 most recent jobs, and report how long the
scraping takes.

# Selenium

In [6]:
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd
from webdriver_manager.chrome import ChromeDriverManager
from selenium import webdriver
import re

In [7]:
# Set up the driver: let webdriver_manager provide the chromedriver binary,
# then start Chrome through a Service wrapping it.
chromedriver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(chromedriver_path))

def get_job_urls_from_page(page_num):
    """Return the job-detail URLs listed on one JobStreet search-results page.

    Loads the listing page ``page_num`` (sorted by listed date) in the shared
    module-level ``driver`` and collects the ``href`` of every job card's link.

    Args:
        page_num: Page number substituted into the listing URL's ``page``
            query parameter.

    Returns:
        A list of URL strings in page order. Cards that have no job link, or
        whose link has no ``href``, are skipped, so the list may be empty.
    """
    URL = f"https://sg.jobstreet.com/jobs?page={page_num}&sortmode=ListedDate"
    driver.get(URL)
    time.sleep(3)  # Wait for the page to load

    page_urls = []
    for job_card in driver.find_elements(By.CSS_SELECTOR, '[data-testid="job-card"]'):
        # find_elements returns an empty list when nothing matches, whereas
        # find_element raises; this lets one link-less card be skipped instead
        # of aborting the whole page.
        link_tags = job_card.find_elements(
            By.CSS_SELECTOR, '[data-automation="job-list-view-job-link"]'
        )
        job_url = link_tags[0].get_attribute('href') if link_tags else None
        if job_url:
            page_urls.append(job_url)

    return page_urls

# Get job URLs from multiple pages, keeping the first occurrence of each URL
# in listing order (the listing is requested sorted by listed date).
TARGET_JOB_COUNT = 1000

job_urls = []
seen_job_urls = set()
page_num = 1
while len(job_urls) < TARGET_JOB_COUNT:
    page_urls = get_job_urls_from_page(page_num)
    if not page_urls:
        # An empty page means there is nothing further to collect; stop
        # rather than requesting pages forever.
        break
    for page_url in page_urls:
        # De-duplicate while collecting so the loop only ends once enough
        # *unique* URLs are held, and so listing order is preserved
        # (list(set(...)) would scramble it before truncation).
        if page_url not in seen_job_urls:
            seen_job_urls.add(page_url)
            job_urls.append(page_url)
    page_num += 1

# The last page may overshoot the target; trim to exactly the target count.
job_urls = job_urls[:TARGET_JOB_COUNT]


In [8]:
# Sanity check: number of job URLs collected above
len(job_urls)

1000

In [10]:
def _attempt(action, default='N/A'):
    """Run a Selenium lookup and return its result, or ``default`` if Selenium raises.

    Only Selenium's own errors (missing element, wait timeout, stale element,
    ...) are treated as "value not available"; anything else propagates so
    real bugs are not silently recorded as 'N/A'.
    """
    # Imported locally so this block does not depend on an extra top-of-file
    # import; selenium is already a dependency of this notebook.
    from selenium.common.exceptions import WebDriverException

    try:
        return action()
    except WebDriverException:
        return default


def _first_text(root, by, selector):
    """Return the stripped text of the first match under ``root``, or 'N/A'."""
    return _attempt(lambda: root.find_element(by, selector).text.strip())


def _all_texts(root, by, selector):
    """Return the non-empty stripped texts of every match under ``root``."""
    def collect():
        stripped = (element.text.strip() for element in root.find_elements(by, selector))
        return [text for text in stripped if text]

    return _attempt(collect, default=[])


def _parse_review_count(text):
    """Return the first integer found in ``text`` (e.g. '1,234 reviews'), or 'N/A'."""
    match = re.search(r'\d[\d,]*', text)
    if match is None:
        return 'N/A'
    # Drop thousands separators so '1,234' is read as 1234 rather than 1.
    return int(match.group(0).replace(',', ''))


def _scrape_company_profile():
    """Return the company-profile fields of the page currently loaded in ``driver``.

    Every field is 'N/A' when the page has no company-profile section; each
    individual field is 'N/A' when only that piece is missing.
    """
    field_names = (
        'Company Name',
        'Industry',
        'Firm Size (number of employees)',
        'Company Description',
        'Perks and Benefits',
        'Average Rating (out of 5)',
        'Number of Reviews',
    )
    company_section = _attempt(
        lambda: driver.find_element(By.CSS_SELECTOR, '[data-automation="company-profile"]'),
        default=None,
    )
    if company_section is None:
        return dict.fromkeys(field_names, 'N/A')

    profile = {}
    profile['Company Name'] = _first_text(company_section, By.CSS_SELECTOR, 'button h4')

    # XPath lookups on an element must start with './/' to stay inside that
    # element; a leading '//' searches the whole document, which could pick
    # up unrelated spans (for example one in the job description that
    # mentions 'employees').
    profile['Industry'] = _first_text(
        company_section,
        By.XPATH,
        ".//span[@class='gepq850 eihuid5b']/span[@class='gepq850 eihuid4z eihuidr'][1]",
    )
    firm_size = _first_text(company_section, By.XPATH, ".//span[contains(text(),'employees')]")
    profile['Firm Size (number of employees)'] = firm_size.replace(" employees", "")

    # NOTE(review): the description and perks selectors are matched against
    # the whole page, as before; confirm whether they can be scoped to the
    # company section.
    description_parts = _all_texts(
        driver, By.CSS_SELECTOR, 'p.gepq850.eihuidcz, p.gepq850.eihuidcb'
    )
    profile['Company Description'] = ' '.join(description_parts) or 'N/A'

    perks = _all_texts(driver, By.CSS_SELECTOR, 'div.gepq850._2l3v4k0')
    profile['Perks and Benefits'] = perks or 'N/A'

    profile['Average Rating (out of 5)'] = _first_text(
        company_section, By.CSS_SELECTOR, '[data-automation="company-profile-review-rating"]'
    )
    review_link_text = _first_text(
        company_section, By.CSS_SELECTOR, '[data-automation="company-profile-review-link"]'
    )
    profile['Number of Reviews'] = _parse_review_count(review_link_text)

    return profile


def get_job_details(job_url, page_load_delay=3, wait_timeout=20):
    """Scrape one job-detail page and return its fields as a dict.

    Loads ``job_url`` in the shared module-level ``driver`` and extracts the
    job title, advertiser, description, employer questions and company
    profile. Any piece that cannot be found is recorded as 'N/A' (or
    ``['N/A']`` for employer questions) instead of raising.

    Args:
        job_url: URL of the job-detail page.
        page_load_delay: Seconds to sleep after navigation before reading the
            page.
        wait_timeout: Maximum seconds to wait for the job description and for
            the employer-questions list to appear.

    Returns:
        A dict with the keys, in order: 'Job Title', 'Advertiser Name',
        'Job URL', 'Job Description', 'Employer Questions', 'Company Name',
        'Industry', 'Firm Size (number of employees)', 'Company Description',
        'Perks and Benefits', 'Average Rating (out of 5)', 'Number of Reviews'.
    """
    driver.get(job_url)
    time.sleep(page_load_delay)  # Wait for the job page to load

    job_details = {
        # Title and advertiser are kept for readability of the output table.
        'Job Title': _first_text(
            driver, By.CSS_SELECTOR, '[data-automation="job-detail-title"]'
        ),
        'Advertiser Name': _first_text(
            driver, By.CSS_SELECTOR, '[data-automation="advertiser-name"]'
        ),
        # Job URL to trace back
        'Job URL': job_url,
    }

    wait = WebDriverWait(driver, wait_timeout)

    job_details['Job Description'] = _attempt(
        lambda: wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[data-automation="jobAdDetails"]'))
        ).text.strip()
    )

    # NOTE: a job without an "Employer questions" section blocks here for the
    # full wait_timeout before falling back to ['N/A']; lower wait_timeout to
    # speed up large scrapes.
    questions = _attempt(
        lambda: _all_texts(
            wait.until(
                EC.presence_of_element_located(
                    (By.XPATH, "//h2[contains(text(), 'Employer questions')]/following-sibling::div//ul")
                )
            ),
            By.XPATH,
            ".//li",
        ),
        default=[],
    )
    job_details['Employer Questions'] = questions or ['N/A']

    job_details.update(_scrape_company_profile())

    return job_details


# Extract details for each job URL, timing the whole run.
job_data = []
scrape_started = time.perf_counter()
try:
    for job_url in job_urls:
        # Append row by row so rows scraped so far are kept if the run is
        # interrupted part-way through.
        job_details = get_job_details(job_url)
        job_data.append(job_details)
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so the elapsed time is always reported and df always reflects the rows
    # collected so far.
    scrape_seconds = time.perf_counter() - scrape_started
    print(
        f"Scraped {len(job_data)} of {len(job_urls)} jobs "
        f"in {scrape_seconds / 60:.1f} minutes"
    )

    # Create DataFrame
    df = pd.DataFrame(job_data)


KeyboardInterrupt: 

In [11]:
# Build the DataFrame from the rows currently held in job_data,
# then preview the first 10 rows.
df = pd.DataFrame(job_data)
df.head(10)

Unnamed: 0,Job Title,Advertiser Name,Job URL,Job Description,Employer Questions,Company Name,Industry,Firm Size (number of employees),Company Description,Perks and Benefits,Average Rating (out of 5),Number of Reviews
0,ICT Trainer cum Executive,Eco-ITS Private Limited,https://sg.jobstreet.com/job/83155692?type=sta...,The Personnel shall be competent to perform th...,[N/A],,Consulting Services,,Eco-ITS Private Limited is a Singapore-based ...,,,
1,Infant Care Centre Principal (part-time only),BRIGHT SPARKS INFANT CARE PTE. LTD.,https://sg.jobstreet.com/job/83156111?type=sta...,Bright Sparks Infant Care is a leading provide...,[N/A],,,,,,,
2,Customer Service Officer | Up to $2900,Recruit Now Singapore Pte Ltd,https://sg.jobstreet.com/job/83146153?type=sta...,Benefits:\nBasic + Multiple allowances + AWS +...,[Which of the following statements best descri...,Recruit Now,Human Resources & Recruitment,11-50,We provide solutions to all recruitment needs...,"[Medical, Miscellaneous allowance, Dental]",,
3,"Kitchen Supervisor – S$ 5,000 (ID: 668534)",PERSOLKELLY Singapore Pte Ltd (Formerly Kelly ...,https://sg.jobstreet.com/job/83153083?type=sta...,Oversee all kitchen operations to ensure effic...,[Which of the following statements best descri...,Persolkelly,Human Resources & Recruitment,51-100,PERSOLKELLY is one of the largest recru...,,2.7,7.0
4,F&B Executive,ATLAS,https://sg.jobstreet.com/job/83142176?type=sta...,About ATLAS Bar Singapore:\nNestled in the hea...,[Which of the following statements best descri...,Atlas Corporation,Hospitality & Tourism,11-50,"ATLAS, a grand lobby and bar celebrating th...","[Medical, Dental, Parking, % of Service Charge]",,
5,[Immediate] Assistant Finance Manager - Basic ...,STAFFKING PTE LTD,https://sg.jobstreet.com/job/83152792?type=sta...,"Summary Benefits:\nUrgent, Immediate Hire\nBas...",[Which of the following statements best descri...,StaffKing Pte Ltd,Human Resources & Recruitment,11-50,"Company Overview In StaffKing, we believe tha...",,5.0,3.0
6,Medical Technologist ( Various Labs Available),ALLIED SEARCH PTE. LTD.,https://sg.jobstreet.com/job/83160783?type=sta...,Competitive Remuneration Package\nGreat Career...,"[What's your expected monthly basic salary?, W...",Allied Search Pte. Ltd.,Human Resources & Recruitment,11-50,ALLIED SEARCH PTE. LTD is a leading ...,,,
7,Sales Assistant,S2-Mart,https://sg.jobstreet.com/job/83157170?type=sta...,Job summary:\nFlexible hours available\nNo exp...,[N/A],,,,,,,
8,Kitchen Assistant - Handle Alcohol (Weekdays O...,Search Personnel Pte Ltd,https://sg.jobstreet.com/job/83159450?type=sta...,Position : Kitchen Assistant - Hand...,[Which of the following statements best descri...,Search Personnel,Human Resources & Recruitment,11-50,Search Personnel is an e...,,5.0,31.0
9,Technician,Cube Payment Services Pte Ltd,https://sg.jobstreet.com/job/83153395?type=sta...,Job summary:\nFlexible hours available\n1 year...,[N/A],,Banking & Financial Services,11-50,Cube Payment Services (“CubePay”) is a pr...,,,


In [12]:
# Number of job rows in the DataFrame
len(df)

854

In [17]:
# Show the rows whose job title was recorded as 'N/A'
df[df['Job Title'] == 'N/A']

Unnamed: 0,Job Title,Advertiser Name,Job URL,Job Description,Employer Questions,Company Name,Industry,Firm Size (number of employees),Company Description,Perks and Benefits,Average Rating (out of 5),Number of Reviews
19,,,https://sg.jobstreet.com/job/83155430?type=sta...,,[N/A],,,,,,,
187,,,https://sg.jobstreet.com/job/83156636?type=sta...,,[N/A],,,,,,,
192,,,https://sg.jobstreet.com/job/83156429?type=sta...,,[N/A],,,,,,,
282,,,https://sg.jobstreet.com/job/83156603?type=sta...,,[N/A],,,,,,,
338,,,https://sg.jobstreet.com/job/83156598?type=sta...,,[N/A],,,,,,,
374,,,https://sg.jobstreet.com/job/83155234?type=sta...,,[N/A],,,,,,,
646,,,https://sg.jobstreet.com/job/83156711?type=sta...,,[N/A],,,,,,,
694,,,https://sg.jobstreet.com/job/83155184?type=sta...,,[N/A],,,,,,,
745,,,https://sg.jobstreet.com/job/83155688?type=sta...,,[N/A],,,,,,,
777,,,https://sg.jobstreet.com/job/83144822?type=sta...,,[N/A],,,,,,,


In [18]:
# Drop rows whose job title is 'N/A' (those job postings no longer exist),
# then count the remaining rows.
df_clean = df[df['Job Title'] != 'N/A']
len(df_clean)

842

In [19]:
# Export the cleaned rows to an Excel workbook, omitting the DataFrame index.
output_path = "854_job_details_with_company_details_cyx.xlsx"
df_clean.to_excel(output_path, index=False)
print("Job details saved to Excel file.")

Job details saved to Excel file.


In [20]:
# Shut down the browser session now that scraping is finished
driver.quit()

37m 10.2s for 200 jobs with job description and additional questions only

3m 26s for 20 jobs with job description and additional questions only

6m 51s for 20 jobs with all job details (with errors)

2m for 10 jobs with all job details (correct)

218m for 854 jobs