In [None]:
# install required packages 
#!pip install beautifulsoup4 lxml selenium webdriver-manager

In [30]:
# import necessary modules
import math
import re
import time
from csv import writer

import requests
from bs4 import BeautifulSoup
from lxml import etree as et
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# define job and location search keywords
# Values are written query-string ready: '+' stands in for a space.
# Further job candidates: 'Data+Analyst', 'Product+Analyst', 'BI+Analyst' (check also Sweden and France).
job_search_keywords = ['Data+Scientist']
# Further location candidates: 'Los+Angeles', 'Chicago' (just pick the 3 largest cities in each country).
location_search_keywords = ['New+York']

# Both the plural and the singular spellings are defined so that a cell using
# either one runs on a fresh kernel instead of raising NameError.
job_search_keyword = job_search_keywords
location_search_keyword = location_search_keywords

# define base and pagination URLs
base_url = 'https://www.indeed.com'
# Placeholders, in order: job keyword, location keyword, result offset (`start`).
# NOTE: the variable name is misspelled ("paginaton"); it is left unchanged so
# existing references to it keep resolving.
paginaton_url = "https://www.indeed.com/jobs?q={}&l={}&radius=35&start={}"


In [51]:
# Function to get DOM from given URL
def get_dom(url):
    """Load ``url`` in the shared browser and return the page as an lxml tree.

    Uses the module-level ``driver``. After navigating, it pauses for a fixed
    3 seconds to give the page time to render, then round-trips the page source
    through BeautifulSoup's ``html.parser`` before handing the markup to
    ``lxml.etree.HTML``.
    """
    driver.get(url)
    # Fixed pause rather than an explicit wait: gives the page time to load.
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return et.HTML(str(soup))

# Function to extract job link
def get_job_link(job):
    """Return the href of the first ``h2 > a`` under a job card.

    Falls back to the string 'Not available' when the lookup fails for any
    reason (no match, or ``job`` cannot be queried).
    """
    try:
        hrefs = job.xpath('./descendant::h2/a/@href')
        link = hrefs[0]
    except Exception:
        link = 'Not available'
    return link

# Function to extract job description
def get_job_desc(job_link):
    """Fetch a job's detail page and return its description text.

    The page is loaded via ``get_dom`` (errors from that call propagate). All
    text nodes under the element with id ``jobDescriptionText`` are joined with
    single spaces and stripped. Returns 'Not available' when no text nodes are
    found or the lookup itself fails.
    """
    job_dom = get_dom(job_link)
    try:
        text_nodes = job_dom.xpath('//*[@id="jobDescriptionText"]//text()')
        if not text_nodes:
            return 'Not available'
        return " ".join(text_nodes).strip()
    except Exception:
        return 'Not available'

# Function to extract job title
def get_job_title(job):
    """Return the text of the first ``h2 > a > span`` under a job card.

    Any failure (no match, or ``job`` cannot be queried) yields the string
    'Not available'.
    """
    try:
        return job.xpath('./descendant::h2/a/span/text()')[0]
    except Exception:
        return 'Not available'

# Function to extract the company name
def get_company_name(job):
    """Return the first text node of the card's ``data-testid="company-name"`` span.

    Any failure (no match, or ``job`` cannot be queried) yields the string
    'Not available'.
    """
    name_xpath = './/span[@data-testid="company-name"]/text()'
    try:
        matches = job.xpath(name_xpath)
        return matches[0]
    except Exception:
        return 'Not available'

# Function to extract the company location
def get_company_location(job):
    """Return the first text node of the card's ``data-testid="text-location"`` div.

    Any failure (no match, or ``job`` cannot be queried) yields the string
    'Not available'.
    """
    location_xpath = './/div[@data-testid="text-location"]/text()'
    try:
        matches = job.xpath(location_xpath)
        return matches[0]
    except Exception:
        return 'Not available'

# Function to extract salary
def get_salary(job_link):
    """Fetch a job's detail page and return its salary / job-type text.

    The page is loaded via ``get_dom`` (errors from that call propagate). All
    text nodes under the element with id ``salaryInfoAndJobType`` are joined
    with single spaces and stripped. Returns 'Not available' when no text nodes
    are found or the lookup itself fails.
    """
    job_dom = get_dom(job_link)
    try:
        text_nodes = job_dom.xpath('//*[@id="salaryInfoAndJobType"]//text()')
        if not text_nodes:
            return 'Not available'
        return " ".join(text_nodes).strip()
    except Exception:
        return 'Not available'

# Function to get total number of pages
def get_total_pages(job_keyword, location_keyword, jobs_per_page=15):
    """Return how many result pages Indeed reports for a search.

    Loads the first results page with the module-level ``driver``, waits up to
    10 seconds for the job-count banner, parses the count out of the banner
    text and divides it by ``jobs_per_page``. The browser is left open for the
    caller to keep using.

    Args:
        job_keyword: Query-string-ready job keyword, e.g. ``'Data+Scientist'``.
        location_keyword: Query-string-ready location, e.g. ``'New+York'``.
        jobs_per_page: Number of listings assumed per results page.

    Returns:
        The page count, rounded up. 0 if the banner does not appear in time,
        its text contains no number, or any other error occurs while reading it.
    """
    # URL of Indeed job search
    url = f"https://www.indeed.com/jobs?q={job_keyword}&l={location_keyword}"
    driver.get(url)

    try:
        # Wait for the element containing the job count to appear
        job_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "jobsearch-JobCountAndSortPane-jobCount")]'))
        )

        # Extract the text from the element
        job_count_text = job_count_element.text
        print(f"Job count text: {job_count_text}")
        job_count = _parse_job_count(job_count_text)

        # Calculate the total number of pages
        total_pages = math.ceil(job_count / jobs_per_page)
        print(f"Total pages: {total_pages}")

        return total_pages

    except Exception as e:
        # Best effort: report the problem and let the caller skip this search.
        print(f"Error extracting job count: {e}")
        return 0  # Return 0 if there's an error


def _parse_job_count(job_count_text):
    """Extract the job count from banner text such as '300+ jobs' or '1,234 jobs'.

    The last number in the text is used, so the count is found whether or not
    it is followed by a '+'. Thousands separators (commas) are ignored.

    Raises:
        ValueError: If the text contains no digits.
    """
    numbers = re.findall(r'\d[\d,]*', job_count_text)
    if not numbers:
        raise ValueError(f"no job count found in {job_count_text!r}")
    return int(numbers[-1].replace(',', ''))

In [None]:
# Initialize Chrome webdriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Placeholder that get_job_link() returns when a job card has no link.
missing_value = 'Not available'

try:
    # Open URL
    driver.get("https://www.indeed.com/q-USA-jobs.html")

    # Open a CSV file to write the job listings data
    with open('indeed_jobs1.csv', 'w', newline='', encoding='utf-8') as f:
        theWriter = writer(f)
        heading = ['page', 'job_link', 'search_keyword', 'search_location', 'job_title', 'company_name', 'company_location', 'salary', 'job_description']
        theWriter.writerow(heading)

        # Iterate over the keyword lists defined in the configuration cell.
        for job_keyword in job_search_keyword:
            for location_keyword in location_search_keyword:
                print(f"Searching for: {job_keyword} in {location_keyword}")

                # Find total number of pages
                total_pages = get_total_pages(job_keyword, location_keyword)
                print(f"Total pages found: {total_pages}")  # Print total pages

                for page_no in range(total_pages):  # Modify range to get more pages if needed
                    print(f"Fetching Page number: {page_no + 1}")  # Display page number (1-based)
                    # The `start` offset advances by 10 per page.
                    url = f"https://www.indeed.com/jobs?q={job_keyword}&l={location_keyword}&start={page_no * 10}"
                    page_dom = get_dom(url)

                    # Extract jobs from current page
                    jobs = page_dom.xpath('//div[@class="job_seen_beacon"]')
                    print(f"Jobs found on page {page_no + 1}: {len(jobs)}")  # Print number of jobs found

                    # Process each job
                    for job in jobs:
                        relative_link = get_job_link(job)
                        job_title = get_job_title(job)
                        company_name = get_company_name(job)
                        company_location = get_company_location(job)

                        if relative_link == missing_value:
                            # No link on the card: there is no detail page to load, so
                            # don't build a bogus URL and navigate the browser to it.
                            job_link = missing_value
                            salary = missing_value
                            job_desc = missing_value
                        else:
                            job_link = base_url + relative_link
                            # NOTE: each of these two calls loads the detail page itself.
                            salary = get_salary(job_link)
                            job_desc = get_job_desc(job_link)  # Extract job description for this job

                        record = [page_no + 1, job_link, job_keyword, location_keyword, job_title, company_name, company_location, salary, job_desc]

                        print(page_no + 1, job_link)  # Print page number (1-based index) and job link
                        theWriter.writerow(record)  # Write the record to CSV
finally:
    # Closing the web browser, even when scraping stops early because of an error
    driver.quit()


Searching for: Data Scientist in New York
Job count text: 300+ jobs
Total pages: 20
Total pages found: 20
Fetching Page number: 1
Jobs found on page 1: 15
1 https://www.indeed.com/rc/clk?jk=d1b96df5f19292fd&bb=e2ox52x9J55lgqIdlzHc-SLxLWai7SRtP__VmFCr8ZvEZMU7x_d4LnACCWr8W9-zHoQ2MBEJorECjzNT2W0_-2VtTXYakIOnnNlN42uGvS556fefqs6Ajg%3D%3D&xkcb=SoBn67M37HZDneygNx0LbzkdCdPP&fccid=a5b4499d9e91a5c6&vjs=3
1 https://www.indeed.com/rc/clk?jk=be8ea87cdf172e4a&bb=e2ox52x9J55lgqIdlzHc-bIFOeoRdU_OHa6_UhLhXC_bu964OBbRnzKQLKKkQu3L4bcEPeAF6ua0ZvF82PLWOZ4YzK5nezfGFGAQ5WtO8E4yHu9qO7AkUg%3D%3D&xkcb=SoDT67M37HZDneygNx0KbzkdCdPP&fccid=a5b4499d9e91a5c6&vjs=3
1 https://www.indeed.com/rc/clk?jk=99eba59d82a9848e&bb=e2ox52x9J55lgqIdlzHc-WG2_bXUxEMALMybEYImhENtpV7W5wkz9ZffSIqu23TxW5KoOnnmpVW34NK4UxLaNrMq39e8ZqU4TW25i-IaPcBDikuTHZH1jIzpvKK6mTX8&xkcb=SoBO67M37HZDneygNx0JbzkdCdPP&fccid=7d7b563c6a3a9653&vjs=3
1 https://www.indeed.com/rc/clk?jk=0d47ada5c09e40f9&bb=e2ox52x9J55lgqIdlzHc-e35g1hBDzKGeiWVRzxIgyAFga-P7CzjmTXPd

In [None]:
# NOTE: this whole cell is a bare triple-quoted string, so none of the code inside it runs.
# Before re-enabling it, be aware that as written it
#   - calls `etree.tostring`, but the imports cell binds lxml's etree to the name `et`;
#   - reads `page_no`, which this cell never assigns;
#   - starts another Chrome session and never calls `driver.quit()` on it.
'''
# for debugging purposes, it might sometimes be good to look at the DOM 
# you can also get an idea of which xpaths to use by inspecting elements using developer mode 

# initialize Chrome webdriver using ChromeDriverManager
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# open URL 
driver.get("https://www.indeed.com/q-USA-jobs.html?vjk=823cd7ee3c203ac3")
with open('indeed_jobs1_dom.txt', 'w', encoding='utf-8') as f:
    url = paginaton_url.format(job_search_keyword[0], location_search_keyword[0], page_no)
    time.sleep(3)
    page_dom = get_dom(url)  # Get the DOM of the page

    # Convert the DOM object to a string with pretty formatting
    dom_html = etree.tostring(page_dom, pretty_print=True, encoding='unicode')

    # Write the HTML string to the file
    f.write(dom_html)

# Save the full page DOM to a file for manual inspection
with open('indeed_full_page.html', 'w', encoding='utf-8') as f:
    time.sleep(4)
    page_source = driver.page_source
    f.write(page_source)
'''

In [None]:
# write code here to automate saving for different countries, cities, and job entries, and so on 

In [None]:
# ideas for visualization 
# number of listings per location
# average salary for each position 
# average salary by job AND location 

# keyword extraction
# frequency differences for different job titles 
# tech stack keywords 