In [None]:
# install required packages (uncomment and run once per environment)
# %pip install beautifulsoup4 lxml selenium webdriver-manager requests

In [9]:
# import necessary modules
import math
import re
import time
from csv import writer

import requests
from bs4 import BeautifulSoup
from lxml import etree as et
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [13]:
# define job and location search keywords
job_search_keywords = ['Data+Analyst', 'Data+Scientist', 'Product+Analyst', 'BI+Analyst']
location_search_keywords = ['New+York, NY', 'Los Angeles, CA', 'Chicago, IL'] #'New York' will just search the entire state 

# define base and pagination URLs
base_url = 'https://www.indeed.com'
paginaton_url = "https://www.indeed.com/jobs?q={}&l={}&radius=35&start={}" # is this one actually used later? 

# check also Sweden and France later
france_url = 'https://www.indeed.fr' 
italy_url = 'https://it.indeed.com'
sweden_url = 'https://se.indeed.com'
# the 3 largest cities in each country 
french_cities = ['Paris (75)', 'Marseille (13)', 'Lyon (69)']
italian_cities = ['Roma, Lazio', 'Milano, Lombardia', 'Napoli, Campania']
swedish_cities = ['Stockholm', 'Göteborg', 'Malmö']


In [14]:
# function to get DOM from given URL
'''
def get_dom(url):
    driver.get(url)
    #WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "jobDescriptionText")))
    time.sleep(3)  # ensure page loads
    page_content = driver.page_source
    product_soup = BeautifulSoup(page_content, 'html.parser')
    dom = et.HTML(str(product_soup))
    return dom
'''

def get_dom(url):
    global driver
    try:
        driver.get(url)
        time.sleep(3)  # Ensure page loads
        page_content = driver.page_source
        product_soup = BeautifulSoup(page_content, 'html.parser')
        dom = et.HTML(str(product_soup))
        return dom
    except WebDriverException as e:
        print("WebDriver disconnected, restarting the browser:", e)
        driver.quit()  # close current session
        driver = initialize_driver()  # restart the driver
        return None
        
# function to extract job link
def get_job_link(job):
    try:
        return job.xpath('./descendant::h2/a/@href')[0]
    except Exception:
        return 'Not available'

# function to extract job description
def get_job_desc(job_link):
    job_dom = get_dom(job_link)
    try:
        job_desc = job_dom.xpath('//*[@id="jobDescriptionText"]//text()')
        return " ".join(job_desc).strip() if job_desc else 'Not available'
    except Exception:
        return 'Not available'

# function to extract job title
def get_job_title(job):
   try:
       job_title = job.xpath('./descendant::h2/a/span/text()')[0]
   except Exception as e:
       job_title = 'Not available'
   return job_title

# function to extract  company name
def get_company_name(job):
   try:
       company_name = job.xpath('.//span[@data-testid="company-name"]/text()')[0]
   except Exception as e:
       company_name = 'Not available'
   return company_name

# function to extract company location
def get_company_location(job):
   try:
       company_location = job.xpath('.//div[@data-testid="text-location"]/text()')[0]
   except Exception as e:
       company_location = 'Not available'
   return company_location

# function to extract salary info
def get_salary(job_link):
    job_dom = get_dom(job_link)
    try:
        # Use the provided XPath to get the salary text
        salary = job_dom.xpath('//*[@id="salaryInfoAndJobType"]//text()')
        return " ".join(salary).strip() if salary else 'Not available'
    except Exception:
        return 'Not available'

# function to estimate number of pages for a specific search
def get_total_pages(job_keyword, location_keyword):
    # url of Indeed job search
    url = f"https://www.indeed.com/jobs?q={job_keyword}&l={location_keyword}"
    driver.get(url)
    try:
        # wait for the element containing the job count to appear
        job_count_element = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//div[contains(@class, "jobsearch-JobCountAndSortPane-jobCount")]'))
        )
        # extract the text from the element
        job_count_text = job_count_element.text
        print(f"Job count text: {job_count_text}")
        job_count = int(job_count_text.split('+')[0].replace(',', '').strip())  # Handle commas and extra spaces
        # each page shows approx. 15 jobs
        jobs_per_page = 15
        # calculate the total number of pages
        total_pages = math.ceil(job_count / jobs_per_page)
        print(f"Total pages: {total_pages}")
        return total_pages
    except Exception as e:
        print(f"Error extracting job count: {e}")
        return 0  # Return 0 if there's an error
    # is this really needed?
    finally:
        # Close the browser
        #driver.quit()
        pass

In [15]:
# initialize Chrome webdriver
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# open URL
driver.get("https://www.indeed.com/q-USA-jobs.html")

# open a CSV file to write the job listings data
with open('indeed_jobs1.csv', 'w', newline='', encoding='utf-8') as f:
    theWriter = writer(f)
    heading = ['page', 'job_link', 'search_keyword', 'search_location', 'job_title', 'company_name', 'company_location', 'salary', 'job_description']
    theWriter.writerow(heading)

    for job_keyword in job_search_keywords:
        for location_keyword in location_search_keywords:
            print(f"Searching for: {job_keyword} in {location_keyword}")
            all_jobs = []

            # find total number of pages
            total_pages = get_total_pages(job_keyword, location_keyword)
            print(f"Total pages found: {total_pages}") 

            for page_no in range(total_pages):  # Modify range to get more or less pages, if needed
                print(f"Fetching Page number: {page_no + 1}")  # display page number (1-based)
                # per default this is set to within 25 miles 
                url = f"https://www.indeed.com/jobs?q={job_keyword}&l={location_keyword}&start={page_no * 10}"
                page_dom = get_dom(url)
                
                # extract jobs from current page
                jobs = page_dom.xpath('//div[@class="job_seen_beacon"]')
                print(f"Jobs found on page {page_no + 1}: {len(jobs)}") 
                
                # process each job
                for job in jobs:
                    job_link = base_url + get_job_link(job)
                    job_title = get_job_title(job)
                    company_name = get_company_name(job)
                    company_location = get_company_location(job)
                    salary = get_salary(job_link)
                    job_desc = get_job_desc(job_link)  # extract job description 
                    record = [page_no + 1, job_link, job_keyword, location_keyword, job_title, company_name, company_location, salary, job_desc]
                    print(page_no + 1, job_link)  # print page number (1-based index) and job link
                    time.sleep(2)
                    theWriter.writerow(record)  # write the record to CSV

# close the web browser
driver.quit()

Searching for: Data+Analyst in New+York, NY
Job count text: 200+ jobs
Total pages: 14
Total pages found: 14
Fetching Page number: 1
Jobs found on page 1: 15
1 https://www.indeed.com/rc/clk?jk=5bf7eb172149dfeb&bb=dksm-zU8diMzF9ysPOdQeYGjyBJmLABcSs-kh8RcHykIv100Q3sG0yUS9H6K_V8HNla7XKnYL6ExmcpPnEliy0F8eKcxsGJVNfJRcEMkU2-6j7bl2eSVyg%3D%3D&xkcb=SoCP67M37HwsVcQpS50KbzkdCdPP&fccid=dc08cb56cb2d6276&vjs=3
1 https://www.indeed.com/rc/clk?jk=6b99bd10f1316b74&bb=dksm-zU8diMzF9ysPOdQeYGjyBJmLABclkmUv96Vj9gx-0wEO4sIl2C28ydWcQxIX1jbngtMBaFXZsnm-T8yZw3U4f3QBWW0gjG75Knj7XPOf5EqaEzvTn7UFXwuSSPI&xkcb=SoAS67M37HwsVcQpS50JbzkdCdPP&fccid=848e72c84ce4a7a7&vjs=3
1 https://www.indeed.com/rc/clk?jk=d70c515431fa29c3&bb=dksm-zU8diMzF9ysPOdQeYchPMhjJ7nkSkD9cJKYYz-oQE4C6IAse6wPw_7Il6LAwKegKYchHvsmSAXzkQjb16or0OURhDWyEpuVOarJwteigM2Fe5lMGZLZ1HwXyU0B&xkcb=SoCm67M37HwsVcQpS50IbzkdCdPP&fccid=848e72c84ce4a7a7&vjs=3
1 https://www.indeed.com/rc/clk?jk=3aa26222eadd6a35&bb=dksm-zU8diMzF9ysPOdQec64XceqKZD1DqRZ7v5UYxv-bAGEAmW