In [65]:
# install required packages 
#!pip install beautifulsoup4 lxml selenium webdriver-manager

In [90]:
# import necessary modules
import time
from csv import writer
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [75]:
# define job and location search keywords
# Search terms are written pre-encoded ('+' stands for a space) because they are
# substituted verbatim into the query string of `paginaton_url`; stray whitespace
# would therefore end up in the request.
job_search_keyword = ['Data+Scientist']#, 'Data+Analyst', 'Product+Analyst', 'BI+Analyst']
location_search_keyword = ['New+York']#, 'Los+Angeles', 'Chicago']

# define base and pagination URLs
# `base_url` is prepended to the relative job links found on result pages.
base_url = 'https://www.indeed.com'
# Placeholders, in order: job keyword, location keyword, value of the `start` parameter.
# NOTE: the variable keeps its original spelling ("paginaton") because other cells use it.
paginaton_url = "https://www.indeed.com/jobs?q={}&l={}&radius=35&start={}"

# TODO: also scrape listings for Sweden and France,
# using the three largest cities in each country.

In [126]:
# function to get DOM from given URL
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def get_dom(url):
    """Load ``url`` in the shared Selenium ``driver`` and return it as an lxml tree.

    After navigating, waits up to 10 seconds for an element with the id
    ``jobDescriptionText`` to be present, then re-parses the rendered page
    source with BeautifulSoup and converts it into an lxml element tree so
    callers can run XPath queries on it.

    Args:
        url: Address of the page to load.

    Returns:
        The root lxml element of the parsed page, or ``None`` when the awaited
        element did not appear within the timeout (callers test for ``None``
        to skip a page that failed to load).
    """
    # Imported locally so the function carries its own dependency.
    from selenium.common.exceptions import TimeoutException

    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "jobDescriptionText"))
        )
    except TimeoutException:
        # Signal failure instead of aborting the whole scrape.
        return None

    page_content = driver.page_source
    product_soup = BeautifulSoup(page_content, 'html.parser')
    # Use the `etree` name provided by the notebook's import cell.
    return etree.HTML(str(product_soup))

'''
def get_dom(url):
   driver.get(url)
   page_content = driver.page_source
   product_soup = BeautifulSoup(page_content, 'html.parser')
   dom = et.HTML(str(product_soup))
   return dom
'''

# functions to extract job link
def get_job_link(job):
    """Return the href of the link inside the job card's ``<h2>`` heading.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching ``href`` value, or ``'Not available'`` when the
        lookup raises for any reason (for example, when nothing matches).
    """
    try:
        hrefs = job.xpath('./descendant::h2/a/@href')
        return hrefs[0]
    except Exception:
        return 'Not available'

# functions to extract job title
def get_job_title(job):
    """Return the text of the ``<span>`` inside the job card's heading link.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching text node, or ``'Not available'`` when the lookup
        raises for any reason (for example, when nothing matches).
    """
    try:
        titles = job.xpath('./descendant::h2/a/span/text()')
        return titles[0]
    except Exception:
        return 'Not available'

# functions to extract the company name
def get_company_name(job):
    """Return the company name shown on a job card.

    Looks for a ``<span>`` whose ``data-testid`` attribute is ``company-name``.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching text node, or ``'Not available'`` when the lookup
        raises for any reason (for example, when nothing matches).
    """
    # Earlier selector, kept for reference:
    #   './descendant::span[@class="companyName"]/text()'
    try:
        names = job.xpath('.//span[@data-testid="company-name"]/text()')
        return names[0]
    except Exception:
        return 'Not available'

# functions to extract the company location
def get_company_location(job):
    """Return the location text shown on a job card.

    Looks for a ``<div>`` whose ``data-testid`` attribute is ``text-location``.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching text node, or ``'Not available'`` when the lookup
        raises for any reason (for example, when nothing matches).
    """
    # Earlier selector, kept for reference:
    #   './descendant::div[@class="companyLocation"]/text()'
    try:
        locations = job.xpath('.//div[@data-testid="text-location"]/text()')
        return locations[0]
    except Exception:
        return 'Not available'

# functions to extract salary information
def get_salary(job):
    """Return the salary text shown on a job card.

    Two selectors are tried in order: the estimated-salary ``<span>`` first,
    then the salary-snippet ``<div>``. The first selector that yields at least
    one text node wins.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching text node, or ``'Not available'`` when neither
        selector matches or both lookups raise.
    """
    selectors = (
        './descendant::span[@class="estimated-salary"]/span/text()',
        './descendant::div[@class="metadata salary-snippet-container"]/div/text()',
    )
    for selector in selectors:
        try:
            matches = job.xpath(selector)
        except Exception:
            # A failing lookup must not leak a partial sentinel (previously the
            # first character of 'Not available' was returned); try the next one.
            continue
        if matches:
            return matches[0]
    return 'Not available'

# functions to extract job type
def get_job_type(job):
    """Return the job-type text shown on a job card.

    Looks for a ``<div>`` nested in a ``<div>`` whose class is exactly ``metadata``.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching text node, or ``'Not available'`` when the lookup
        raises for any reason (for example, when nothing matches).
    """
    try:
        job_types = job.xpath('./descendant::div[@class="metadata"]/div/text()')
        return job_types[0]
    except Exception:
        return 'Not available'

# functions to extract job description
def get_job_desc(job):
    """Return the bullet points of the job description as one comma-joined string.

    Collects the text of ``<li>`` items found under the element with id
    ``jobDescriptionText``.

    NOTE(review): the expression starts with ``//``, so it appears to be
    evaluated against the whole document rather than relative to ``job`` --
    confirm that this is intended.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The matched text nodes joined with ``","``, or ``'Not available'``
        when nothing matches or the lookup raises.
    """
    # Earlier selectors, kept for reference:
    #   './/div[contains(@class, "jobMetaDataGroup")]//ul/li/text()'
    #   './descendant::div[@class="job-snippet"]/ul/li/text()'
    try:
        bullet_points = job.xpath('//*[@id="jobDescriptionText"]//div/ul/li/text()')
    except Exception:
        return 'Not available'
    return ",".join(bullet_points) if bullet_points else 'Not available'


In [127]:
# Initialize the Chrome webdriver shared by the scraping helpers in this notebook.
# `ChromeDriverManager().install()` supplies the driver path handed to `Service`
# (presumably downloading a matching chromedriver if needed -- see webdriver-manager docs).
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Open an Indeed page once before any search URL is requested.
driver.get("https://www.indeed.com/q-USA-jobs.html?vjk=823cd7ee3c203ac3")

In [121]:
# Debug helper: dump the first results page so its markup can be inspected by hand.
# The start offset is given literally (0 = first page) so this cell does not depend on
# a loop variable left behind by another cell.
url = paginaton_url.format(job_search_keyword[0], location_search_keyword[0], 0)
time.sleep(3)
page_dom = get_dom(url)  # Get the DOM of the page

if page_dom is None:
    # Nothing to serialise; avoid creating an empty output file.
    print(f"Failed to load {url}")
else:
    # Convert the DOM object to a string with pretty formatting
    dom_html = etree.tostring(page_dom, pretty_print=True, encoding='unicode')

    # Fetch first, open the file afterwards: a failed fetch leaves no truncated file behind.
    with open('indeed_jobs1_dom.txt', 'w', encoding='utf-8') as f:
        f.write(dom_html)

# Save the full page DOM to a file for manual inspection
time.sleep(4)  # presumably extra time for the page to finish rendering
page_source = driver.page_source
with open('indeed_full_page.html', 'w', encoding='utf-8') as f:
    f.write(page_source)

In [128]:
# Open a CSV file to write the job listings data
# Scrape every keyword/location combination and stream one CSV row per job card.
# The browser is closed in `finally` so it does not stay open when the scrape fails.
try:
    with open('indeed_jobs1.csv', 'w', newline='', encoding='utf-8') as f:
        theWriter = writer(f)
        heading = ['job_link', 'job_title', 'company_name', 'company_location', 'salary', 'job_type', 'job_description', 'searched_job', 'searched_location']
        theWriter.writerow(heading)
        for job_keyword in job_search_keyword:
            for location_keyword in location_search_keyword:
                print(f"Job title: {job_keyword}, location {location_keyword}")
                all_jobs = []
                for page_no in range(0, 2, 1): #modify range to get all entries
                    print(f"Page number: {page_no}")
                    # `start` is a result offset, not a page index: passing 0 and 1 would
                    # request almost the same listings twice. Assumes 10 results per page.
                    url = paginaton_url.format(job_keyword, location_keyword, page_no * 10)
                    page_dom = get_dom(url)
                    if page_dom is None:
                        print(f"Failed to load page {page_no} for {job_keyword} in {location_keyword}")
                        continue  # Skip to the next page
                    all_jobs.extend(page_dom.xpath('//div[@class="job_seen_beacon"]'))
                # All fields are read from the already-parsed cards, so no request is made
                # per job and no delay is needed inside this loop.
                for job in all_jobs:
                    relative_link = get_job_link(job)
                    # Keep the sentinel intact instead of gluing it onto the base URL.
                    if relative_link == 'Not available':
                        job_link = relative_link
                    else:
                        job_link = base_url + relative_link
                    print(f"Saving job from the link {job_link}")
                    job_title = get_job_title(job)
                    company_name = get_company_name(job)
                    company_location = get_company_location(job)
                    salary = get_salary(job)
                    job_type = get_job_type(job)
                    job_desc = get_job_desc(job)
                    record = [job_link, job_title, company_name, company_location, salary, job_type, job_desc, job_keyword, location_keyword]
                    print('*****HTML OUTPUT STARTS FOR ELEMENT*****')
                    print(record)
                    print('*****HTML OUTPUT STOPS FOR ELEMENT*****')
                    theWriter.writerow(record)
finally:
    # Closing the web browser
    driver.quit()

Job title:  Data+Scientist, location New+York
Page number: 0
Page number: 1
Saving job from the link https://www.indeed.com/rc/clk?jk=be8ea87cdf172e4a&bb=15WWDbq3Mi-eGLU8FTrB4cdz6PRUgHYEt3Izj9qE8v5VIaASKJEC66VlijhseAe5jpRnIRwIeJWExz60YbMOgyQ5gf4rpR_ITDcZzIv0MC1JQQerJvJTwg%3D%3D&xkcb=SoAs67M37FmT1d3qa50LbzkdCdPP&fccid=a5b4499d9e91a5c6&vjs=3
*****HTML OUTPUT STARTS FOR ELEMENT*****
['https://www.indeed.com/rc/clk?jk=be8ea87cdf172e4a&bb=15WWDbq3Mi-eGLU8FTrB4cdz6PRUgHYEt3Izj9qE8v5VIaASKJEC66VlijhseAe5jpRnIRwIeJWExz60YbMOgyQ5gf4rpR_ITDcZzIv0MC1JQQerJvJTwg%3D%3D&xkcb=SoAs67M37FmT1d3qa50LbzkdCdPP&fccid=a5b4499d9e91a5c6&vjs=3', 'Data Scientist Manager II, Research', 'Google', 'New York, NY', 'Not available', 'Not available', "Master's degree in Statistics, Data Science, Mathematics, Physics, Economics, Operations Research, Engineering, or a related quantitative field.,8 years of work experience using analytics to solve product or business problems, coding (e.g., Python, R, SQL), querying datab

In [129]:
import time
import requests
from bs4 import BeautifulSoup
from lxml import etree as et
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Function to get DOM from given URL
def get_dom(url):
    """Load ``url`` in the shared Selenium ``driver`` and return it as an lxml tree.

    Sleeps for a fixed 3 seconds after navigation to give the page time to
    load, then re-parses the page source with BeautifulSoup and converts it
    into an lxml element tree for XPath queries.

    Args:
        url: Address of the page to load.

    Returns:
        The root lxml element of the parsed page.
    """
    driver.get(url)
    time.sleep(3)  # fixed pause so the page can load
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    return et.HTML(str(soup))

# Function to extract job link
def get_job_link(job):
    """Return the href of the link inside the job card's ``<h2>`` heading.

    Args:
        job: Element exposing an ``xpath`` method (one job card).

    Returns:
        The first matching ``href`` value, or ``'Not available'`` when the
        lookup raises for any reason (for example, when nothing matches).
    """
    try:
        hrefs = job.xpath('./descendant::h2/a/@href')
        link = hrefs[0]
    except Exception:
        link = 'Not available'
    return link

# Function to extract job description
def get_job_desc(job_link):
    """Fetch a job's detail page and return its description text.

    The page is loaded through ``get_dom`` and the text nodes that are direct
    children of ``<div>`` elements under the element with id
    ``jobDescriptionText`` are joined with single spaces.

    Args:
        job_link: URL of the job's detail page. A value that is empty or ends
            with the ``'Not available'`` sentinel (what a caller gets when it
            prefixes a missing link with the base URL) is treated as "no link".

    Returns:
        The stripped description text, or ``'Not available'`` when there is no
        usable link, nothing matches, only whitespace matches, or the XPath
        lookup raises.
    """
    # Do not drive the browser to an address that is known not to be a real URL.
    if not job_link or job_link.endswith('Not available'):
        return 'Not available'

    job_dom = get_dom(job_link)
    try:
        fragments = job_dom.xpath('//*[@id="jobDescriptionText"]//div/text()')
        description = " ".join(fragments).strip()
    except Exception:
        return 'Not available'
    # Whitespace-only matches collapse to '' after strip(); report them as missing.
    return description or 'Not available'

# Initialize the Chrome webdriver used by `get_dom` in this cell.
# `ChromeDriverManager().install()` supplies the driver path handed to `Service`
# (presumably downloading a matching chromedriver if needed -- see webdriver-manager docs).
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Open an Indeed page once before any search URL is requested.
driver.get("https://www.indeed.com/q-USA-jobs.html")

# Open a CSV file to write the job listings data
# Scrape the search results, then visit each job's own page for its description,
# streaming one CSV row per distinct job.
# The browser is closed in `finally` so it does not stay open when the scrape fails.
try:
    with open('indeed_jobs1.csv', 'w', newline='', encoding='utf-8') as f:
        theWriter = writer(f)
        heading = ['job_link', 'job_title', 'company_name', 'company_location', 'salary', 'job_type', 'job_description']
        theWriter.writerow(heading)

        # Example job search keywords and locations
        job_search_keywords = ['Data Scientist']  # Replace with actual keywords
        location_search_keywords = ['New York']  # Replace with actual locations

        for job_keyword in job_search_keywords:
            for location_keyword in location_search_keywords:
                print(f"Searching for: {job_keyword} in {location_keyword}")
                all_jobs = []
                for page_no in range(2):  # Modify range to get more pages if needed
                    # URL-encode the plain-text terms so spaces and characters such as
                    # '&' or '+' cannot break or alter the query string.
                    url = (
                        "https://www.indeed.com/jobs"
                        f"?q={quote_plus(job_keyword)}&l={quote_plus(location_keyword)}"
                        f"&start={page_no * 10}"
                    )
                    page_dom = get_dom(url)
                    jobs = page_dom.xpath('//div[@class="job_seen_beacon"]')
                    all_jobs.extend(jobs)

                seen_links = set()
                for job in all_jobs:
                    relative_link = get_job_link(job)
                    has_link = relative_link != 'Not available'
                    job_link = base_url + relative_link if has_link else relative_link

                    # De-duplicate only real links: cards without a link all share the
                    # same sentinel and would otherwise collapse into a single row.
                    if has_link:
                        if job_link in seen_links:
                            continue  # Skip duplicates
                        seen_links.add(job_link)

                    job_title = get_job_title(job)
                    company_name = get_company_name(job)
                    company_location = get_company_location(job)
                    salary = get_salary(job)
                    job_type = get_job_type(job)
                    # Only visit the job page when there is an actual URL to load.
                    job_desc = get_job_desc(job_link) if has_link else 'Not available'

                    record = [job_link, job_title, company_name, company_location, salary, job_type, job_desc]
                    theWriter.writerow(record)
finally:
    # Closing the web browser
    driver.quit()

Searching for: Software Engineer in New York


In [None]:
# ideas for visualization 
# number of listings per location
# average salary for each position 
# average salary by job AND location https://www.indeed.com/rc/clk?jk=9e205f9634a5bfdb&bb=Kaksi2QKb_5G01a0IYYWte3dB7Sd8vTC2PsDgK8Wcrl-vt-ov1bmllcNU4eq-ARapdqrzuLgp5TwUhCEMRJS0x0bX5YFJls_0xhvnMGs1P2Xvjh_c0znjkjJ1X6wishm&xkcb=SoBo67M37Ey6rjTbpx0LbzkdCdPP&fccid=2e5243142d98319d&vjs=3


# keyword extraction
# frequency differences for different job titles 
# tech stack keywords 