In [None]:
# install required packages 
!pip install beautifulsoup4 lxml selenium webdriver-manager

In [39]:
# import necessary modules
from bs4 import BeautifulSoup
from lxml import etree as et
from csv import writer
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import time
from lxml import etree

In [59]:
# Search keywords, already URL-encoded ('+' stands for a space).
# They are substituted verbatim into the query string and are also written to
# the CSV, so they must not carry stray whitespace.
job_search_keyword = ['Data+Scientist', 'Data+Analyst', 'Product+Analyst', 'BI+Analyst']
location_search_keyword = ['New+York', 'Los+Angeles', 'Chicago']

# Base URL (prefixed to relative job links) and the search URL template.
# Template placeholders, in order: job keyword, location keyword, `start` value.
base_url = 'https://www.indeed.com'
pagination_url = "https://www.indeed.com/jobs?q={}&l={}&radius=35&start={}"
# Misspelled name kept as an alias because the scraping cell refers to it.
paginaton_url = pagination_url

# TODO: also check Sweden and France
# (just pick the 3 largest cities in each country)

In [62]:
# function to get DOM from given URL

def get_dom(url):
    """Load ``url`` in the shared Selenium ``driver`` and return it as an lxml tree.

    The page source is first parsed with BeautifulSoup and then re-serialised
    into ``et.HTML`` so the result can be queried with XPath.

    Args:
        url: Address to navigate the browser to.

    Returns:
        The parsed document, or ``None`` when the browser raised a
        ``WebDriverException`` or returned an empty page source. Callers are
        expected to check for ``None`` and skip the page.
    """
    # Local import so this function is self-contained with respect to the
    # notebook's import cell; selenium is already a dependency of this notebook.
    from selenium.common.exceptions import WebDriverException

    try:
        driver.get(url)
        page_content = driver.page_source
    except WebDriverException as exc:
        # Report and signal failure instead of aborting the whole scrape.
        print(f"Could not load {url}: {exc}")
        return None

    if not page_content:
        return None

    product_soup = BeautifulSoup(page_content, 'html.parser')
    return et.HTML(str(product_soup))

# functions to extract job link
def get_job_link(job):
    """Return the job's link as found in the listing markup.

    Takes the first ``href`` of an ``<a>`` that is a direct child of an
    ``<h2>`` somewhere below ``job``. If nothing matches, or the XPath lookup
    raises, the placeholder ``'Not available'`` is returned instead.
    """
    try:
        return job.xpath('./descendant::h2/a/@href')[0]
    except Exception:
        return 'Not available'

# functions to extract job title
def get_job_title(job):
    """Return the job title text from a listing element.

    Uses the first text node of a ``<span>`` nested as ``h2 > a > span`` below
    ``job``. Falls back to ``'Not available'`` when there is no match or the
    lookup raises.
    """
    try:
        return job.xpath('./descendant::h2/a/span/text()')[0]
    except Exception:
        return 'Not available'

# functions to extract the company name
def get_company_name(job):
    """Return the company name from a listing element.

    Reads the first text node of a ``<span>`` whose ``data-testid`` attribute
    equals ``"company-name"`` anywhere below ``job``. Returns
    ``'Not available'`` when nothing matches or the lookup raises.
    """
    # Alternative selector kept for reference:
    #   './descendant::span[@class="companyName"]/text()'
    try:
        return job.xpath('.//span[@data-testid="company-name"]/text()')[0]
    except Exception:
        return 'Not available'

# functions to extract the company location
def get_company_location(job):
    """Return the location text from a listing element.

    Reads the first text node of a ``<div>`` whose ``data-testid`` attribute
    equals ``"text-location"`` anywhere below ``job``. Returns
    ``'Not available'`` when nothing matches or the lookup raises.
    """
    # Alternative selector kept for reference:
    #   './descendant::div[@class="companyLocation"]/text()'
    try:
        return job.xpath('.//div[@data-testid="text-location"]/text()')[0]
    except Exception:
        return 'Not available'

# functions to extract salary information
def get_salary(job):
    """Return the salary text from a listing element.

    Two selectors are tried in order and the first non-empty match wins:

    1. ``span[@class="estimated-salary"]/span`` text
    2. ``div[@class="metadata salary-snippet-container"]/div`` text

    Args:
        job: Element exposing an ``xpath(query)`` method that returns a list.

    Returns:
        The first matching text node, or ``'Not available'`` when neither
        selector yields a result.
    """
    selectors = (
        './descendant::span[@class="estimated-salary"]/span/text()',
        './descendant::div[@class="metadata salary-snippet-container"]/div/text()',
    )
    for selector in selectors:
        try:
            matches = job.xpath(selector)
        except Exception:
            # A failing lookup is treated like "no match" so the next selector
            # is still tried. Previously a failure of the first lookup left the
            # placeholder string in place and its first character ('N') was
            # returned.
            continue
        if matches:
            return matches[0]
    return 'Not available'

# functions to extract job type
def get_job_type(job):
    """Return the job type text from a listing element.

    Uses the first text node of a ``<div>`` that is a direct child of a
    ``<div>`` whose ``class`` attribute is exactly ``"metadata"``. Returns
    ``'Not available'`` when nothing matches or the lookup raises.
    """
    try:
        return job.xpath('./descendant::div[@class="metadata"]/div/text()')[0]
    except Exception:
        return 'Not available'


'''
# functions to extract job rating
def get_rating(job):
   try:
       rating = job.xpath('./descendant::span[@class="ratingNumber"]/span/text()')[0]
   except Exception as e:
       rating = 'Not available'
   return rating
'''

# functions to extract job description

def get_job_desc(job):
    """Return the listing's description snippets as one comma-joined string.

    Collects the text nodes of every ``<li>`` under a ``<ul>`` inside a
    ``<div>`` whose class contains ``"jobMetaDataGroup"``. When the lookup
    raises or finds nothing, ``'Not available'`` is returned.
    """
    # Alternative selector kept for reference:
    #   './descendant::div[@class="job-snippet"]/ul/li/text()'
    try:
        snippets = job.xpath('.//div[contains(@class, "jobMetaDataGroup")]//ul/li/text()')
    except Exception:
        return 'Not available'
    return ",".join(snippets) if snippets else 'Not available'


In [63]:
# Initialize the Chrome webdriver using ChromeDriverManager.
# The value returned by ChromeDriverManager().install() is handed to Service;
# presumably this is the path of a chromedriver binary — confirm against the
# webdriver-manager documentation.
service = Service(ChromeDriverManager().install())
# `driver` is a module-level name: get_dom() navigates with it and the scraping
# cell calls driver.quit() on it, so this cell must run before either of those.
driver = webdriver.Chrome(service=service)

# Open an initial Indeed page in the browser.
# NOTE(review): the scraping cell loads its own search URLs through get_dom();
# whether this first page load is required is not evident from the notebook — confirm.
driver.get("https://www.indeed.com/q-USA-jobs.html?vjk=823cd7ee3c203ac3")

In [None]:
# Scrape every (job keyword, location) combination and write one CSV row per
# listing to indeed_jobs1.csv.

# Indeed's `start` query parameter is a result offset, not a page index: it
# advances by 10 per result page. Passing 0, 1, 2 re-fetches nearly the same
# listings three times.
RESULTS_PER_PAGE = 10
PAGES_PER_SEARCH = 3        # increase to collect more result pages per search
REQUEST_DELAY_SECONDS = 2   # pause after each page request
PRINT_JOB_HTML = True       # dump each job element's HTML for inspection

heading = ['job_link', 'job_title', 'company_name', 'company_location', 'salary', 'job_type', 'job_description', 'searched_job', 'searched_location']

try:
    with open('indeed_jobs1.csv', 'w', newline='', encoding='utf-8') as f:
        theWriter = writer(f)
        theWriter.writerow(heading)
        for job_keyword in job_search_keyword:
            for location_keyword in location_search_keyword:
                all_jobs = []
                for page_no in range(PAGES_PER_SEARCH):
                    print(f"Page number: {page_no}")
                    url = paginaton_url.format(job_keyword, location_keyword, page_no * RESULTS_PER_PAGE)
                    page_dom = get_dom(url)
                    # Throttle here, between browser requests. Parsing the
                    # downloaded DOM below touches no network, so the former
                    # per-job sleeps only slowed the run down.
                    time.sleep(REQUEST_DELAY_SECONDS)
                    if page_dom is None:
                        print(f"Failed to load page {page_no} for {job_keyword} in {location_keyword}")
                        continue  # Skip to the next page
                    all_jobs.extend(page_dom.xpath('//div[@class="job_seen_beacon"]'))
                for job in all_jobs:
                    href = get_job_link(job)
                    # Only site-relative links get the base URL prefixed; the
                    # 'Not available' placeholder (or an absolute URL) is kept
                    # as-is instead of becoming a bogus address.
                    job_link = base_url + href if href.startswith('/') else href
                    print(f"Saving job from the link {job_link}")
                    record = [
                        job_link,
                        get_job_title(job),
                        get_company_name(job),
                        get_company_location(job),
                        get_salary(job),
                        get_job_type(job),
                        get_job_desc(job),
                        job_keyword,
                        location_keyword,
                    ]
                    print('*****HTML OUTPUT STARTS FOR ELEMENT*****')
                    print(record)
                    if PRINT_JOB_HTML:
                        # Inspect what is inside each job element.
                        print(etree.tostring(job, pretty_print=True).decode('utf-8'))
                    print('*****HTML OUTPUT STOPS FOR ELEMENT*****')
                    theWriter.writerow(record)
finally:
    # Close the web browser even if scraping fails part-way, so no Chrome
    # process is left behind.
    driver.quit()

Page number: 0
Page number: 1
Page number: 2
Saving job from the link https://www.indeed.com/rc/clk?jk=82e4efce6644cdc7&bb=7gN6zRg3kz7iHAW3UyiP3TYfhUBlfPY55oKhY7qLf3YN9EENXEeb_981X8N03ZLjnDWlUdgDe-b9epjHOZr7Ud2OY4AZzHrNBFvw4nlVV_mZbuXJGQEtSI3JRAXta50R&xkcb=SoDM67M37E3sQA1D6J0LbzkdCdPP&fccid=13920672d471a7d1&vjs=3
*****HTML OUTPUT STARTS FOR ELEMENT*****
['https://www.indeed.com/rc/clk?jk=82e4efce6644cdc7&bb=7gN6zRg3kz7iHAW3UyiP3TYfhUBlfPY55oKhY7qLf3YN9EENXEeb_981X8N03ZLjnDWlUdgDe-b9epjHOZr7Ud2OY4AZzHrNBFvw4nlVV_mZbuXJGQEtSI3JRAXta50R&xkcb=SoDM67M37E3sQA1D6J0LbzkdCdPP&fccid=13920672d471a7d1&vjs=3', 'Product Data Scientist', 'Propel', 'Remote in Brooklyn, NY', 'Not available', 'Not available', 'Maintain reliable , pipelines to ensure trustworthy data.,Free access to Ginger mental health, Gympass, Headspace, One Medical, Rightway, Sofi, and Spring…', ' Data+Scientist', 'New+York']
*****HTML OUTPUT STOPS FOR ELEMENT*****
Saving job from the link https://www.indeed.com/rc/clk?jk=be8ea87cdf1

In [None]:
# ideas for visualization 
# number of listings per location
# average salary for each position 
# average salary by job AND location https://www.indeed.com/rc/clk?jk=9e205f9634a5bfdb&bb=Kaksi2QKb_5G01a0IYYWte3dB7Sd8vTC2PsDgK8Wcrl-vt-ov1bmllcNU4eq-ARapdqrzuLgp5TwUhCEMRJS0x0bX5YFJls_0xhvnMGs1P2Xvjh_c0znjkjJ1X6wishm&xkcb=SoBo67M37Ey6rjTbpx0LbzkdCdPP&fccid=2e5243142d98319d&vjs=3
