# Scraping Job Postings from LinkedIn

In [22]:
#Import packages
import time, os
import pandas as pd

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

from bs4 import BeautifulSoup as bs
import requests
import re
import pickle

#Hide Warnings
import warnings
warnings.filterwarnings('ignore')

In [23]:
chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
driver = webdriver.Chrome(chromedriver)
source = 'https://www.linkedin.com/jobs'
driver.get(source)
# Get Location to Rancho Cordova
location_box_clear = driver.find_element_by_xpath('//*[@id="JOBS"]/section[2]/button')
location_box_clear.click()
location_box = driver.find_element_by_xpath('//*[@id="JOBS"]/section[2]/input')
location_box.click()
location_box.send_keys("Rancho Cordova, California, United States")
location_box.send_keys(Keys.RETURN)

In [17]:
## Can specify these later, but for now we'll scrape the first 1000 jobs within 25 miles of RC
# Anytime
# Distance
# Company
# Salary
# Location
# Job Type
# Experience Level
# On-site/Remote

In [24]:
#How many jobs are curently available within 25 miles of Rancho Cordova on LinkedIn
no_of_jobs = driver.find_element_by_css_selector('h1>span').get_attribute('innerText')

print('There are', no_of_jobs, 'jobs available within 10 miles of Rancho Cordova on LinkedIn over the past month.')

There are 44,000+ jobs available within 10 miles of Rancho Cordova on LinkedIn over the past month.


In [25]:
#Browse all jobs for the search.
i = 2
while i <= int(1000/25)+1: 
    #Scroll until hit the see more jobs button.
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    i = i + 1
    try:
        #Click the see more jobs button and then keep scrolling.
        driver.find_element_by_xpath(f"//button[@aria-label='Load more results']").click()
        time.sleep(20)
        print(i,"loading")
    except:
        pass
        time.sleep(20)
        print(i,"not loading")

In [None]:
##### Create a list of all jobs in the search.

In [26]:
#Create a list of the jobs.
job_lists = driver.find_element_by_class_name('jobs-search__results-list')
jobs = job_lists.find_elements_by_tag_name('li')

In [27]:
#Test that it collected all jobs.
#If it significantly dropped, the sleep.time time may need to be increased to allow:
#More loading time or
#Not set off restrictions for the site.
print(len(jobs), 'were collected from the search')

998 were collected from the search


In [28]:
#Pull basic information from each job.
job_title = []
company_name = []
date = []
job_link = []

for job in jobs:
    job_title0 = job.find_element_by_css_selector('h3').get_attribute('innerText')
    job_title.append(job_title0)
 
    company_name0 = job.find_element_by_css_selector('h4').get_attribute('innerText')
    company_name.append(company_name0)
 
    date0 = job.find_element_by_css_selector('div>div>time').get_attribute('datetime')
    date.append(date0)
    
    job_link0 = job.find_element_by_css_selector('a').get_attribute('href')
    job_link.append(job_link0)

In [29]:
#See first 5 of each for verification.
print('Job Titles:',job_title[:5])
print(' ')
print('Company Names:',company_name[:5])
print(' ')
print('Date:', date[:5])

Job Titles: ['Administrative Analyst', 'Enrollment Advisor', 'Patient Services Representative', 'Youth Advocate, Juvenile Justice Intervention Services', 'Administrative Analyst (Community Engagement & Strategic Partnerships)']
 
Company Names: ['City of Sacramento', 'California State University-Sacramento', 'Sutter Health', 'Stanford Sierra Youth & Families', 'City of Sacramento']
 
Date: ['2022-01-22', '2022-01-22', '2022-02-04', '2022-01-28', '2022-02-04']


In [30]:
#### maybe use bs on the job_link instead of clicking here
#Inital job description and criteria lists.
jd = []
cl = []
#Get job criteria.

#Do in batches that end with longer timers 
#to work around StaleElementReferenceException.
batchsize = 10

for i in range(0, len(jobs), batchsize):
    
    batch = jobs[i:i+batchsize]

    for job in batch:
       
        job.click()
        
        detail_path = 'description__job-criteria-list'
        details = driver.find_element_by_class_name(detail_path).get_attribute('innerText')
        cl.append(details)
    
        time.sleep(5)
        
    time.sleep(30)

StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=98.0.4758.80)


In [31]:
#Verify criteria is correct.
print(cl[0])

Seniority level
Associate
Employment type
Full-time
Job function
Other
Industries
Government Administration


In [None]:
#Get job descriptions.

#Do in batches that end with longer timers 
#to work around StaleElementReferenceException.
batchsize = 10

for i in range(0, len(jobs), batchsize):
    
    batch = jobs[i:i+batchsize]

    for job in batch:
        
        job.click()
    
        jd_path = 'show-more-less-html__markup'
        jd0 = driver.find_element_by_class_name(jd_path).get_attribute('innerText')
        jd.append(jd0)
    
        time.sleep(1)
        
    time.sleep(5)

In [None]:
#Verify job description is correct.
print(jd[0])

In [32]:
# Verify that all lists are the same length.
print(len(job_title))
print(len(company_name))
print(len(date))
print(len(job_link))
print(len(cl))
# print(len(jd))

998
998
998
998
62


In [33]:
# Create and save a dataframe of the collected data.
job_post_data = pd.DataFrame({'Date': date,
                              'Company': company_name,
                              'Title': job_title,
                              'Job Link': job_link})

# job_post_data.to_csv('LinkedIn_Job_Postings.csv', index = False)

In [34]:
job_post_data.head()

Unnamed: 0,Date,Company,Title,Job Link
0,2022-01-22,City of Sacramento,Administrative Analyst,https://www.linkedin.com/jobs/view/administrat...
1,2022-01-22,California State University-Sacramento,Enrollment Advisor,https://www.linkedin.com/jobs/view/enrollment-...
2,2022-02-04,Sutter Health,Patient Services Representative,https://www.linkedin.com/jobs/view/patient-ser...
3,2022-01-28,Stanford Sierra Youth & Families,"Youth Advocate, Juvenile Justice Intervention ...",https://www.linkedin.com/jobs/view/youth-advoc...
4,2022-02-04,City of Sacramento,Administrative Analyst (Community Engagement &...,https://www.linkedin.com/jobs/view/administrat...


In [35]:
len(job_post_data)

998