In [1]:
# import modules
import time
from urllib.parse import urlencode

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager


In [2]:
# use GeckoDriver manager to install/locate geckodriver, which Selenium needs to drive Firefox
executable = FirefoxService(GeckoDriverManager().install())

# set web driver options
options = Options()
# find device user agent at: https://useragentstring.com/
custom_user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0'

# Firefox has no `user-agent=...` command-line switch (that form is a Chrome flag), so the
# user agent has to be overridden through a profile preference to actually take effect.
options.set_preference('general.useragent.override', custom_user_agent)
# preferred languages are likewise a profile preference; Firefox derives the q-values itself
options.set_preference('intl.accept_languages', 'en-US, en, lt, et, de')
# run Firefox in headless mode; helps with anti-bot
options.add_argument('--headless')
# NOTE: the Connection / Accept-Encoding / Accept request headers cannot be set through
# command-line arguments; the browser sends its own defaults for them, so the former
# `connection=...`, `accept-encoding=...` and `accept=...` arguments were dropped.
# A proxy server can help get past anti-bot software; for Firefox it is configured through
# the `network.proxy.*` preferences rather than a `--proxy-server` argument.

# create Selenium Firefox driver
driver = webdriver.Firefox(
    options=options,
    service=executable,
)


[WDM] - Downloading: 19.2kB [00:00, 9.52MB/s]                   


In [3]:
# construct initial search link
#
# ***Search Parameters***
# q:   query; usually a position type, or any combo of keys, like 'data engineer fulltime'
# l:   location
# sc:  employment type (full-time, part-time, contract, etc)
# age: maximum age of a posting in weeks; it is sent as the `fromage` parameter in days
#      (4 weeks * 7 days = max. 28 days old)
q = 'software engineer'
l = 'united states'
sc = 'fulltime'
age = 4
search_dict = {'q': q, 'l': l, 'sc': sc, 'fromage': str(age*7)}

# format search terms dict into callable link
# urlencode turns spaces into '+' and also escapes reserved characters such as '&' or '#',
# so terms like 'c# developer' or 'R&D engineer' no longer corrupt the query string
search = 'https://www.indeed.com/jobs?' + urlencode(search_dict) + '&start='

# initialize driver with search link
driver.get(search)

print(search)


https://www.indeed.com/jobs?q=software+engineer&l=united+states&sc=fulltime&fromage=28&start=


In [4]:

def page_to_soup():
    """Parse the results list of the page currently loaded in `driver`.

    Returns a 3-tuple: the BeautifulSoup object built from the inner HTML of the
    `.jobsearch-ResultsList` element, the list of `div.cardOutline` elements found
    in it, and a `range` over the indices of that list.
    """
    # grab the raw HTML of the container that holds every job card on the page
    results_html = driver.find_element(
        By.CSS_SELECTOR, ".jobsearch-ResultsList"
    ).get_attribute('innerHTML')

    # parse it and keep only the cards that are actual job posts
    soup = BeautifulSoup(results_html, 'html.parser')
    job_cards = soup.find_all("div", {"class": "cardOutline"})
    card_indices = range(len(job_cards))

    return soup, job_cards, card_indices

def find_salary(soup, class_name):
    """Return the text of the first <div> in `soup` whose class is `class_name`, or None if there is none."""
    match = soup.find("div", {"class": class_name})
    if not match:
        return None
    return match.text

def get_full_description(links):
    """Visit every posting link with `driver` and collect the full description text.

    Parameters
    ----------
    links : iterable of str
        URLs of the individual job postings.

    Returns
    -------
    list of str
        Text of the 'jobsearch-jobDescriptionText' element of each posting, in the
        same order as `links`.

    Progress is printed every 10 postings, plus a summary once all are collected.
    """
    links = list(links)
    # progress is reported against the links actually passed in, not a global list
    total = len(links)
    descriptions = []

    # pull description for each job card from reference link
    for i, link in enumerate(links):
        # go to href link page
        driver.get(link)
        # scrape full description
        description = driver.find_element(By.CLASS_NAME, 'jobsearch-jobDescriptionText').text
        descriptions.append(description)
        # wait and scroll to the bottom of the page to "act human"
        time.sleep(1.5)
        driver.execute_script("window.scrollTo(0,document.body.scrollHeight);")

        if i % 10 == 0:
            print(f'scraped {i}/{total} descriptions...')

    if total:
        print(f'\n{total}/{total} job posts collected.\n')

    return descriptions


# note: a retry condition could be added for when scraping is interrupted by anti-bot measures,
# i.e. when the 'jobsearch-jobDescriptionText' class can't be found: increase the time.sleep() delay and try again

In [5]:

def _text_or_na(node):
    """Return the element's text, or 'n/a' when the element is missing or its text is empty."""
    return (node.text if node is not None else None) or 'n/a'


def get_jobcard_stats(n_pages):
    """Scrape title, location, link, company and salary from every job card on each results page.

    Parameters
    ----------
    n_pages : sequence of int
        Zero-based page indices to scrape; page `n` is requested as `search + str(n*10)`.

    Results are appended to the module-level lists `titles`, `locations`, `links`,
    `companies` and `salaries`. Every field is read from the same job card, so the
    lists always grow by the same number of entries and stay aligned row by row.
    """
    total_pages = len(n_pages)

    for n in n_pages:
        print(f'scraping page {n+1}/{total_pages}...')
        # go to page n
        driver.get(search+str(n*10))

        _, job_cards, _ = page_to_soup()

        for card in job_cards:
            # get link to posting; a card without one cannot be followed for its
            # description later, so skip it entirely to keep all lists the same length
            anchor = card.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue

            # salary containers are looked up inside the card's result content cell,
            # falling back to the whole card when that cell is absent
            content = card.find("td", {"class": "resultContent"})
            if content is None:
                content = card

            # get job title, location(s) and company name if available, else n/a
            titles.append(_text_or_na(card.find("h2", {"class": "jobTitle"})))
            locations.append(_text_or_na(card.find("div", {"class": "companyLocation"})))
            links.append('https://www.indeed.com' + href)
            companies.append(_text_or_na(card.find("span", {"class": "companyName"})))

            # get salary if available (posted first, then estimated), else n/a
            salaries.append(
                find_salary(content, "salary-snippet-container")
                or find_salary(content, "estimated-salary-container")
                or 'n/a'
            )

    if total_pages:
        print('\n')
            

In [6]:
# initialize containers for job postings data
titles, links, companies, locations, salaries, descriptions = [],[],[],[],[],[]
n_entries = 100                                     # minimum number of posts to collect
n_pages = range(int(n_entries/15)+1)                # 15 posts per page; 100/15 = 6.66 -> scrape 7 pages (>= 100 posts)

print('\n')

try:
    # get job card info for title, location, company name, post link, and salary
    get_jobcard_stats(n_pages)

    # get full descriptions
    descriptions = get_full_description(links)
finally:
    # always shut the browser down, even if scraping raised part-way through;
    # quit() ends the whole WebDriver session, whereas close() only closes the current window
    driver.quit()

# format job posts info as dataframe
data = {
    'title': titles,
    'company': companies,
    'location': locations,
    'salary': salaries,
    'link': links,
    'description': descriptions,
}

df = pd.DataFrame(data)




scraping page 1/7...
scraping page 2/7...
scraping page 3/7...
scraping page 4/7...
scraping page 5/7...
scraping page 6/7...
scraping page 7/7...


scraped 0/105 descriptions...
scraped 10/105 descriptions...
scraped 20/105 descriptions...
scraped 30/105 descriptions...
scraped 40/105 descriptions...
scraped 50/105 descriptions...
scraped 60/105 descriptions...
scraped 70/105 descriptions...
scraped 80/105 descriptions...
scraped 90/105 descriptions...
scraped 100/105 descriptions...

105/105 job posts collected.



In [8]:
# rows that repeat an earlier post in every column except the post-specific link
# (the first occurrence of each repeated post is not listed)
df.loc[df.drop(columns=['link']).duplicated()]


Unnamed: 0,title,company,location,salary,link,description
15,Software Engineer,Caterpillar,"Chicago, IL 60622 (West Town area)",Estimated $79.8K - $101K a year,https://www.indeed.com/rc/clk?jk=9dbe010b2fb71...,Career Area:\nDigital\nJob Description:\nAs a ...
17,Software Engineer (Remote),Climb Credit,"Remote in New York, NY 10013",Estimated $90.7K - $115K a year,https://www.indeed.com/rc/clk?jk=99842d8ad3075...,Overview of Role\nWe are looking for a softwar...
18,Junior Software Engineer,Config47: SME New Hire Training,"Remote in New York, NY 10041",Estimated $71.3K - $90.3K a year,https://www.indeed.com/rc/clk?jk=9cd4be0d13adf...,Overview:\nCollaborate with our business teams...
36,Lead Backend Engineer – Cards Digital Collection,OneMain Financial,"Hybrid remote in Charlotte, NC 28244",Estimated $125K - $132K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,"As part of a demonstrated commitment, we are i..."
39,"Web Designer/Developer (Brunswick, OH)",Proximity Marketing Corporation,"Brunswick, OH 44212",,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Proximity Marketing is looking for an experien...
43,Senior Software Engineer,Leidos,"Fairmont, WV 26554","$97,500 - $202,500 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Description\nJob Description:\nLooking for an ...
51,"Web Designer/Developer (Brunswick, OH)",Proximity Marketing Corporation,"Brunswick, OH 44212",,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Proximity Marketing is looking for an experien...
61,Mid-level Software Engineer,Peraton,"Remote in McLean, VA 22102","$86,000 - $138,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Responsibilities:\nPeraton is seeking an exper...
69,"Web Designer/Developer (Brunswick, OH)",Proximity Marketing Corporation,"Brunswick, OH 44212",,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Proximity Marketing is looking for an experien...
83,"Web Designer/Developer (Brunswick, OH)",Proximity Marketing Corporation,"Brunswick, OH 44212",,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Proximity Marketing is looking for an experien...


In [11]:
# keep only the first occurrence of each post, comparing every column except the post-specific link
unique_posts = df[~df.drop(columns=['link']).duplicated()].reset_index(drop=True)

# write the de-duplicated posts to a csv file for text processing
unique_posts.to_csv('swe_us_ft_4wks_041923.csv', index=False)

unique_posts

Unnamed: 0,title,company,location,salary,link,description
0,Sr. Engineering Software Support Specialist,Particle Measuring Systems,"Boulder, CO 80301","$80,000 - $90,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Do you want to be part of a business that genu...
1,Associate Software Engineer,Bosch Group,"Plymouth, MI 48170",Estimated $64.2K - $81.3K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Company Description\n\nDo you want beneficial ...
2,Senior Software Engineer,Fidelity Investments,"Merrimack, NH 03054",Estimated $100K - $127K a year,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Job Description:\nSenior Software Engineer\nRo...
3,Software Engineer,Creative Financial Staffing,"Hybrid remote in Grandville, MI","$80,000 - $120,000 a year",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,JOB DESCRIPTION\nPosition: Software Engineer\n...
4,Software Engineer - Backend,"Gametime United, Inc.",Remote,"$112,000 - $145,000 a year",https://www.indeed.com/rc/clk?jk=21bd327662ae0...,About us:\nLive experiences help make us human...
...,...,...,...,...,...,...
89,Software Engineer - ALL Levels,The Josef Group Inc.,"Reston, VA","$120,000 - $200,000 a year",https://www.indeed.com/rc/clk?jk=065d6a10bfbb0...,"Software Engineer\nReston, VA\nSecurity Cleara..."
90,Software Engineer I,Gleim,"Gainesville, FL 32606",Estimated $58K - $73.5K a year,https://www.indeed.com/rc/clk?jk=cd297fe604d62...,"Full-time, entry-level Software Engineer I pos..."
91,Software Engineer,FORVIS,Remote,Estimated $87.2K - $110K a year,https://www.indeed.com/rc/clk?jk=0e5c777024696...,"Overview:\n\nAt FORVIS, your career is designe..."
92,Software Engineer,Harmony Healthcare IT,"Remote in South Bend, IN 46635",Estimated $81.2K - $103K a year,https://www.indeed.com/rc/clk?jk=adc67b573f140...,Company Description:\nHarmony Healthcare IT (H...
