In [16]:
# This works as of 08/24/2019
# Based on https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b
# Adapted in a few places because Indeed's page structure has changed since that article was written.

import datetime
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [27]:
def get_entries(soup):
    """Return every ``<div>`` with class ``row`` found in ``soup``.

    Each returned element is handed to the per-posting extractors
    (title, company, location, ...) by the scraping loop.
    """
    return soup.find_all(name='div', class_='row')
    
def get_job_title(entry):
    """Return the posting's title with surrounding whitespace removed.

    The title is read from the ``<a>`` element whose ``data-tn-element``
    attribute equals ``jobTitle``.
    """
    title_anchor = entry.find(name='a', attrs={'data-tn-element': 'jobTitle'})
    return title_anchor.text.strip()

def get_company(entry):
    """Return the company name for a posting, or ``' '`` when none is found.

    The name is looked up first in the element with class ``company`` and,
    if that element is absent, in the element with class
    ``result-link-source``.

    Parameters
    ----------
    entry : element exposing ``find(class_=...)`` (a posting from ``get_entries``).

    Returns
    -------
    str
        The stripped text of the first matching element, or the single-space
        placeholder ``' '`` when neither element exists.
    """
    # Explicit None checks replace the previous nested bare ``except:``
    # blocks, which also swallowed unrelated errors such as KeyboardInterrupt.
    for css_class in ('company', 'result-link-source'):
        node = entry.find(class_=css_class)
        if node is not None:
            return node.text.strip()
    return ' '

def _strip_suffix(text, suffix):
    """Return ``text`` without a trailing ``suffix`` and the whitespace before it."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)].rstrip()
    return text


def get_location_info(entry):
    """Split a posting's location text into city, state, zip code and neighborhood.

    The location is read from the element with class ``location`` inside the
    posting's ``sjcl`` container.

    Parameters
    ----------
    entry : element exposing ``find(class_=...)`` (a posting from ``get_entries``).

    Returns
    -------
    tuple
        ``(city, state, zipcode, neighborhood)``. ``zipcode`` and
        ``neighborhood`` are the placeholder ``' '`` when absent.

    Raises
    ------
    AttributeError
        If the ``sjcl`` container or the ``location`` element is missing.
    """
    company_info = entry.find(class_='sjcl')
    location_info = company_info.find(class_='location')

    location = location_info.text.strip()

    # Extract neighborhood info if it's there. The neighborhood text is removed
    # as a whole suffix: str.rstrip()/str.strip() take a *set of characters*,
    # not a substring, so they could strip too much or too little.
    raw_neighborhood = get_neighborhood(location_info).strip()
    if raw_neighborhood:
        location = _strip_suffix(location, raw_neighborhood)
        neighborhood = raw_neighborhood.strip('()')
    else:
        neighborhood = ' '

    # Extract the zip code from the location if it's there, again as a suffix
    # so that no trailing whitespace or digits leak into the state.
    zipcode = get_zipcode(location)
    if zipcode.strip():
        location = _strip_suffix(location, zipcode)

    city, state = get_city_and_state(location)

    return city, state, zipcode, neighborhood

def get_city_and_state(location):
    """Split a ``'City, ST'`` string into ``(city, state)``.

    The text is split on ``', '``; the last piece is the state and the one
    before it is the city (any earlier pieces are ignored). Each piece is
    stripped of surrounding whitespace.

    Parameters
    ----------
    location : str
        Location text with zip code and neighborhood already removed.

    Returns
    -------
    tuple
        ``(city, state)``. When the text contains no ``', '`` separator the
        whole text is returned as ``state`` and ``city`` is the placeholder
        ``' '`` (previously this case raised ``IndexError``).
    """
    parts = [part.strip() for part in location.split(', ')]
    state = parts.pop()
    # A location such as "Remote" has a single piece; don't pop an empty list.
    city = parts.pop() if parts else ' '
    return city, state

def get_neighborhood(location_info):
    """Return the text of the first ``<span>`` inside ``location_info``.

    Falls back to the single-space placeholder ``' '`` when no span is found.
    """
    span = location_info.find(name='span')
    if not span:
        return ' '
    return span.text

def get_zipcode(location):
    """Return the last all-digit, whitespace-separated token of ``location``.

    Returns the single-space placeholder ``' '`` when no such token exists.
    """
    for token in reversed(location.split()):
        if token.isdigit():
            return token
    return ' '
    
def get_salary(entry):
    """Return the salary text for a posting, or ``' '`` when none is listed.

    Two page layouts are tried in order:

    1. a ``<nobr>`` element anywhere inside the posting;
    2. a ``<span class="salary">`` inside a ``<div class="salarySnippet">``.

    Parameters
    ----------
    entry : element exposing ``find(...)`` (a posting from ``get_entries``).

    Returns
    -------
    str
        The stripped salary text, or the single-space placeholder ``' '``.
    """
    # Explicit None checks replace the previous nested bare ``except:``
    # blocks, which also swallowed unrelated errors such as KeyboardInterrupt.
    nobr = entry.find('nobr')
    if nobr is not None:
        return nobr.text.strip()

    snippet = entry.find(name='div', class_='salarySnippet')
    if snippet is not None:
        salary_span = snippet.find(name='span', class_='salary')
        if salary_span is not None:
            return salary_span.text.strip()

    return ' '

def get_job_summary(entry):
    """Return the stripped text of the posting's element with class ``summary``."""
    summary_node = entry.find(class_='summary')
    summary_text = summary_node.text
    return summary_text.strip()

def get_link(entry):
    """Return the value of the posting's ``data-jk`` attribute.

    The scraping loop appends this value to the ``viewjob?jk=`` URL to reach
    the posting's own page.
    """
    return entry['data-jk']

def get_job_description(job_page):
    """Download a posting's page and return its description as one line of text.

    Parameters
    ----------
    job_page : str
        URL of the individual posting.

    Returns
    -------
    str
        Text of the ``<div id="jobDescriptionText">`` element with newlines
        and tabs replaced by spaces, or the placeholder ``' '`` when the page
        has no such element (previously this raised ``AttributeError`` and
        aborted the whole scrape).

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    """
    # Without a timeout a stalled connection would block the scrape forever.
    page = requests.get(job_page, timeout=30)
    time.sleep(1)  # ensuring at least 1 second between page grabs
    soup = BeautifulSoup(page.text, 'lxml')

    description = soup.find(name='div', id='jobDescriptionText')
    if description is None:
        return ' '

    text = description.text.strip()
    return text.replace('\n', ' ').replace('\t', ' ')

In [34]:
# Pagination: number of result pages requested per city, and the number of
# postings per page (19 is noted as Indeed's default entries per page).
max_pages_per_city = 10
postings_per_page = 19
postings_per_city = postings_per_page * max_pages_per_city

# Cities to search, written as they appear in the query string.
# Other values that can be used: 'New+York', 'Chicago', 'Los+Angeles'.
city_set = ['Boston']

# Column order of the resulting DataFrame; each scraped row follows this order.
columns = [
    'job_title',
    'company_name',
    'city',
    'state',
    'zipcode',
    'neighborhood',
    'description',
    'salary',
    'link',
]

In [35]:
URL_base = 'https://www.indeed.com/jobs?q=data+scientist+%2420%2C000'

# Rows are accumulated in a plain list and turned into a DataFrame once at the
# end; assigning df.loc[n] inside the loop re-allocates the frame on every row.
records = []

# Loop over cities
for city_targ in city_set:
    URL_location = '&l=' + city_targ
    # Loop over result pages; the `start` query parameter is the posting offset
    for page_number in range(0, postings_per_city, postings_per_page):
        URL_page_start = '&start=' + str(page_number)
        URL = URL_base + URL_location + URL_page_start

        # Without a timeout a stalled connection would block the scrape forever.
        page = requests.get(URL, timeout=30)
        time.sleep(1)  # ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, 'lxml')

        # Loop over posts/entries
        for entry in get_entries(soup):
            title = get_job_title(entry)
            company = get_company(entry)
            city, state, zipcode, neighborhood = get_location_info(entry)
            salary = get_salary(entry)
            link = get_link(entry)

            # The full description lives on the posting's own page.
            job_page = 'https://www.indeed.com/viewjob?jk=' + link
            description = get_job_description(job_page)

            # Values must follow the order of `columns`.
            records.append([title, company, city, state, zipcode, neighborhood, description, salary, link])

# Index starts at 1, matching the row labels used when rows were added one by one.
df = pd.DataFrame(records, columns=columns, index=pd.RangeIndex(1, len(records) + 1))

date = str(datetime.date.today())

# saving dataframe as local csv file
df.to_csv(date + '_indeed-ds-postings.csv', encoding='utf-8')
 

In [44]:
# Preview the first rows of the postings DataFrame.
df.head()

Unnamed: 0,job_title,company_name,city,state,zipcode,neighborhood,description,salary,link
1,Data Scientist (Full-Time),proton.ai,Boston,MA,,,*Job Description Data Scientist (Full-Time)Tea...,"$75,000 - $120,000 a year",8cacfaa3c21d0129
2,Data Scientist,BD,Boston,MA,,,Job Description Summary Digital Health is a bu...,,4631c716fc96075a
3,Data Scientist,"MIB Group, Inc.",Braintree,MA,2184.0,,POSITION SUMMARY: MIB is committed to providin...,,c920345674fcc072
4,Data Scientist,Indeed Prime,Boston,MA,,,Indeed Prime is a free service that connects q...,,8c7e78291c43f729
5,Computational Biologist/Data Scientist,Goldfinch Bio,Cambridge,MA,,,Goldfinch Bio is a biotechnology company that ...,,f14bc6dec8b4f60f


In [47]:
# Count how many postings share each distinct salary string.
df['salary'].value_counts()

                              156
$80,000 - $120,000 a year       4
$75,125 a year                  1
$115,000 - $145,000 a year      1
$93,400 - $134,100 a year       1
$35 - $38 an hour               1
$110,000 - $150,000 a year      1
$75,000 - $120,000 a year       1
$86,000 - $98,000 a year        1
$100,000 - $150,000 a year      1
$117,400 - $152,000 a year      1
Name: salary, dtype: int64