In [4]:
# This works as of 08/24/2019
# Used https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b as motivation
# Had to change a few things because Indeed's page structure has changed since that article was written
import datetime
import sys
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [13]:
def get_entries(soup):
    """Return every ``<div>`` in ``soup`` whose class is ``row``.

    Each returned element is treated by the caller as one job posting.
    """
    # Keyword form of the query; equivalent to attrs={'class': 'row'}.
    query = {'name': 'div', 'class_': 'row'}
    return soup.find_all(**query)
    
def get_job_title(entry):
    """Return the posting's title with surrounding whitespace removed.

    The title is the text of the first ``<a>`` element inside ``entry`` whose
    ``data-tn-element`` attribute equals ``jobTitle``.
    """
    title_anchor = entry.find(name='a', attrs={'data-tn-element': 'jobTitle'})
    return title_anchor.text.strip()

def get_company(entry):
    """Return the company name for a posting, or ``' '`` when none is found.

    The name is looked up first in the element with class ``company`` and
    then, as a fallback, in the element with class ``result-link-source``.
    The text of the first element found is returned stripped of surrounding
    whitespace.

    Missing elements are detected with explicit ``None`` checks rather than a
    bare ``except``, so unrelated errors (including ``KeyboardInterrupt``
    while the scrape is running) are no longer silently swallowed.
    """
    for css_class in ('company', 'result-link-source'):
        node = entry.find(class_=css_class)
        if node is not None:
            return node.text.strip()
    # Placeholder used when a posting carries no company name.
    return ' '

def get_location_info(entry):
    """Return ``(location, neighborhood)`` for a posting.

    The location text is read from the ``location`` element inside the
    posting's ``sjcl`` element. When that element contains a ``<span>`` (the
    neighborhood, e.g. ``(Central area)``), the span's text is removed from
    the end of the location and returned separately without its surrounding
    parentheses. When there is no neighborhood, ``' '`` is returned for it.
    """
    company_info = entry.find(class_='sjcl')
    location_info = company_info.find(class_='location')

    location = location_info.text.strip()

    # Extract neighborhood info if it's there.
    neighborhood_text = get_neighborhood(location_info).strip()
    if not neighborhood_text:
        return location, ' '

    # Remove the neighborhood as a *suffix*. str.rstrip(chars) would treat it
    # as a set of characters and could also eat the tail of the location
    # itself (e.g. "Somerville, MA" -> "Somerville, M").
    if location.endswith(neighborhood_text):
        location = location[:-len(neighborhood_text)].rstrip()
    return location, neighborhood_text.strip('()')


def get_neighborhood(location_info):
    """Return the text of the first ``<span>`` in ``location_info``.

    Returns ``' '`` when there is no such span.
    """
    neighborhood_info = location_info.find(name='span')
    if neighborhood_info:
        return neighborhood_info.text
    return ' '
    
def get_salary(entry):
    """Return the salary text for a posting, or ``' '`` when none is found.

    Two page layouts are tried in order:

    1. the first ``<nobr>`` element inside ``entry``;
    2. a ``<span class="salary">`` inside a ``<div class="salarySnippet">``.

    The text of the first match is returned stripped of surrounding
    whitespace. Missing elements are detected with explicit ``None`` checks
    rather than a bare ``except``, so unrelated errors (including
    ``KeyboardInterrupt``) are no longer silently swallowed.
    """
    nobr = entry.find('nobr')
    if nobr is not None:
        return nobr.text.strip()

    salary_container = entry.find(name='div', class_='salarySnippet')
    if salary_container is not None:
        salary_span = salary_container.find(name='span', class_='salary')
        if salary_span is not None:
            return salary_span.text.strip()

    # Placeholder used when a posting lists no salary.
    return ' '

def get_job_summary(entry):
    """Return the stripped text of the element with class ``summary``."""
    summary_node = entry.find(class_='summary')
    summary_text = summary_node.text
    return summary_text.strip()

def get_link(entry):
    """Return the value of the entry's ``data-jk`` attribute.

    The caller appends this value to the ``viewjob?jk=`` URL to reach the
    posting's own page.
    """
    return entry['data-jk']

def get_job_description(job_page):
    """Download a posting's page and return its description as one line.

    Parameters
    ----------
    job_page : str
        URL of the posting page to fetch.

    Returns
    -------
    str
        Text of the ``<div class="jobsearch-jobDescriptionText">`` element,
        stripped, with newlines and tabs replaced by spaces. When the page has
        no such element, ``' '`` is returned (the same placeholder
        ``get_company`` and ``get_salary`` use) instead of raising
        ``AttributeError`` and aborting the whole scrape.
    """
    page = requests.get(job_page)
    time.sleep(1)  # ensuring at least 1 second between page grabs
    soup = BeautifulSoup(page.text, 'lxml')

    # The same element also carries id="jobDescriptionText".
    description = soup.find(name='div', class_='jobsearch-jobDescriptionText')
    if description is None:
        return ' '

    text = description.text.strip()
    return text.replace('\n', ' ').replace('\t', ' ')

In [46]:
# Upper bound on the number of result pages requested for each city.
max_pages_per_city = 60
# Step between the `start` offsets of consecutive result pages.
# NOTE(review): the value is 17 although the earlier note here said Indeed's
# default is 19 entries per page; confirm which is intended.
POSTINGS_PER_PAGE = 17
# Exclusive upper bound for the `start` offset when looping over pages.
postings_per_city = max_pages_per_city * POSTINGS_PER_PAGE 

# Cities to scrape, written the way they are appended to the URL's `l=`
# parameter. Other candidates: 'New+York', 'Chicago', 'Los+Angeles'.
city_set = ['Boston']
# Column order of the results table; each scraped row is built in this order.
# (An earlier layout split location into city / state / zipcode.)
columns = ['job_title', 'company_name', 'location', 'neighborhood', 'description', 'salary', 'link']

In [48]:
# Scrape every result page of every city, visit each posting's own page for
# its full description, and save the collected table as a dated CSV file.

URL_base = 'https://www.indeed.com/jobs?q=data+scientist+%2420%2C000'

# Rows are collected in a plain list and turned into a DataFrame once;
# enlarging a DataFrame with `df.loc[n] = ...` gets slower as it grows.
rows = []

try:
    # Loop over cities
    for city_targ in city_set:
        URL_location = '&l=' + city_targ
        # Loop over pages
        for page_number in range(0, postings_per_city, POSTINGS_PER_PAGE):
            URL_page_start = '&start=' + str(page_number)
            URL = URL_base + URL_location + URL_page_start

            page = requests.get(URL)
            time.sleep(1)  # ensuring at least 1 second between page grabs
            soup = BeautifulSoup(page.text, 'lxml')

            # Loop over posts/entries
            entries = get_entries(soup)
            for i, entry in enumerate(entries):
                # '\r' rewrites the same console line to show progress.
                sys.stdout.write('\r' + ' page: ' + str(page_number//POSTINGS_PER_PAGE)
                                 + ' / ' + str(max_pages_per_city)
                                 + ', job posting: ' + str(i) + ' / ' + str(len(entries))
                )
                title = get_job_title(entry)
                company = get_company(entry)
                location, neighborhood = get_location_info(entry)
                salary = get_salary(entry)
                link = get_link(entry)

                # The full description lives on the posting's own page.
                job_page = 'https://www.indeed.com/viewjob?jk=' + link
                description = get_job_description(job_page)

                rows.append([title, company, location, neighborhood, description, salary, link])
finally:
    # Build the frame even when the scrape is interrupted or fails part-way,
    # so the postings collected so far remain available as `df`.
    df = pd.DataFrame(rows, columns=columns)
    # Keep the 1-based row labels the row-by-row version produced.
    df.index = range(1, len(df) + 1)

date = str(datetime.date.today())

# saving dataframe as local csv file
df.to_csv(date + '_indeed-ds-postings.csv', encoding='utf-8')


 page: 0 / 60, job posting: 15 / 16
 page: 17 / 60, job posting: 16 / 17
 page: 34 / 60, job posting: 15 / 16
 page: 51 / 60, job posting: 17 / 18
 page: 68 / 60, job posting: 17 / 18
 page: 85 / 60, job posting: 17 / 18
 page: 102 / 60, job posting: 15 / 16
 page: 119 / 60, job posting: 16 / 17
 page: 136 / 60, job posting: 14 / 15
 page: 153 / 60, job posting: 15 / 16
 page: 170 / 60, job posting: 17 / 18
 page: 187 / 60, job posting: 18 / 19
 page: 204 / 60, job posting: 17 / 18
 page: 221 / 60, job posting: 14 / 15
 page: 238 / 60, job posting: 14 / 15
 page: 255 / 60, job posting: 15 / 16
 page: 272 / 60, job posting: 15 / 16
 page: 289 / 60, job posting: 14 / 15
 page: 306 / 60, job posting: 14 / 15
 page: 323 / 60, job posting: 16 / 17
 page: 340 / 60, job posting: 15 / 16
 page: 357 / 60, job posting: 16 / 17
 page: 374 / 60, job posting: 16 / 17
 page: 391 / 60, job posting: 16 / 17
 page: 408 / 60, job posting: 16 / 17
 page: 425 / 60, job posting: 16 / 17
 page: 442 / 60, j

In [49]:
# Preview the first ten scraped postings.
df.head(10)

Unnamed: 0,job_title,company_name,location,neighborhood,description,salary,link
1,"Statistical Genetics, Data Scientist",Camp4 Therapeutics Corporation,"Cambridge, MA",,Job Description CAMP4 is seeking a Data Scient...,,03bf439bfa53ee13
2,Data Scientist,"MIB Group, Inc.","Braintree, MA 02184",,POSITION SUMMARY: MIB is committed to providin...,,c920345674fcc072
3,Translational Medicine and Data Science Expert,Novartis,"Cambridge, MA",,20 petabytes of data. 2 million patient-years ...,,4fdbb9b5cb09e0d2
4,Computational Biologist/Data Scientist,Goldfinch Bio,"Cambridge, MA",,Goldfinch Bio is a biotechnology company that ...,,f14bc6dec8b4f60f
5,Principal Data Scientist,Verizon,"Boston, MA 02109",Central area,What you’ll be doing... As a Principal Data Sc...,,8d13390a342786e0
6,Data Scientist / Machine Learning Architect / ...,Profitect Inc.,"Burlington, MA",,Profitect’s Research and Development team is l...,,b7ae218bbb0b2a50
7,Data Scientist (Full-Time),proton.ai,"Boston, MA",,*Job Description Data Scientist (Full-Time)Tea...,"$75,000 - $120,000 a year",8cacfaa3c21d0129
8,"Data Scientist (Intern, Part-Time)",proton.ai,"Boston, MA",,"Data Scientist (Intern, Part-Time)Team: Data ...",,1a087273ecb3d9e3
9,Sr. Data Scientist,Cubic IT,"Boston, MA",,Job SummaryLooking for an experienced Machine ...,$90 - $100 a day,8b0554b4567cb6b0
10,Data Scientist,Park Jockey,"Boston, MA",,Who You’ll Work For REEF Technology is the eco...,,0cc7c0afb827e835


In [51]:
# Frequency of each distinct salary string. Postings for which get_salary
# found nothing carry the ' ' placeholder and are counted together.
df['salary'].value_counts()

                                               975
Similar jobs pay $76,000 - $112,000 a year       3
$20.62 an hour                                   2
$69,929 - $102,939 a year                        2
$50,794 - $71,864 a year                         1
$103,106 - $134,038 a year                       1
$35 - $40 an hour                                1
$15 - $20 an hour                                1
$50,000 - $60,000 a year                         1
$100,000 a year                                  1
$150,000 - $220,000 a year                       1
Similar jobs pay $103,000 - $152,000 a year      1
Similar jobs pay $102,000 - $150,000 a year      1
$100,000 - $150,000 a year                       1
$61,982 - $83,592 a year                         1
$120,000 - $150,000 a year                       1
$63,189 - $92,072 a year                         1
$117,400 - $152,000 a year                       1
$47,378 - $68,083 a year                         1
$90 - $100 a day               

In [45]:
# (rows, columns) of the scraped table.
# NOTE(review): this cell's execution count (45) is lower than the scrape
# cell's (48), so the recorded output may come from an earlier run; re-run
# after scraping to confirm.
df.shape

(515, 7)