In [1]:
# This works as of 08/24/2019
# Used https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b as motivation
# Had to change a few things as indeed structure has changed a bit since

import time
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup


In [65]:
URL = 'https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10'
# conducting a request of the stated URL above:
page = requests.get(URL)
#specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')
#printing soup in a more structured tree format that makes for easier reading
# print(soup.prettify())

In [344]:
def get_entries(soup):
#     entries = soup.find_all(name='div', attrs={'class':'row'}) # This way we can have a certain criteria
    entries = soup.find_all(name='div', class_='row') # This looks cleaner
    return entries

    
def get_job_title(entry):
    job_title_container = entry.find(name='a', attrs={'data-tn-element':'jobTitle'})
    job_title = job_title_container.text
    job_title_clean = job_title.strip()
    return job_title_clean
    
#     return entry.find(name='a', attrs={'data-tn-element':'jobTitle'}).text.strip()
#     return entry.find_all(name='a', attrs={'data-tn-element':'jobTitle'})


def get_company(entry):
    company_list = []
    try:
        test_entry = entry.find(class_='company')
        company_list.append(test_entry.text.strip()) 
        company = company_list.pop()
    except:
        try:
            test_entry = entry.find(class_='result-link-source')
            company_list.append(test_entry.text.strip()) 
            company = company_list.pop()
        except:
            company = 'nothing_found'

    return company

def get_location_info(entry):
    company_info = entry.find(class_='sjcl')
    location = company_info.find(class_='location')
    try:
        sub_location = get_sub_location(location)
        return location.text.strip(sub_location), sub_location
    except:
        return location.text.strip(), ''

def get_sub_location(location):
    sub_location = location.find(name='span').text
    # Sub-locations are usually between parentheses.
    # Let's get rid of them
    sub_location = sub_location[sub_location.find("(")+1:sub_location.find(")")]
    return sub_location

def get_salary(entry):
    salary_list = []
    salary = ''
    try:
        salary_list.append(entry.find('nobr').text)
        salary = salary_list.pop()
    except:
        try:
            salary_container = entry.find(name='div', class_='salarySnippet')
            salary_temp = salary_container.find(name='span', class_='salary')
            salary_list.append(salary_temp.text.strip())
            salary = salary_list.pop()
        except:
            salary = 'no_salary_posted'
    return(salary)

def get_job_summary(entry):
    return entry.find(class_='summary').text.strip()

In [417]:
max_pages_per_city = 2
postings_per_page = 19 # 19 entries per page
postings_per_city = max_pages_per_city * postings_per_page 

# city_set = ['New+York','Chicago','Los+Angeles','Boston']
city_set = ['Boston']
columns = ['city', 'job_title', 'company_name', 'location', 'sub_location', 'summary', 'salary']


In [423]:
sample_df = pd.DataFrame(columns = columns)
# Loop over cities
for city in city_set:
    # Loop over pages
#     for page_number in range(0, 100, 10):    
    for page_number in range(0, postings_per_city, postings_per_page):
        page = requests.get('https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=' \
                            + city + '&start=' + str(page_number))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, 'lxml')
    
        # Loop over posts/entries
        entries = get_entries(soup)
        for entry in entries: 
            title = get_job_title(entry)
            company = get_company(entry)
            location, sub_location = get_location_info(entry)
            summary = get_job_summary(entry)
            salary = get_salary(entry)
            
            # Append the new row with data scraped
            num = (len(sample_df) + 1)
            sample_df.loc[num] = [city, title, company, location, sub_location, summary, salary]

# #saving sample_df as a local csv file — define your own local path to save contents 
sample_df.to_csv('test.csv', encoding='utf-8')
 

In [424]:
sample_df

Unnamed: 0,city,job_title,company_name,location,sub_location,summary,salary
1,Boston,Associate Data Scientist - INTERN - PART TIME,ENGIE Insight,"Boston, MA",,"ENGIE Insight, formerly Ecova, partners with m...",no_salary_posted
2,Boston,Data Scientist,VA Boston Healthcare System,"Boston, MA",,"VA Boston Healthcare System, Boston, Massachus...",no_salary_posted
3,Boston,Senior Data Scientist,Humana,"Boston, MA 02298",,"The Senior Data Scientist uses mathematics, st...",no_salary_posted
4,Boston,"Statistical Genetics, Data Scientist",Camp4 Therapeutics Corporation,"Cambridge, MA",,CAMP4 is seeking a Data Scientist specializing...,"Similar jobs pay $76,000 - $112,000 a year"
5,Boston,Deep Learning - Lead Data Scientist,Humana,"Boston, MA 02298",,"The Lead Data Scientist uses mathematics, stat...",no_salary_posted
6,Boston,"Assistant Director, Data Science",Liberty Mutual Insurance,"Boston, MA 02101",,The National Insurance Data Pioneering group i...,"$117,400 - $152,000 a year"
7,Boston,Lead Data Scientist,Humana,"Boston, MA 02298",,"The Lead Data Scientist uses mathematics, stat...",no_salary_posted
8,Boston,Data Scientist,"MIB Group, Inc.","Braintree, MA 02184",,MIB is committed to providing valued-added ser...,no_salary_posted
9,Boston,Data Scientist,Park Jockey,"Boston, MA",,REEF Technology is the ecosystem that connects...,no_salary_posted
10,Boston,Data Scientist,"Amazon.com Services, Inc.","Cambridge, MA",,Bachelor's degree in a relevant field. 2+ year...,no_salary_posted


# Testing Functionality

In [355]:
def extract_summary_from_result(soup): 
    summaries = []
    entries = get_entries(soup)
    for entry in entries:
        summaries.append(get_job_summary(entry))
    return(summaries)
extract_summary_from_result(soup)


['As a data scientist, you will be responsible for building and delivering innovative new products to market. For multiple integrated complex data streams.',
 'The Senior Data Scientist uses mathematics, statistics, modeling, business analysis, and technology to transform high volumes of complex data into advanced…',
 'The National Insurance Data Pioneering group is looking for data scientists to join its team and help promote a data-driven culture throughout National…',
 'Experience with data ETL, ability to overcome challenges in a fragmented data environment and ensuring data quality. 15% - Data exploration and preparation.',
 'The Principal Data Scientist will collaborate with key business leaders to understand business problems and formulate analytical solutions for problem solving…',
 'CAMP4 is seeking a Data Scientist specializing in Statistical Genetics. Collaborate and work independently, with a team of computational biologists and data…',
 '1-2 years of experience with curren

In [332]:
def extract_salary_from_result(soup):
    salaries = []
    entries = get_entries(soup)
    for entry in entries:
        salaries.append(get_salary(entry))
    return salaries
    
extract_salary_from_result(soup)

    

['no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 '$100,000 - $150,000 a year']

In [327]:
def extract_location_from_result(soup): 
    locations = []
    sub_locations = []
    rows = get_entries(soup)

    for row in rows:
        location, sub_location = get_location_info(row)
        locations.append(location)
        sub_locations.append(sub_location)
    
    return locations, sub_locations

extract_location_from_result(soup)

(['Manhattan, NY',
  'New York, NY 10112 (Midtown area)',
  'New York, NY',
  'New York, NY',
  'Purchase, NY',
  'New York, NY',
  'New York, NY 10018 (Clinton area)',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY 10017 (Midtown area)',
  'New York, NY',
  'New York, NY 10006 (Financial District area)'],
 ['',
  'Midtown area',
  '',
  '',
  '',
  '',
  'Clinton area',
  '',
  '',
  '',
  '',
  '',
  '',
  'Midtown area',
  '',
  'Financial District area'])

In [293]:
def extract_job_title_from_result(soup): 
    jobs = []
    # Loop over entries
    posts = get_entries(soup)
    for post in posts:
        job_title = get_job_title(post)
        jobs.append(job_title)
    return jobs

yobs = extract_job_title_from_result(soup)
# print(len(yobs))
yobs

['Data Scientist',
 'Data Scientist - Hux',
 'Data Scientist - Retention Analytics',
 'Data Scientist I',
 'VIE Junior Data Scientist H/F',
 'Data Scientist',
 'Data Scientist',
 'Junior Data Scientist',
 'Customer Data Scientist (New York)',
 'Data Scientist',
 'Data Scientist',
 'Junior Data Scientist',
 'Junior Data Scientist',
 'Data Scientist- Machine Learning',
 'Enterprise Data - Quant Researcher (Machine Learning)',
 'Data Scientist']

In [329]:
def extract_company_from_result(soup): 
    companies = [] 
    for entry in soup.find_all(name='div', attrs={'class':'row'}): 
        company = get_company(entry)
        companies.append(company) 
    return companies

extract_company_from_result(soup)

['VISITING NURSE SERVICE OF NEW YORK',
 'Deloitte',
 'Disney Streaming Services',
 'AIG',
 'Atos',
 'Butterfly Network',
 'Custoria',
 'Remedy BPCI Partners, LLC.',
 'h2o.ai',
 'ERP Consulting',
 'DataDog',
 'Remedy Partners',
 'BerlandTeam',
 'Covera Health',
 'Bloomberg',
 'LEGENDS']

# Possible Useful Code

In [198]:
#             test_entry = entry.find_all(name='span', attrs={'class':'result-link-source'})