In [505]:
# This works as of 08/24/2019
# Used https://medium.com/@msalmon00/web-scraping-job-postings-from-indeed-96bd588dcb4b as motivation
# Had to change a few things as indeed structure has changed a bit since

import time
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup


In [529]:
def get_entries(soup):
    #     entries = soup.find_all(name='div', attrs={'class':'row'}) # This way we can have a certain criteria
    entries = soup.find_all(name='div', class_='row') # This looks cleaner
    return entries
    
def get_job_title(entry):
    job_title_container = entry.find(name='a', attrs={'data-tn-element':'jobTitle'})
    job_title = job_title_container.text
    return job_title.strip()

def get_company(entry):
    company_list = []
    try:
        test_entry = entry.find(class_='company')
        company_list.append(test_entry.text.strip()) 
        company = company_list.pop()
    except:
        try:
            test_entry = entry.find(class_='result-link-source')
            company_list.append(test_entry.text.strip()) 
            company = company_list.pop()
        except:
            company = ''
    return company

def get_location_info(entry):
    company_info = entry.find(class_='sjcl')
    location_info = company_info.find(class_='location')
    try:
        sub_location = get_sub_location(location_info)
        # Sub-locations are usually between parentheses.
        # Let's get rid of them
        sub_location = sub_location[sub_location.find('(')+1:sub_location.find(')')]      
        location = location_info.text.strip(sub_location)
        return location, sub_location
    except:
        return location_info.text.strip(), ''

def get_sub_location(location_info):
    sub_location = location_info.find(name='span').text
    return sub_location

def get_salary(entry):
    salary_list = []
    salary = ''
    try:
        salary_list.append(entry.find('nobr').text)
        salary = salary_list.pop()
    except:
        try:
            salary_container = entry.find(name='div', class_='salarySnippet')
            salary_temp = salary_container.find(name='span', class_='salary')
            salary_list.append(salary_temp.text.strip())
            salary = salary_list.pop()
        except:
            salary = ''
    return salary

def get_job_summary(entry):
    return entry.find(class_='summary').text.strip()

def get_link(entry):
    link = entry['data-jk']
    return link 

def get_job_description(job_page):
    page = requests.get(job_page)
    time.sleep(1)  #ensuring at least 1 second between page grabs
    soup = BeautifulSoup(page.text, 'lxml')

    # Loop over posts/entries
    description = soup.find(name='div', id='jobDescriptionText')
    
    # This would be useful if all of the postings were the same format
    # The idea is to get rid of the redundant information like company name and location
#     redundant_info = description.find(name='p').text
#     print('REDUNDANT: ', redundant_info)
#     description = description.find_all(name='p')
#     description = description[1:]
#     description = [item for sublist in description[1:] for item in sublist]

    return description.text.strip()


In [533]:
max_pages_per_city = 2
postings_per_page = 19 # Indeed's default 19 entries per page
postings_per_city = max_pages_per_city * postings_per_page 

city_set = ['New+York','Chicago','Los+Angeles','Boston']
# city_set = ['Boston']
columns = ['city', 'job_title', 'company_name', 'location', 'sub_location', 'summary', 'salary', 'link']


In [534]:
sample_df = pd.DataFrame(columns = columns)
# Loop over cities
for city in city_set:
    # Loop over pages
#     for page_number in range(0, 100, 10):    
    for page_number in range(0, postings_per_city, postings_per_page):
        page = requests.get('https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=' \
                            + city + '&start=' + str(page_number))
        time.sleep(1)  #ensuring at least 1 second between page grabs
        soup = BeautifulSoup(page.text, 'lxml')
    
        # Loop over posts/entries
        entries = get_entries(soup)
        for entry in entries: 
            title = get_job_title(entry)
            company = get_company(entry)
            location, sub_location = get_location_info(entry)
#             summary = get_job_summary(entry)
            salary = get_salary(entry)
            link = get_link(entry)
            
            job_page = 'https://www.indeed.com/viewjob?jk='+link
            summary = get_job_description(job_page)
            # Append the new row with data scraped
            num = (len(sample_df) + 1)
            sample_df.loc[num] = [city, title, company, location, sub_location, summary, salary, link]

# #saving sample_df as a local csv file — define your own local path to save contents 
sample_df.to_csv('test.csv', encoding='utf-8')
 

In [535]:
sample_df

Unnamed: 0,city,job_title,company_name,location,sub_location,summary,salary,link
1,New+York,Enterprise Data - Quant Researcher (Machine Le...,Bloomberg,"New York, NY",,"We're Bloomberg Enterprise Data - fast paced, ...",,906cee8bc6ee86cd
2,New+York,Data Scientist,VISITING NURSE SERVICE OF NEW YORK,"Manhattan, NY",,Overview\nThe Visiting Nurse Service of New Yo...,,5ef7293cc7779dd3
3,New+York,Research and Development Scientist,"Chembio Diagnostic Systems, Inc.","Medford, NY 11763",,We Improve Lives—that’s what drives us. It’s w...,,90bd8ed1add59d86
4,New+York,Data Scientist,Disney Streaming Services,"New York, NY",,The Data Scientist is a critical position with...,,fbbeed55473dfd05
5,New+York,Bench Scientist,Atlas,"Pearl River, NY",,This is a laboratory-based position to support...,$38 - $40 an hour,55ae9b75a34bf2d3
6,New+York,Data Scientist,New York City DEPT OF INFO TECH & TELECOMM,"Manhattan, NY",,The Mayor’s Office of the Chief Technology Off...,"$63,031 - $145,000 a year",0da0d97fcb637195
7,New+York,Data Scientist,Digitalogy,"New York, NY",,What you will do\n\nResponsible for assisting ...,$50 - $80 an hour,83e3f665b22e5695
8,New+York,MODA Data Scientist,New York City DEPT OF INFO TECH & TELECOMM,"Manhattan, NY",,The mission of the Mayor’s Office of Data Anal...,"$52,524 - $79,000 a year",387ffeda2dc9dd3c
9,New+York,Data Scientist,PepsiCo,"New York, NY",,PepsiCo operates in an environment undergoing ...,,b6e400d193c5beb4
10,New+York,Data Scientist,Essani International Client,"New York, NY",,Role: Data ScientistLevel: Junior/Mid-levelDur...,$60 - $70 an hour,b28c68ec83a3f1b2


# Testing Functionality

In [65]:
URL = 'https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10'
# conducting a request of the stated URL above:
page = requests.get(URL)
#specifying a desired format of “page” using the html parser - this allows python to read the various components of the page, rather than treating it as one long string.
soup = BeautifulSoup(page.text, 'html.parser')
#printing soup in a more structured tree format that makes for easier reading
# print(soup.prettify())

In [486]:
def extract_link_from_result(soup): 
    links = []
    entries = get_entries(soup)
    for entry in entries:
        links.append(get_link(entry))
    return(links)

In [355]:
def extract_summary_from_result(soup): 
    summaries = []
    entries = get_entries(soup)
    for entry in entries:
        summaries.append(get_job_summary(entry))
    return(summaries)
extract_summary_from_result(soup)


['As a data scientist, you will be responsible for building and delivering innovative new products to market. For multiple integrated complex data streams.',
 'The Senior Data Scientist uses mathematics, statistics, modeling, business analysis, and technology to transform high volumes of complex data into advanced…',
 'The National Insurance Data Pioneering group is looking for data scientists to join its team and help promote a data-driven culture throughout National…',
 'Experience with data ETL, ability to overcome challenges in a fragmented data environment and ensuring data quality. 15% - Data exploration and preparation.',
 'The Principal Data Scientist will collaborate with key business leaders to understand business problems and formulate analytical solutions for problem solving…',
 'CAMP4 is seeking a Data Scientist specializing in Statistical Genetics. Collaborate and work independently, with a team of computational biologists and data…',
 '1-2 years of experience with curren

In [332]:
def extract_salary_from_result(soup):
    salaries = []
    entries = get_entries(soup)
    for entry in entries:
        salaries.append(get_salary(entry))
    return salaries
    
extract_salary_from_result(soup)

    

['no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 'no_salary_posted',
 '$100,000 - $150,000 a year']

In [327]:
def extract_location_from_result(soup): 
    locations = []
    sub_locations = []
    rows = get_entries(soup)

    for row in rows:
        location, sub_location = get_location_info(row)
        locations.append(location)
        sub_locations.append(sub_location)
    
    return locations, sub_locations

extract_location_from_result(soup)

(['Manhattan, NY',
  'New York, NY 10112 (Midtown area)',
  'New York, NY',
  'New York, NY',
  'Purchase, NY',
  'New York, NY',
  'New York, NY 10018 (Clinton area)',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY',
  'New York, NY 10017 (Midtown area)',
  'New York, NY',
  'New York, NY 10006 (Financial District area)'],
 ['',
  'Midtown area',
  '',
  '',
  '',
  '',
  'Clinton area',
  '',
  '',
  '',
  '',
  '',
  '',
  'Midtown area',
  '',
  'Financial District area'])

In [527]:
get_job_description('https://www.indeed.com/viewjob?jk=58bae9b6b4f17d90')

REDUNDANT:  VA Boston Healthcare System, Boston, Massachusetts


"Come join us in Boston at The VA Boston Healthcare System (VABHS), New England's premier referral center for Veterans' healthcare. VABHS is the largest recipient of VA research funds in the nation and ranked #3 among 146 VA facilities nationally for overall employee satisfaction. The Massachusetts Veterans Epidemiological Research and Information Center (MAVERIC) is an interdisciplinary research organization in VABHS. Its Informatics division is seeking a full-time Data Scientist who love working with data and want to use their data science skills to help VA researchers conduct high-impact research that expands our understanding of human health and improves healthcare outcomes for Veterans. You will join a strong, collegial, friendly, and talented team of computer scientists, statisticians, and clinicians at the VA.Core responsibilitiesUse advanced data analysis methods (from machine learning, artificial intelligence, applied statistics, etc) to assist in building predictive models of

In [458]:
def extract_job_title_from_result(soup): 
    jobs = []
    # Loop over entries
    posts = get_entries(soup)
    for post in posts:
        job_title = get_job_title(post)
        jobs.append(job_title)
    return jobs

yobs = extract_job_title_from_result(soup)
# print(len(yobs))
yobs

/pagead/clk?mo=r&ad=-6NYlbfkN0Alc63mq7XbG_XJWtX8RBizZIwo5v3DdAY7_u3VbNSkJxVCY8YYD1XnC0LFA3U1jqXXJ3CwvsNmWRCUwfjogvYRMAC1zoKWrQruN-h5adfVpK3QtGLm_J7y0Nfc1b7UyS0PvspOMCwADQmAx0lZt2z3qf4eqYMWcLhKQjy9v-2OrynzMC6J-JiB1LU2JMnuLn0yiNMrKVuvM7JOYZqOYro4TEwS0otKLHH8DGLVfdVdXcZ7GTw4JbLpeQ-jQa7xFdo9aGta0DNafh1Jy3kVptHalREU-1fHNFr6tbUSQnwWAxt59puHK9vkmaLGEFSP1hI0WZxe6_I_6P21dOomj3-RSJDQcH7-k4giso_qVwH34lnIAVex_W8_aOPJxtFBACo7JHWV9eXyP5Q7qlKEbLpw&p=0&fvj=0&vjs=3
/pagead/clk?mo=r&ad=-6NYlbfkN0Alc63mq7XbG_XJWtX8RBizZIwo5v3DdAY7_u3VbNSkJyf6IojzDTbtcfEVZqAqVQOep1jEFzURDGO6s3JhXIcnbUPsfWI8ZQ8drUW0i30nmFJg4ufmjTupvUsdZ8n7GMzdiNYeXgj5EKqHonI0mU6qtn7TTBBLWqhaG-b5qhWBMSjCqzM8ttLTzD3MdOl8__-XLrpIt4aHJ4bWQQ1qMajSlPrIwwG0hT2hlLZEPXBYv8Xo5m-WVA1UT9CIxAXUlMdpirDc5gpZsf3c25_T1QrmhqtgYw8qgCrs8WQH0Zvy_s6UXVs-6ht_thy8YdtdwLtsmGnkthBn9btTKH5AJ_V1bELt3XM03inTev2P2MhHDuAWDlt6vhISYzzkHc1BWmp2pguJm0wUvpy4Vv5OXaQevLQEHF1e9DQ=&p=1&fvj=0&vjs=3
/pagead/clk?mo=r&ad=-6NYlbfkN0A6-B3EZN7nxI2SOKd2oJXZ-wKsUrEyzb6go4T2DhbT4oonsM92tG

['Translational Research Scientist (with proteomic mass spec e...',
 'Scientist, Neurodegenerative Diseases',
 'Data Scientist',
 'Assistant Director, Data Science',
 'Data Scientist',
 'Data Scientist',
 'Linked Data Consultant / Field Application Scientist - Junio...',
 'Data Scientist',
 'Data Scientist',
 'LSP Board Scientist/Investigator',
 'Data Scientist',
 'Data Scientist (Labs)',
 'Data Scientist, NLP',
 'Perception Scientist for Marine Autonomy',
 'Data Analyst II',
 'Data Scientist']

In [329]:
def extract_company_from_result(soup): 
    companies = [] 
    for entry in soup.find_all(name='div', attrs={'class':'row'}): 
        company = get_company(entry)
        companies.append(company) 
    return companies

extract_company_from_result(soup)

['VISITING NURSE SERVICE OF NEW YORK',
 'Deloitte',
 'Disney Streaming Services',
 'AIG',
 'Atos',
 'Butterfly Network',
 'Custoria',
 'Remedy BPCI Partners, LLC.',
 'h2o.ai',
 'ERP Consulting',
 'DataDog',
 'Remedy Partners',
 'BerlandTeam',
 'Covera Health',
 'Bloomberg',
 'LEGENDS']

# Possible Useful Code

In [198]:
#             test_entry = entry.find_all(name='span', attrs={'class':'result-link-source'})
    
#     return entry.find(name='a', attrs={'data-tn-element':'jobTitle'}).text.strip()
#     return entry.find_all(name='a', attrs={'data-tn-element':'jobTitle'})
