<h1><center>Indeed Job Scraping</center></h1>

<h2>Part 1: Inspect Indeed Website</h2>

<h3>1.1 Import Libraries</h3>

In [1]:
import re
from time import time
from urllib.parse import quote

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

<h3>1.2 Format URL</h3>

In [2]:
# Search terms for the example query.
job_name = 'Data Scientist'
location = 'Irvine'

# Encode spaces in both terms so they can be placed in the query string.
job_name, location = (term.replace(' ', '%20') for term in (job_name, location))

url = ''.join(['https://www.indeed.com/jobs?q=', job_name, '&l=', location])

print(url)

https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine


In [4]:
def get_url(job_name, location):
    """Build the Indeed search URL for a job title and a location.

    Both arguments are raw, un-encoded text. Every reserved character is
    percent-encoded (not only spaces), so terms such as 'R&D' or 'C++'
    cannot break out of their query parameter.

    Parameters
    ----------
    job_name : str
        Search phrase placed in the `q` parameter.
    location : str
        Place name placed in the `l` parameter.

    Returns
    -------
    str
        The search URL.
    """
    # safe='' also encodes '/', which quote() would otherwise leave as-is.
    job_name = quote(job_name, safe='')
    location = quote(location, safe='')
    url = 'https://www.indeed.com/jobs?q=' + job_name + '&l=' + location
    return url

<h3>1.3 Get Page</h3>

In [51]:
# Fetch the search page once and report whether it came back with HTTP 200,
# followed by the elapsed wall-clock time in seconds.
started = time()
page = requests.get(url)
status_message = 'Page retrieved' if page.status_code == 200 else 'An error occurred.'
print(status_message)
print(time() - started)

Page retrieved
0.3796968460083008


<h2>Part 2: Scrape HTML Contents</h2>

<h3>2.1 Parse Page using BeautifulSoup</h3>

In [52]:
# Parse the downloaded page with the built-in HTML parser and time the parse.
started = time()
soup = BeautifulSoup(page.content, 'html.parser')
print(time() - started)

0.10176229476928711


<h3>2.2 Find Results Container</h3>

In [53]:
# Look up the element with id 'resultsBody' in the parsed page (None if absent).
results = soup.find(id='resultsBody')

In [12]:
def extract_results(url, timeout=10):
    """Download a search page and return its results container.

    Parameters
    ----------
    url : str
        Address of the search page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    bs4.element.Tag or None
        The element with id 'resultsBody'. None when the request fails,
        the server answers with an error status, or the element is absent.
    """
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        page = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception instead of parsing an error page.
        page.raise_for_status()
    except requests.RequestException:
        print('Cannot access the website.')
        return None
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup.find(id='resultsBody')

In [14]:
# Time one download-and-parse round trip through extract_results.
started = time()
extract_results(url)
elapsed = time() - started
print(elapsed)

0.4417150020599365


<h3>2.3 Find Number of Pages</h3>

In [25]:
pagecount_text = soup.find('div',id='searchCountPages').text
index_of = pagecount_text.index('of')
# Keep digits only from the text after "of": stripping just letters would
# leave thousands separators behind ("1,234") and make int() raise.
num_job = int(re.sub(r'[^0-9]','',pagecount_text[index_of+3:]))
# Number of additional result pages beyond the first, at 10 jobs per page.
num_page = num_job // 10

In [12]:
# First page is the bare search URL; every further page adds a start offset
# that grows in steps of 10.
pages = [url]
pages.extend(url + '&start=' + str(offset)
             for offset in range(10, num_page * 10 + 1, 10))
print(pages)

['https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=10', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=20', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=30', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=40', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=50', 'https://www.indeed.com/jobs?q=Data%20Scientist&l=Irvine&start=60']


In [18]:
def get_page_count(url):
    """Read the total job count from a search page.

    Parameters
    ----------
    url : str
        Address of the search page.

    Returns
    -------
    tuple of (int, int) or None
        (num_page, num_job), where num_page is num_job // 10. None, after
        printing a message, when the counter element is missing or its
        text cannot be parsed.
    """
    results = extract_results(url)
    try:
        pagecount_text = results.find('div',id='searchCountPages').text
        count_text = pagecount_text[pagecount_text.index('of')+3:]
        # Keep digits only: counts with thousands separators ("1,234")
        # would otherwise make int() raise.
        num_job = int(re.sub(r'[^0-9]','',count_text))
    except (AttributeError, ValueError):
        # AttributeError: no results container or no counter element.
        # ValueError: no "of" in the text, or no digits after it.
        print('Cannot access the website.')
        return None
    num_page = num_job // 10
    return num_page, num_job

def get_pages(url):
    """List the URLs of all result pages for a search.

    Parameters
    ----------
    url : str
        Address of the first search page.

    Returns
    -------
    list of str or None
        `url` followed by `url + '&start=N'` for N = 10, 20, ...,
        num_page * 10. None, after printing a message, when the page
        count cannot be determined.
    """
    page_count = get_page_count(url)
    # Check for the failure value explicitly instead of relying on a
    # blanket except around the tuple unpacking.
    if page_count is None:
        print('Cannot access the website.')
        return None
    num_page = page_count[0]
    pages = [url]
    for i in range(1,num_page+1):
        pages.append(url+'&start='+str(i*10))
    return pages

In [19]:
# Time a single call to get_page_count.
started = time()
get_page_count(url)
elapsed = time() - started
print(elapsed)

0.4950752258300781


In [20]:
# Time a single call to get_pages.
started = time()
get_pages(url)
elapsed = time() - started
print(elapsed)

0.6314396858215332


<h3>2.4 Find Job Card</h3>

In [28]:
# Collect every job card <div> inside the results container and time the search.
started = time()
jobs = results.find_all('div', class_='jobsearch-SerpJobCard unifiedRow row result')
elapsed = time() - started
print(elapsed)

0.009655237197875977


In [32]:
# Take the first job card from `jobs` as a sample to work with.
job = jobs[0]

In [33]:
t = time()

# Store the title instead of discarding it, the same way the URL, company
# and location are stored when they are extracted.
job_title = job.find('h2',class_='title')\
               .find('a',{'data-tn-element':'jobTitle'}).text.replace('\n','')

print(time()-t)

0.0


In [37]:
# Read the relative link from the title anchor and make it absolute.
started = time()
title_anchor = job.find('h2', class_='title').find('a', {'data-tn-element': 'jobTitle'})
job_url = 'https://www.indeed.com' + title_anchor['href']
print(time() - started)

0.0


In [35]:
# Company name: text of the <span class="company"> with newlines removed.
started = time()
company_tag = job.find('span', class_='company')
company = company_tag.text.replace('\n', '')
print(time() - started)

0.0


In [36]:
# Location: the 'data-rc-loc' attribute of the <div class="recJobLoc">,
# with newlines removed.
started = time()
location_tag = job.find('div', class_='recJobLoc')
location = location_tag['data-rc-loc'].replace('\n', '')
print(time() - started)

0.0


In [46]:
t = time()

# Download the posting's own page and pull out the description container.
try:
    job_page = requests.get(job_url)
    job_soup = BeautifulSoup(job_page.content,'html.parser')
    job_description = job_soup.find('div',class_='jobsearch-jobDescriptionText')
except requests.RequestException:
    # Identify the failed posting by its URL, which is what this cell
    # requested; catch network errors only so other failures stay visible.
    print('An error occurred when accessing the page for job',job_url)

print(time()-t)

0.5891828536987305


In [15]:
# Column-oriented store: one list per field, one entry per job card.
job_dict = {
    'title':[],
    'company':[],
    'location':[],
    'url':[],
    'description':[]
}

for job in jobs:
    title_link = job.find('h2',class_='title')\
                    .find('a',{'data-tn-element':'jobTitle'})
    job_title = title_link.text.replace('\n','')
    job_url = 'https://www.indeed.com'+title_link['href']
    company = job.find('span',class_='company').text.replace('\n','')
    location = job.find('div',class_='recJobLoc')['data-rc-loc'].replace('\n','')

    # Reset for every card so a failed fetch records None instead of
    # silently reusing the previous job's description.
    job_description = None

    #Access job url page
    job_page = requests.get(job_url)
    if job_page.status_code == 200:
        job_soup = BeautifulSoup(job_page.content,'html.parser')
        job_description = job_soup.find('div',class_='jobsearch-jobDescriptionText')
    else:
        print('An error occurred when accessing the page for job',job_title)
    job_dict['title'].append(job_title)
    job_dict['company'].append(company)
    job_dict['location'].append(location)
    job_dict['url'].append(job_url)
    job_dict['description'].append(job_description)

```Python
def get_description(job):
    """Return the cleaned description text that precedes the qualifications.

    The input is converted with str(), every HTML tag is replaced by a
    newline, each line is split into sentences, and each sentence is
    lower-cased with all non-letter characters turned into spaces. The
    result is cut at the first sentence that contains one of the
    qualification markers or that is exactly 'skill'.

    Parameters
    ----------
    job : object
        Description markup; anything str() can render.

    Returns
    -------
    str
        The kept sentences joined by single spaces ('' when there are none).
    """
    # 'what you ll' matches "what you'll" after the apostrophe became a space.
    qual_list = ['requirements','qualifications','required ','what you ll']
    text = re.sub(r'<.+?>','\n',str(job))
    description = []
    for p in text.split('\n'):
        description += [re.sub('[^A-Za-z]',' ',s).strip().lower() for s in sent_tokenize(p)]
    # Default: keep everything. Setting this before the loop also covers
    # an empty sentence list, which used to leave end_index unbound.
    end_index = len(description)
    for index, desc in enumerate(description):
        # Stop at the first marker; a bare 'skill' heading now ends the
        # scan too instead of being overwritten by the next iteration.
        if desc == 'skill' or any(qual in desc for qual in qual_list):
            end_index = index
            break
    return ' '.join(description[:end_index])
```

In [58]:
def get_jobs(url, limit = 50):
    """Scrape job postings from an Indeed search, following result pages.

    Parameters
    ----------
    url : str
        Address of the first search page.
    limit : int, optional
        Maximum number of postings to collect (default 50). Capped at the
        job count reported by the site.

    Returns
    -------
    dict
        Keys 'title', 'company', 'location', 'url', 'description', each a
        list with one entry per posting. The lists are empty when the
        search page cannot be read. A description is None when the
        posting's page could not be downloaded.
    """
    job_dict = {
        'title':[],
        'company':[],
        'location':[],
        'url':[],
        'description':[]
    }
    pages = get_pages(url)
    page_count = get_page_count(url)
    # Both helpers print their own message and return None on failure.
    if pages is None or page_count is None:
        return job_dict
    num_job = page_count[1]
    limit = min(limit, num_job)
    count = 0
    for page in pages:
        # Stop before downloading further pages once enough jobs are in.
        if count >= limit:
            break
        results = extract_results(page)
        if results is None:
            # This page could not be fetched or has no results container.
            continue
        jobs = results.find_all('div',class_='jobsearch-SerpJobCard unifiedRow row result')
        for job in jobs:
            if count >= limit:
                break
            title_link = job.find('h2',class_='title')\
                            .find('a',{'data-tn-element':'jobTitle'})
            job_title = title_link.text.replace('\n','')
            job_url = 'https://www.indeed.com'+title_link['href']
            company = job.find('span',class_='company').text.replace('\n','')
            location = job.find('div',class_='recJobLoc')['data-rc-loc'].replace('\n','')

            #Access job url page
            # Reset per posting so a failed download records None rather
            # than the previous posting's text.
            job_description = None
            try:
                job_page = requests.get(job_url)
                job_soup = BeautifulSoup(job_page.content,'html.parser')
                job_description = job_soup.find('div',class_='jobsearch-jobDescriptionText')
                # Replace every tag with a newline, leaving plain text.
                job_description = re.sub(r'<.+?>','\n',str(job_description))
            except requests.RequestException:
                print('An error occurred when accessing the page for job',job_title)
            job_dict['title'].append(job_title)
            job_dict['company'].append(company)
            job_dict['location'].append(location)
            job_dict['url'].append(job_url)
            job_dict['description'].append(job_description)
            count += 1
    return job_dict

In [59]:
# Time a full scrape with the default limit.
started = time()
get_jobs(url)
elapsed = time() - started
print(elapsed)

28.951276540756226


In [18]:
# Scrape with the default limit, then show the result as a table: the
# DataFrame is the cell's last expression, so the notebook displays it.
job_dict = get_jobs(url)
pd.DataFrame(job_dict)

Unnamed: 0,title,company,location,url,description
0,Data Scientist,ULTIMATE STAFFING SERVICES,"Irvine, CA",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,key responsibilities data science focusing on ...
1,Senior Financial Data Analyst w/PE backed Heal...,Alliance Resource Group,"Orange, CA",https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,senior financial data analyst w healthcare eco...
2,Data Scientist,Spireon,"Irvine, CA",https://www.indeed.com/rc/clk?jk=4c35157d313d5...,this is us we have a bold vision to connect ...
3,Data Scientist,Karma Automotive LLC,"Irvine, CA",https://www.indeed.com/rc/clk?jk=f7fd2a35541c6...,overview southern california based karma is mo...
4,Senior Data Scientist (Must be on W2),SoftNice Inc,"Santa Ana, CA",https://www.indeed.com/company/Ramy-Infotech-I...,description analytic data model development qu...
5,Sr. Data Analyst,Abtsus LLC,"Santa Ana, CA",https://www.indeed.com/company/Abtsus-LLC/jobs...,responsibilities include developing sql querie...
6,Data Scientist (Fraud & Game Surveillance),NCSOFT,"Aliso Viejo, CA",https://www.indeed.com/rc/clk?jk=11cf149979176...,who we are ncsoft is a premiere digital entert...
7,Data Scientist,Driveway,"Aliso Viejo, CA",https://www.indeed.com/rc/clk?jk=8fccd5560141c...,we are looking for a data scientist who will s...
8,Data Scientist Expert,SAP,"Newport Beach, CA",https://www.indeed.com/rc/clk?jk=c8987900476dd...,requisition id work area software design and ...
9,Senior Data Scientist,First American,"Santa Ana, CA",https://www.indeed.com/rc/clk?jk=75a9ed1f5c6ac...,join our team as a global leader in providing ...


In [18]:
def get_indeed_job(job_name, location, limit = 50):
    """Scrape Indeed postings for a job title in a location.

    Builds the search URL with get_url and passes it, together with
    `limit`, to get_jobs; whatever get_jobs returns is returned unchanged.
    """
    search_url = get_url(job_name, location)
    return get_jobs(search_url, limit = limit)

In [19]:
# Time the end-to-end pipeline for a sample query.
started = time()
get_indeed_job('Data Analyst', 'Irvine')
elapsed = time() - started
print(elapsed)

37.53865075111389


In [24]:
# Run the pipeline for a sample query; as the cell's last expression, the
# returned dict is displayed in full.
get_indeed_job('Data Scientist','Irvine')

{'title': ['Data Scientist',
  'Senior Financial Data Analyst w/PE backed Healthcare company',
  'Data Scientist',
  'Data Scientist',
  'Sr. Data Analyst',
  'Senior Data Scientist (Must be on W2)'],
 'company': ['ULTIMATE STAFFING SERVICES',
  'Alliance Resource Group',
  'Spireon',
  'Karma Automotive LLC',
  'Abtsus LLC',
  'SoftNice Inc'],
 'location': ['Irvine, CA',
  'Orange, CA',
  'Irvine, CA',
  'Irvine, CA',
  'Santa Ana, CA',
  'Santa Ana, CA'],
 'url': ['https://www.indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0BhfrGGbcblirJ0_oD-V1jJ9SBvie1turFDKTAe6KCgN7BX82dLaSd1WaQVgK8U-NkNxRBIOR-46o3D8g-bUriwU9Rzou_w9Rmhiu7UmLSnZdr3JhqueoTh37ZuCnrIIHT4vAOu50K_Yt1voSKTGLrTKdX1NIV7ZQezoJrCJqlkB3ctQ60j6eaM8WZZNVVlRkJQ91KE7SRj_p5UfjU-G3cQlZyh7aUoBhrYS2fpYggjahEH3eJ_V27r8oqnb2WHso5sBAGRufRUyMr8PcG26grI_ibnLv30qaEtS3Uzx4YwZVNsMZceCFfjRE-ZdC04XDhQyudruj7yfXA_jRt1Wz-ciznmJJm7Y0z7GH2f0XNxb5nrzxbBLiHxL2jH9YKl8BCDEMFXe5rGADsNPh0pGYDK5MGBNK1yukxW2Rdyjmz378guZXRYFGQnro3Ty5bKjpIbtbPJ_SvCZ2IhcBs36bQDfI9sHO