# Job Getter
Scrapes Indeed job postings, saves info to excel

## Imports
We need *requests* to get our data, *BeautifulSoup* to parse our HTML, and *selenium* to programmatically nagivate Chrome. Last, we will use *openpyxl* to save everything to an Excel file.

In [14]:
import re
import requests

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup
from openpyxl import Workbook

## Helper Functions

In [15]:
def clean_summary(arr):
    """Removes \n char from the html elts 
    with class .summary"""
    new_str = ""
    if len(arr) <= 1:
        clean_str = arr[0].strip('\n').strip()                        
        new_str += clean_str
        return new_str
    for s in arr:
        try:
            clean_str = s.strip('\n').strip()                        
            new_str += clean_str
        except:
            new_str += s.contents[0].string
    return new_str

In [98]:
def last_row_index(sheet):
    """receives Excel worksheet and returns an int that represents the 
    last occupied row of the first column of that sheet""" 
    try:
        curr_cell = 0
        for row in sheet.iter_rows(min_row=1,max_col=1):
            for cell in row:                           
                curr_cell = cell.row
        return curr_cell      
    except:
        print("Invalid excel sheet")
        return False

In [96]:
def make_job_req(url):
    """Makes a req, returns False if
    req status-code is not 200"""
    req = requests.get(url)
    if req.status_code != 200:
        print("Invalid HTTP request from given URL")
        return False
    return req

In [97]:
def make_job_divs(req):
    """Takes in a request object from Indeed,
    returns divs containg job data"""
    try:
        soup = BeautifulSoup(req.text, 'html.parser')
        jobs = new_soup.find_all("div", class_="result")
    except:
        print("Invalid request object")
        return False
    
    return jobs

In [90]:
def make_job_list(job_divs):
    """"""
    job_list = []
    for i, j in enumerate(job_divs):
        new_job = {}
        new_job_summary = job_divs[i].find_all('div', class_='summary')[0].contents
        new_job['id'] = i
        new_job['summary'] = clean_summary(new_job_summary)
        new_job['title'] = job_divs[i].find_all('a')[0]['title']
        new_job['link'] = 'https://indeed.com' + job_divs[0].find_all('a')[0]['href']

        job_list.append(new_job)
    
    return job_list

In [91]:
def write_jobs(sheet, jobs, file_name):
    try:
        last_row = last_row_index(ws) + 1
        for job in jobs:    
            curr_col = 1
            for val in job.values():
                d = ws.cell(row=(job['id']+last_row), column=curr_col)
                d.value = val        
                curr_col+=1        
    except:
        return "There was an err"
    
    wb.save(file_name)
    return True

## Selenium

I am using chrome, but there are other options. Check the Selenium docs.

In [17]:
driver = webdriver.Chrome('./chromedriver')

Simple test to make sure the request worked

In [18]:
driver.get('https://www.indeed.com/')
driver.title

'Job Search | Indeed'

Send the job you want to search as a string to the `input` with `name:"q"` then press *RETURN* to search for jobs.

In [19]:
curr_job = "web developer"

elem = driver.find_element_by_name("q")
elem.send_keys(curr_job)
elem.send_keys(Keys.RETURN)

Another simple test to see the the query parameter contains our job. *Indeed* will use your location for the "where".

In [20]:
driver.current_url

'https://www.indeed.com/jobs?q=web+developer&l=Dumont%2C+NJ'

## Requests
Now that we have our url, lets grab our HTML using `requests`

Check to make sure the req was successful

In [21]:
req_string = driver.current_url

new_req = requests.get(req_string)
new_req.status_code

200

## BeautifulSoup
BeautifulSoup provides an awesome API for navigating HTML. This is how we will get the content from our search results and build our data. 

In [22]:
new_soup = BeautifulSoup(new_req.text, 'html.parser')

In [23]:
job_divs = new_soup.find_all("div", class_="result")

Let's take a look at how many jobs we are listed

In [25]:
len(job_divs)

19

## Build the Jobs List
Here we a building a list of jobs were each job in dictionary with the following keys:
- id
    - used to index the job in excel
- summary
    - a short summary of the job
- title
    - job title
- link
    - link to the job description

In [26]:
new_job_list = []
for i, j in enumerate(job_divs):
    new_job = {}
    new_job_summary = job_divs[i].find_all('div', class_='summary')[0].contents
    new_job['id'] = i
    new_job['summary'] = clean_summary(new_job_summary)
    new_job['title'] = job_divs[i].find_all('a')[0]['title']
    new_job['link'] = 'https://indeed.com' + job_divs[0].find_all('a')[0]['href']
    
    new_job_list.append(new_job)
    
new_job_list[0]   

{'id': 0,
 'link': 'https://indeed.com/pagead/clk?mo=r&ad=-6NYlbfkN0DnIQaT9_ideK20aGE-Gf5L-4mEV8Q0k7vqCcpbdP_vWWm99hxL5_fHY4xpZMP0Uhx5zsIIHC2JpM0ZxkCrbuPTXhftT08i6R8KTyzH84kb8ZTW1mFYdSyTp3t2dAwbTXA_-AAsx2CykGScHxOgvP7ajg4IWG4-yGRciVuEGMhx3H3X4ZwTs2TjxYzzz73bj0f_5e0yY9Bc6AUfFgYhSOHQH8NGygLjbJnHnkMXzRMfHLX4OFBdyk_NSqZwJyIDWMo_YxrSRKZXurgbLCPI4olpZNSxBhlaZkj463ATs3zlw4Ebc2_v2-ijqwpw5YvWg89g58scJZFKCCuxkRjMcuzh3nb93fC4P_hEUPeDlrlFZULzCimhBh89yrh2W6g0R9kZM8eYll6KEm8c0TlsFVhsyzPNcnHd_e6BfYty9Wmf-GsftzJae0YeJda4wP25yrtDbXMlz4X2pWXnjmXSSGYLLh2-Vlzr0T3fkHf5sDCi-Yvgp_n1sGKCaQ6B_gcrNvfPRthwpmdh1w==&vjs=3&p=1&fvj=0',
 'summary': 'Collaborate with back-enddevelopersto design and build a robust API. Michael Kors is always interested in hearing from talented, globally-minded individuals...',
 'title': 'Front-end Developer'}

## openpyxl

In [27]:
wb = Workbook()
ws = wb.active

Create a sheet with the same title as the current job search term

In [28]:
ws.title = curr_job
wb.sheetnames

['web developer']

In [33]:
last_row = last_row_index(ws) +1
for job in new_job_list:    
    curr_col = 1
    for val in job.values():
        d = ws.cell(row=(job['id']+last_row), column=curr_col)
        d.value = val        
        curr_col+=1

In [34]:
wb.save('jobs.xlsx')

## Selenium again to get the next page

In [38]:
elem = driver.find_element_by_class_name("np")
elem.click()

In [42]:
close_pop_up = driver.find_element_by_id('popover-close-link')
try:
    close_pop_up.click()
except:
    print('Pop up not there')

Pop up not there


Interesting. Looks like we may not need selenium anymore after all -- we can just modify the query parameter for *start*

In [43]:
driver.current_url

'https://www.indeed.com/jobs?q=web+developer&l=Dumont%2C+NJ&start=10#'

In [92]:
new_req = make_job_req(driver.current_url)

In [93]:
job_divs = make_job_divs(new_req)

In [94]:
job_list = make_job_list(job_divs)

In [95]:
write_jobs(ws, jobs=job_list, file="jobs.xlsx")

True