In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# make sure Google Sheets API is enabled
# https://developers.google.com/sheets/api/quickstart/python
# pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [2]:
# Download the Aquicore job board page and keep the raw response bytes.
result = requests.get("https://jobs.lever.co/aquicore")
c = result.content
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (results can differ between machines) and emits a
# GuessedAtParserWarning.
soup = bs(c, "html.parser")

Let's see if we can grab the titles from the *h5* tags. 

In [3]:
# Collect every <h5> element in the parsed page; the bare name on the last
# line displays the matches as the cell output.
titles = soup.find_all(name='h5')
titles

[<h5>Account Executive</h5>,
 <h5>Solutions Architect</h5>,
 <h5>Data Scientist</h5>,
 <h5>Embedded Software Engineer</h5>,
 <h5>Full Stack Software Engineer</h5>,
 <h5>Product Manager</h5>,
 <h5>Quality Engineer</h5>,
 <h5>Technical Support Engineer</h5>]

Cool - that worked.

Now let's strip the markup and keep just the title text in a list.

In [4]:
# Reduce each matched tag to its plain text with the surrounding
# whitespace trimmed off.
l_titles = [h5_tag.get_text().strip() for h5_tag in titles]
l_titles

['Account Executive',
 'Solutions Architect',
 'Data Scientist',
 'Embedded Software Engineer',
 'Full Stack Software Engineer',
 'Product Manager',
 'Quality Engineer',
 'Technical Support Engineer']

Alright. Let's read that into a DataFrame. 

In [5]:
# Build the frame with the column already named. Creating an unnamed column
# and assigning df.columns afterwards raises ValueError when l_titles is
# empty, because the frame then has zero columns to rename.
df = pd.DataFrame({'Position': l_titles})
# Every row comes from the same job board, so the company is a constant.
df['Company'] = 'Aquicore'
df

Unnamed: 0,Position,Company
0,Account Executive,Aquicore
1,Solutions Architect,Aquicore
2,Data Scientist,Aquicore
3,Embedded Software Engineer,Aquicore
4,Full Stack Software Engineer,Aquicore
5,Product Manager,Aquicore
6,Quality Engineer,Aquicore
7,Technical Support Engineer,Aquicore


Now let's get the location and add it as a new column.

In [32]:
# Find every element whose class attribute includes "sort-by-location".
class_loc = soup.find_all(attrs={"class": "sort-by-location"})
class_loc

[<span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>,
 <span class="sort-by-location posting-category small-category-label" href="#">Washington, DC</span>]

Alright, we've got a match.

In [16]:
# Add the trimmed text of each location element as a new column.
# The assignment requires one location per existing row of df.
df['Location'] = [loc_tag.get_text().strip() for loc_tag in class_loc]
df

Unnamed: 0,Position,Company,Location
0,Account Executive,Aquicore,"Washington, DC"
1,Solutions Architect,Aquicore,"Washington, DC"
2,Data Scientist,Aquicore,"Washington, DC"
3,Embedded Software Engineer,Aquicore,"Washington, DC"
4,Full Stack Software Engineer,Aquicore,"Washington, DC"
5,Product Manager,Aquicore,"Washington, DC"
6,Quality Engineer,Aquicore,"Washington, DC"
7,Technical Support Engineer,Aquicore,"Washington, DC"


Looking good. Let's see if we can make a function.

In [14]:
def get_jobs(c_list):
    """
    Scrape the job listings from company websites
    and put the key info in a single DataFrame.

    Parameters:
    c_list (list of dicts): one dict per company to scrape, each with the keys
        company (str): company name
        url (str): url of the job listing webpage
        title_tag (str): the html tag name that holds the job titles
        location_class (str): the html class that holds the job locations

    Returns:
    big_df (pandas DataFrame): the results, with the columns
        'Title', 'Locations' and 'Company'. Each company's rows keep their
        own 0-based index. An empty DataFrame is returned for an empty c_list.
    """
    # Collect one frame per company and concatenate once at the end.
    # DataFrame.append was removed in pandas 2.0, and it copied the whole
    # accumulated frame on every iteration.
    frames = []

    # loop through list of dicts
    for c_dict in c_list:
        # Name the parser explicitly so the result does not depend on which
        # optional parser is installed (and no GuessedAtParserWarning is shown).
        page = requests.get(c_dict['url']).content
        soup = bs(page, 'html.parser')

        df = pd.DataFrame()

        # TODO refactor so can handle any tag or class
        titles = soup.find_all(c_dict['title_tag'])
        df['Title'] = [title.text.strip() for title in titles]

        # TODO refactor so can handle any tag or class
        class_loc = soup.find_all(class_=c_dict['location_class'])
        # Raises ValueError if the page yields a different number of
        # locations than titles.
        df['Locations'] = [lo.text.strip() for lo in class_loc]

        df['Company'] = c_dict['company']

        frames.append(df)

    # pd.concat raises on an empty list, so handle "nothing to scrape" here.
    if not frames:
        return pd.DataFrame()

    big_df = pd.concat(frames)
    return big_df

# dict for each website
dict1 = {
    "company": "Aquicore",
    "url": "https://jobs.lever.co/aquicore",
    "title_tag": 'h5',
    "location_class": 'sort-by-location'
}

dict2 = {
    "company": "TransitScreen",
    "url": "https://jobs.lever.co/transitscreen",
    "title_tag": 'h5',
    "location_class": 'sort-by-location'
}

# list of company dicts, in the order they are scraped
c_list = [dict1, dict2]

df_all = get_jobs(c_list)

print(df_all)
# TODO write DataFrame to csv - later S3 bucket with timestamp
    


                              Title       Locations        Company
0                 Account Executive  Washington, DC       Aquicore
1               Solutions Architect  Washington, DC       Aquicore
2                    Data Scientist  Washington, DC       Aquicore
3        Embedded Software Engineer  Washington, DC       Aquicore
4      Full Stack Software Engineer  Washington, DC       Aquicore
5                   Product Manager  Washington, DC       Aquicore
6                  Quality Engineer  Washington, DC       Aquicore
7        Technical Support Engineer  Washington, DC       Aquicore
0                 Account Executive  Washington, DC  TransitScreen
1  Sales Development Representative  Washington, DC  TransitScreen
2         Senior Front-End Engineer  Washington, DC  TransitScreen
