In [1]:
from bs4 import BeautifulSoup
import re, pandas as pd
from selenium import webdriver
#import sys, os
import json
#import time
from wordcloud import WordCloud, STOPWORDS
from matplotlib import pyplot as plt
from pprint import pprint

In [2]:
def get_soup(url):
    """
    Load a page in a Firefox WebDriver session and return its parsed HTML.

    A new browser session is started on every call and is always shut down
    again, even when loading the page raises.

    Arguments:
    url -- the link to get soup object for

    Returns:
    soup -- BeautifulSoup object built from the page source with 'html.parser'
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        html = driver.page_source
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and can leave the browser/driver process behind.
        driver.quit()

    return BeautifulSoup(html, 'html.parser')

In [3]:
def grab_job_links(soup):
    """
    Grab the job posting links from an Indeed search result page.

    Every <h2 class="jobtitle"> element is inspected and the href of its first
    <a> child is turned into an absolute URL. Headings without an <a> child or
    without an href (the case for sponsored postings, which use "a target")
    are skipped instead of raising.

    Arguments:
    soup -- the soup object corresponding to a search result page
            e.g. https://ca.indeed.com/jobs?q=data+scientist&l=Toronto&start=20

    Returns:
    urls -- a python list of job posting urls
    """
    urls = []

    for heading in soup.find_all('h2', {'class': 'jobtitle'}):
        anchor = heading.a
        partial_url = anchor.get('href') if anchor is not None else None
        if not partial_url:
            # Nothing to link to; concatenating None below would raise TypeError
            continue
        # The href is a partial url, we need to attach the prefix
        urls.append('https://ca.indeed.com' + partial_url)

    return urls

In [4]:
def _parse_posting_count(count_text, default=330):
    """Return the total posting count found in the "searchCount" text, or `default` if none."""
    # The total follows the word "of" (e.g. "... of 360 jobs"); when there is
    # no "of" at all, fall back to searching the whole string.
    _, separator, tail = count_text.partition('of')
    if not separator:
        tail = count_text
    # Accept thousands separators such as "1,234"
    match = re.search(r'\d[\d,]*', tail)
    if match is None:
        return default
    return int(match.group(0).replace(',', ''))


def get_urls(query, num_pages, location):
    """
    Get all the job posting URLs resulted from a specific search.

    The first result page is always fetched. The total number of postings
    reported on it limits how many pages may be requested; 330 is assumed
    when the total cannot be read from the page.

    Arguments:
    query -- job title to query, already in Indeed format (words joined by '+')
    num_pages -- number of pages needed
    location -- city to search in

    Returns:
    urls -- a list of job posting URL's (when num_pages is valid)
    max_pages -- maximum number of pages allowed (when num_pages is invalid)
    """
    # We always need the first page
    base_url = 'https://ca.indeed.com/jobs?q={}&l={}'.format(query, location)
    soup = get_soup(base_url)
    urls = grab_job_links(soup)

    # Get the total number of postings found
    count_tag = soup.find(name='div', attrs={'id': "searchCount"})
    if count_tag is None:
        posting_count = 330  # setting to 330 when unable to get the total
    else:
        posting_count = _parse_posting_count(count_tag.get_text())

    # Limit number of pages to get. The first page has already been fetched,
    # so one page is always a valid request even for very small result sets.
    max_pages = max(1, round(posting_count / 10) - 3)
    if num_pages > max_pages:
        print('returning max_pages!!')
        return max_pages

    # Start loop from page 2 since page 1 has been dealt with above
    for page in range(2, num_pages + 1):
        start = (page - 1) * 10
        page_url = 'https://ca.indeed.com/jobs?q={}&l={}&start={}'.format(query, location, start)
        try:
            # We always combine the results back to the list
            urls += grab_job_links(get_soup(page_url))
        except Exception as err:
            # Best effort: one broken result page should not abort the search,
            # but say so instead of dropping it silently.
            print('Skipping results page {}: {}'.format(page, err))

    return urls

In [5]:
def get_posting(url):
    """
    Get the title and the text of a job posting from a given url, both lower-cased.

    Arguments:
    url -- The job posting link

    Returns:
    title -- text of the first h3 element on the page, lower-cased
    posting -- text of the div with class "jobsearch-JobComponent", lower-cased

    Raises:
    ValueError -- when the page has no h3 element or no "jobsearch-JobComponent" div
    """
    # Get the url content as BS object
    soup = get_soup(url)

    # The job title is held in the h3 tag, the posting text in the component div
    title_tag = soup.find(name='h3')
    posting_tag = soup.find(name='div', attrs={'class': "jobsearch-JobComponent"})
    if title_tag is None or posting_tag is None:
        # find() returns None when nothing matches; fail with a clear message
        # rather than an AttributeError on None.
        raise ValueError('No job posting found at {}'.format(url))

    return title_tag.get_text().lower(), posting_tag.get_text().lower()

In [6]:
def get_data(query, num_pages, location='Toronto'):
    """
    Get all the job posting data and save it in a json file using below structure:

    {<index>: {'title': ..., 'posting':..., 'url':...}...}

    <index> is the position of the posting in the list of search result URLs;
    postings that could not be scraped are skipped, so indices may have gaps.

    The json file is named after the lower-cased query with spaces replaced
    by underscores, e.g. "machine_learning_engineer.json".

    Arguments:
    query -- Indeed query keyword such as 'Data Scientist'
    num_pages -- Number of search result pages needed
    location -- location to search for

    Returns:
    None -- the data is written to the json file; a message is printed instead
            when num_pages exceeds the maximum number of pages allowed
    """
    # Convert the queried title to Indeed format
    query = '+'.join(query.lower().split())

    urls = get_urls(query, num_pages, location)

    # Continue only if the requested number of pages is valid
    # (when invalid, a number is returned instead of list)
    if not isinstance(urls, list):
        print("Due to similar results, maximum number of pages is only {}. Please try again!".format(urls))
        return

    postings_dict = {}
    num_urls = len(urls)
    for i, url in enumerate(urls):
        try:
            title, posting = get_posting(url)
        except Exception:
            # Best effort: skip postings that fail to load or parse.
            # Exception (not a bare except) keeps Ctrl-C able to stop the run.
            pass
        else:
            postings_dict[i] = {'title': title, 'posting': posting, 'url': url}

        percent = (i + 1) / num_urls
        # Print the progress; the "end" arg keeps the message in the same line
        print("Progress: {:2.0f}%".format(100 * percent), end='\r')

    # Save the dict as json file
    file_name = query.replace('+', '_') + '.json'
    with open(file_name, 'w') as f:
        json.dump(postings_dict, f)

    # Report what was actually saved, which can be fewer than the URLs found
    print('{} of {} postings have been scraped and saved!'.format(len(postings_dict), num_urls))

In [7]:
def load_data(file_name):
    """
    Read a saved json data file back into a dict.

    Argument:
    file_name -- the saved file name, e.g. "machine_learning_engineer.json"

    Returns:
    postings_dict -- the decoded content of the file
    """
    with open(file_name, 'r') as json_file:
        raw_content = json_file.read()

    return json.loads(raw_content)

In [8]:
def clean_text(text):
    """
    Reduce every posting to its distinct words and join them all into one string.

    Each posting is split on non-word characters and every word is kept once
    per posting, in order of first appearance, so a word repeated inside one
    posting is counted only once for that posting. No stemming is done.

    Arguments:
    text -- list of job posting strings

    Returns:
    cleaned_text -- a single space-separated text string for the wc plot
    """
    words = []
    for posting in text:
        # Raw string: '\W' in a normal string literal is an invalid escape
        tokens = re.split(r'\W+', posting)
        # dict.fromkeys de-duplicates while keeping a deterministic order
        # (a set would reorder the words differently from run to run);
        # empty tokens from leading/trailing separators are dropped.
        words.extend(token for token in dict.fromkeys(tokens) if token)

    return ' '.join(words)

In [28]:
def make_text_list(postings_dict, first_n_postings=100):
    """
    Collect the posting text of the first n postings.

    Postings are taken in ascending numeric order of their keys. Missing
    indices are skipped, and asking for more postings than exist returns
    all of them instead of raising KeyError.

    Arguments:
    postings_dict -- dict of the form {<index>: {'posting': ...}, ...} whose
                     keys are integers or numeric strings (json keys are strings)
    first_n_postings -- maximum number of postings to return

    Returns:
    text -- list of posting strings, at most first_n_postings long
    """
    ordered_keys = sorted(postings_dict, key=int)
    # max(0, ...) keeps a negative request from slicing from the end
    wanted_keys = ordered_keys[:max(0, first_n_postings)]

    return [postings_dict[key]['posting'] for key in wanted_keys]

In [29]:
def plot_wc(text, max_words=200, stopwords_list=None, to_file_name=None):
    """
    Make a word cloud plot using the given text.

    Arguments:
    text -- the text as a string
    max_words -- maximum number of words passed to WordCloud
    stopwords_list -- extra words to exclude, added to wordcloud's STOPWORDS
                      (None means no extra words)
    to_file_name -- when given, the word cloud is also saved as a png file
                    under this name; ".png" is appended if it is missing

    Returns:
    None
    """
    stopwords = set(STOPWORDS)
    stopwords.update(stopwords_list or [])

    # Generate the cloud once, with the final settings
    wordcloud = WordCloud(background_color='white',
                          stopwords=stopwords,
                          max_words=max_words,
                          min_font_size=6,
                          scale=1,
                          width=800, height=800,
                          random_state=8).generate(text)
    plt.figure(figsize=[16, 16])
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

    if to_file_name:
        # Avoid producing "name.png.png" when the extension is already there
        if not to_file_name.lower().endswith('.png'):
            to_file_name = to_file_name + ".png"
        wordcloud.to_file(to_file_name)

In [9]:
# Scrape 30 search result pages for "machine learning engineer" (location defaults
# to 'Toronto') and save the postings to machine_learning_engineer.json.
get_data(query='machine learning engineer', num_pages=30)

All 290 postings have been scraped and saved!


In [10]:
# Read the scraped postings back from the json file into a dict
data = load_data('machine_learning_engineer.json')

In [234]:
# Extra words to leave out of the word cloud (presumably passed to plot_wc as
# stopwords_list -- confirm at the call site). Written as one whitespace-separated
# string so words are easy to add or remove.
stopwords_list = """
    accommodation ago application based canada candidate company data days
    education employee ensure environment et etc experience help including
    job jobapply life location microsoft nowapply office preferred people
    qualifications required requirement requirements resume review reviews
    reviewsread role save saying scientist self service sitesave skill
    skills time tool toronto understanding us well will work working
    world year yearsjobapply
""".split()
stopwords_list

['accommodation',
 'ago',
 'application',
 'based',
 'canada',
 'candidate',
 'company',
 'data',
 'days',
 'education',
 'employee',
 'ensure',
 'environment',
 'et',
 'etc',
 'experience',
 'help',
 'including',
 'job',
 'jobapply',
 'life',
 'location',
 'microsoft',
 'nowapply',
 'office',
 'preferred',
 'people',
 'qualifications',
 'required',
 'requirement',
 'requirements',
 'resume',
 'review',
 'reviews',
 'reviewsread',
 'role',
 'save',
 'saying',
 'scientist',
 'self',
 'service',
 'sitesave',
 'skill',
 'skills',
 'time',
 'tool',
 'toronto',
 'understanding',
 'us',
 'well',
 'will',
 'work',
 'working',
 'world',
 'year',
 'yearsjobapply']

## TODOs
- stemming etc.
- docstring and comments
- OOP
- add progress update text
- single responsibility principle for functions