This notebook scrapes current job postings from ca.indeed.com and merges them into the existing `job_postings.csv` file.

In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [24]:
# Search configuration consumed by create_urls below.
location = 'vancouver'
province = 'BC'
radius = 100
# NOTE: this name shadows the builtin `range` for the rest of the notebook;
# it is forwarded to create_urls, which writes it into the `fromage` query parameter.
range = 3
searches = [
    'data intern',
    'data analyst',
    'data scientist',
    'machine learning',
    'data student',
    'junior analyst',
    'python analyst',
]

In [25]:
def create_urls(location, province, searches, radius=100, range=14):
    """Create the list of Indeed search-result URLs for multiple searches.

    For every search term, four URLs are produced, one per result offset
    (``start`` = 0, 10, 20, 30).

    Parameters
    ----------
    location : str
        Place name inserted into the ``l`` query parameter.
    province : str
        Province code appended to the location after an encoded ", ".
    searches : iterable of str
        Search phrases; spaces are replaced with ``+``.
    radius : int, optional
        Value of the ``radius`` query parameter.
    range : int, optional
        Value of the ``fromage`` query parameter. The name shadows the
        builtin inside this function and is kept because callers pass it
        by keyword.

    Returns
    -------
    list of str
        URLs grouped by search term, in ascending ``start`` order.
    """
    print('creating urls')
    urls_list = []
    for search in searches:
        query = '+'.join(search.split(" "))
        base_url = (
            'https://ca.indeed.com/jobs?q=' + query
            + '&l=' + location + '%2C+' + province
            + '&radius=' + str(radius) + '&sort=date'
            + '&fromage=' + str(range) + '&filter=0'
        )
        # Build each page URL from the unchanged base so that every URL
        # carries exactly one `start` parameter (previously they piled up:
        # ...&start=0&start=10&start=20).
        for start in (0, 10, 20, 30):
            urls_list.append(base_url + '&start=' + str(start))
    return urls_list

In [26]:
# Build the search-result URLs from the configuration cell.
# `range` here is the notebook-level variable, not the builtin.
urls = create_urls(location, province, searches, range=range, radius=radius)
# Display the first URL as a quick sanity check.
urls[0]

creating urls


'https://ca.indeed.com/jobs?q=data+intern&l=vancouver%2C+BC&radius=100&sort=date&fromage=3&filter=0&start=0'

In [27]:
def get_pages(urls, delay=10, timeout=30):
    """Download every URL and return the parsed pages, in input order.

    Parameters
    ----------
    urls : iterable of str
        Addresses to fetch.
    delay : float, optional
        Seconds to pause between consecutive requests. No pause is made
        after the final request.
    timeout : float, optional
        Seconds passed to ``requests.get`` as ``timeout`` so that a stalled
        connection raises instead of blocking the notebook forever.

    Returns
    -------
    list of BeautifulSoup
        One parsed document per URL (parsed with ``'html.parser'``).

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (including timeouts).
    """
    print('getting pages')
    urls = list(urls)
    total = len(urls)
    soups = []
    for position, url in enumerate(urls, start=1):
        response = requests.get(url, timeout=timeout)
        soups.append(BeautifulSoup(response.text, 'html.parser'))
        if position % 5 == 0:
            print(f'{position} pages found')
        # Only wait when another request will follow.
        if position < total:
            time.sleep(delay)
    return soups

In [28]:
# Fetch and parse every search-results page; get_pages pauses between
# requests, so this cell takes a while to finish.
soups = get_pages(urls)

getting pages
5 pages found
10 pages found
15 pages found
20 pages found
25 pages found


In [29]:
def extract_job_title_from_result(soups):
    """Return the ``title`` attribute of every job-title link on the result pages.

    Each page in ``soups`` is searched for ``div`` elements with class
    ``row``; inside each, every ``a`` element whose ``data-tn-element`` is
    ``jobTitle`` contributes its ``title`` attribute, in document order.
    """
    print('getting job titles')
    titles = [
        link['title']
        for page in soups
        for row in page.find_all(name='div', attrs={'class': 'row'})
        for link in row.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})
    ]
    return titles

In [30]:
# Pull the job titles out of the parsed search-result pages.
job_titles = extract_job_title_from_result(soups)
# Display the first title as a quick sanity check.
job_titles[0]

getting job titles


'Business Analyst Co-op Student - Richmond, B.C. - R09857'

In [31]:
def extract_company_from_result(soups):
    """Return the company names found in the result rows of each page.

    For every ``div`` with class ``row``, the stripped text of each
    ``span`` with class ``company`` is collected. Only when a row has no
    such span does the function fall back to the row's ``span`` elements
    with class ``result-link-source``.

    Parameters
    ----------
    soups : iterable
        Parsed search-result pages exposing ``find_all``.

    Returns
    -------
    list of str
        Company names in document order.
    """
    print('getting company names')
    companies = []
    for soup in soups:
        for div in soup.find_all(name='div', attrs={'class': 'row'}):
            company = div.find_all(name='span', attrs={'class': 'company'})
            if len(company) > 0:
                for b in company:
                    companies.append(b.text.strip())
            else:
                # The fallback belongs to the `if`, not the `for`: previously
                # it was a for/else, so it ran after every successful match
                # (adding extra names) and never when no company span existed.
                sec_try = div.find_all(name='span', attrs={'class': 'result-link-source'})
                for span in sec_try:
                    companies.append(span.text.strip())
    return companies

In [32]:
# Pull the company names out of the parsed search-result pages.
companies = extract_company_from_result(soups)
# Display the first company as a quick sanity check.
companies[0]

getting company names


'MDA'

In [33]:
def extract_location_from_result(soups):
    """Return the text of every ``span`` with class ``location`` on each page.

    Unlike the other extractors, this one searches the whole page rather
    than each ``row`` div, and the text is returned without stripping.

    Parameters
    ----------
    soups : iterable
        Parsed search-result pages exposing ``find_all``.

    Returns
    -------
    list of str
        Location strings in document order.
    """
    print('getting locations')
    locations = []
    for soup in soups:
        # `find_all` replaces the deprecated camelCase alias `findAll`,
        # matching the other extractors in this notebook.
        for span in soup.find_all('span', attrs={'class': 'location'}):
            locations.append(span.text)
    return locations

In [34]:
# Pull the location strings out of the parsed search-result pages.
locations = extract_location_from_result(soups)
# Display the first location as a quick sanity check.
locations[0]

getting locations


'Richmond, BC'

In [35]:
# Load the previously saved postings; they are later passed to
# check_for_duplicates to be merged with the freshly scraped ones.
old_df = pd.read_csv('job_postings.csv')
# Preview the first rows of the existing data.
old_df.head()

Unnamed: 0,job_title,company,location,href,description,apply
0,Data Analyst Co-op (Spring term),Ridley College (Canada),"St. Catharines, ON",https://ca.indeed.com/rc/clk?jk=4aafa08c370b87...,[[<div><p><b>Position Title: Data Analyst Co-o...,Yes
1,Data Analytics Associate Summer Intern (MBA),Johnson & Johnson Family of Companies,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=1d1d136f3b5263...,[[<div><p><b>Data Analytics Associate Intern –...,No
2,"Data Analyst, Summer 2021 Student Opportunities",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=5bc75ed7e05b22...,[[<div><p><b>What is the opportunity?</b><br/>...,Yes
3,"Data Scientist, Summer Student 2021 Opportunities",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=1bdf42b3d5b3e4...,[[<div><p><b>What is the opportunity?</b><br/>...,Yes
4,"Business/Operations Analyst, Summer 2021 Stude...",RBC,"Toronto, ON",https://ca.indeed.com/rc/clk?jk=76d9a17c168e02...,[[<div><p><b>What is the opportunity?</b></p><...,Yes


In [36]:
def extract_job_href_from_result(soups):
    """Return an absolute ca.indeed.com URL for every job-title link.

    Each page in ``soups`` is searched for ``div`` elements with class
    ``row``; inside each, every ``a`` element whose ``data-tn-element`` is
    ``jobTitle`` contributes its ``href`` attribute, prefixed with the site
    root.
    """
    print('getting job urls')
    site_root = 'https://ca.indeed.com'
    links = [
        site_root + str(anchor['href'])
        for page in soups
        for row in page.find_all(name='div', attrs={'class': 'row'})
        for anchor in row.find_all(name='a', attrs={'data-tn-element': 'jobTitle'})
    ]
    return links

In [37]:
# Collect the absolute link to each job posting.
href = extract_job_href_from_result(soups)
# Display the first link as a quick sanity check.
href[0]

getting job urls


'https://ca.indeed.com/company/MDA/jobs/Business-Analyst-Op-Student-1c85976738c57edf?fccid=11f7828cf965b07c&vjs=3'

In [38]:
# Number of job links collected; each one is fetched individually in the next cell.
len(href)

182

In [None]:
# Download the full page of every posting. get_pages pauses between
# requests, so with many links this is the slowest cell in the notebook.
full_pages = get_pages(href)

getting pages
5 pages found
10 pages found
15 pages found
20 pages found
25 pages found
30 pages found
35 pages found
40 pages found
45 pages found
50 pages found
55 pages found
60 pages found
65 pages found
70 pages found
75 pages found
80 pages found
85 pages found
90 pages found
95 pages found
100 pages found
105 pages found
110 pages found
115 pages found
120 pages found
125 pages found
130 pages found
135 pages found
140 pages found
145 pages found
150 pages found
155 pages found
160 pages found
165 pages found
170 pages found


In [None]:
def get_job_description(full_pages):
    """Return, for each posting page, the contents of its description elements.

    The result has one entry per page in ``full_pages``. Each entry is a
    list holding the ``contents`` of every element on that page whose class
    is ``jobsearch-jobDescriptionText`` (an empty list when none is found).
    """
    print('getting job descriptions')
    return [
        [
            block.contents
            for block in page.find_all(attrs={'class': 'jobsearch-jobDescriptionText'})
        ]
        for page in full_pages
    ]

In [None]:
# Extract the description content from each downloaded posting page.
descriptions = get_job_description(full_pages)

In [None]:
def check_for_duplicates(job_titles, companies, locations, href, old_df, descriptions):
    """Merge freshly scraped postings into ``old_df`` and drop duplicates.

    The scraped lists are zipped row-wise into a frame with columns
    ``job_title``, ``company``, ``location``, ``href``, ``description`` and
    ``apply`` (filled with the string ``'NaN'``). That frame is appended to
    ``old_df`` and rows sharing the same (``job_title``, ``company``) pair
    are dropped, keeping the first occurrence, so existing rows win.

    Parameters
    ----------
    job_titles, companies, locations, href, descriptions : list
        Parallel lists, one entry per posting. If their lengths differ, a
        warning is printed and rows are truncated to the shortest list.
    old_df : pandas.DataFrame
        Previously saved postings.

    Returns
    -------
    pandas.DataFrame
        ``old_df`` followed by the new postings, without duplicates. Index
        labels are not reset.
    """
    print('removing duplicates')
    columns = ["job_title", "company", "location", "href", 'description', 'apply']
    key = ['job_title', 'company']

    # zip() silently stops at the shortest input, which would pair titles
    # with the wrong company/location; surface that instead of hiding it.
    lengths = {
        'job_titles': len(job_titles),
        'companies': len(companies),
        'locations': len(locations),
        'href': len(href),
        'descriptions': len(descriptions),
    }
    if len(set(lengths.values())) > 1:
        print(f'warning: scraped lists differ in length {lengths}; '
              'rows are truncated to the shortest list and may be misaligned')

    apply = ['NaN'] * len(job_titles)
    new_df = pd.DataFrame(
        list(zip(job_titles, companies, locations, href, descriptions, apply)),
        columns=columns,
    )
    jobs = len(new_df)
    print(f'{jobs} total jobs found')

    combined = pd.concat([old_df, new_df])
    is_duplicate = combined.duplicated(subset=key, keep='first').to_numpy()
    # Count only surviving rows that came from the new frame. Subtracting
    # len(old_df) from the final length under-counts (or goes negative)
    # whenever old_df itself holds duplicate rows.
    jobs = int((~is_duplicate[len(old_df):]).sum())
    df = combined[~is_duplicate]
    print(f'{jobs} new jobs found')
    return df

In [None]:
# Combine the scraped postings with the previously saved ones and drop duplicates.
df = check_for_duplicates(job_titles, companies, locations, href, old_df, descriptions)

In [None]:
# Overwrite the CSV that was loaded earlier with the merged postings;
# index=False keeps the row index out of the file.
df.to_csv('job_postings.csv', index=False)