# Web Scraping - DS jobs in Canada

This code is created to automate the process of daily personal job search - DS job scraping. 

Requirements are:
    1. from indeed in cities of Toronto, Montreal, Calgary and Vancouver, Canada;
    2  Full time position only;
    3. title as "data scientist / data analyst / data engineer";
    4. information scraped include: application deadline, application link, job title, company name, whether require python skills; 
    5. sort by application deadline; 

In [1]:
# define interested city list
city_set = ['Calgary','Montreal','Toronto', 'Vancouver']

# define search time range: job posted within last # of days
n_days = 5

In [2]:
# import all required libraries
import pandas as pd
import time
import datetime

import requests
import numpy as np

import bs4
from bs4 import BeautifulSoup

from urllib.parse import urljoin
import lxml.html

In [3]:
def evaluate_job(job_url):
    try:
        job_html = requests.request('GET', job_url, timeout = 25)
    except:
        return 0
    
    job_soup = bs4.BeautifulSoup(job_html.content, 'lxml')
    soup_body = job_soup('body')[0]
    
    if soup_body.text.count('PYTHON') +soup_body.text.count('Python') + soup_body.text.count('python') > 0:
        python_count = 'Y'
    else:
        python_count = 'N'

    return python_count

In [4]:
start = time.time()-n_days*86400

linklist = list()
python_input = list()
deadline_input = list()
job_title = []
company_name = []
location = []

def getlinkindeed (joblist,df): 
    base_link = 'https://www.indeed.com/'
    start = time.time()
    for city in city_set:
        page = requests.get('https://ca.indeed.com/jobs?q='+str(joblist)+'&l=' + str(city) + '&amp;jt=fulltime&amp;sort=date'+'&start=' + str(start))
        soup = BeautifulSoup(page.text, 'lxml')#, from_encoding='utf-8')
        for div in soup.find_all(name="div", attrs={"class":"row"}):
            for a in div.find_all(name="a", attrs={'data-tn-element':'jobTitle'}):
                finallinktext = base_link+a['href']
                linklist.append(base_link+a['href'])
                    
                python_input.append(evaluate_job(finallinktext))
                    
                try:
                    job_title.append(a['title'])
                except:
                    job_title.append('NAN')
                    

            company = div.find_all(name="span", attrs={"class":"company"})
                
            if len(company) > 0:
                for b in company:
                    company_name.append(b.text.strip())
                    break
                else:
                    sec_try = div.find_all(name="span", attrs={"class":"result-link-source"})
                    for span in sec_try:
                        company_name.append(span.text)

            location.append(city)
    
    df['job_title'] = job_title
    df['company_name'] = company_name
    df['location'] = location
    df['job_link'] = linklist
    df['Require Python?'] = python_input
    
    return df.drop_duplicates(['job_title','company_name'],keep= 'last')

In [5]:
DS_job = getlinkindeed ('data+scientist',pd.DataFrame())
DS_job['Type'] = 'Data Scientist'

In [6]:
DA_job = getlinkindeed ('data+analyst',pd.DataFrame())
DA_job['Type'] = 'Data Analyst'

In [7]:
DE_job = getlinkindeed ('data+engineer',pd.DataFrame())
DE_job['Type'] = 'Data Engineer'

In [8]:
frames = [DS_job, DA_job, DE_job]
Final_List = pd.concat(frames).reset_index(drop=True)
Final_List

Unnamed: 0,job_title,company_name,location,job_link,Require Python?,Type
0,"Data Scientist - Calgary, AB",RS Energy Group,Calgary,https://www.indeed.com//rc/clk?jk=ac48b39d18dc...,Y,Data Scientist
1,Water Quality Data Analyst,Government of Alberta,Calgary,https://www.indeed.com//rc/clk?jk=1a1867ce4cbc...,N,Data Scientist
2,"Data Scientist,Research Computing Services",University of Calgary,Calgary,https://www.indeed.com//rc/clk?jk=3bc37336d84c...,Y,Data Scientist
3,Junior Data Analyst,AGAT Laboratories,Calgary,https://www.indeed.com//company/AGAT-Laborator...,N,Data Scientist
4,Data Scientist +Python Developer,MVP Talent Corp,Calgary,https://www.indeed.com//company/MVP-Talent-Cor...,Y,Data Scientist
5,Soil Scientist,WorleyParsons,Calgary,https://www.indeed.com//rc/clk?jk=e4bcfa3d3623...,N,Data Scientist
6,"Senior Research Scientist (Research Scientist,...",NOVA Chemicals,Calgary,https://www.indeed.com//rc/clk?jk=b672eb513914...,N,Data Scientist
7,DATA SCIENTIST,BOWEN,Calgary,https://www.indeed.com//rc/clk?jk=8feb3a462af5...,N,Data Scientist
8,Environmental Scientist - LM&R,WorleyParsons,Calgary,https://www.indeed.com//rc/clk?jk=7e6b1c8bb675...,N,Data Scientist
9,Water Quality Scientist,Matrix Solutions Inc.,Calgary,https://www.indeed.com//rc/clk?jk=446857233d64...,N,Data Scientist


In [9]:
# save to csv with labels
Final_List.to_csv('Job_List_CreatedAt' +str(datetime.datetime.now().strftime("%Y-%m-%d"))+'_Last'+str(n_days)+'Days.csv')