In [16]:
import datetime as dt
import os
import time
from urllib.parse import quote

import bs4
import pandas as pd
import requests
from bs4 import BeautifulSoup

This notebook is based on the following script, with light adjustments:
https://github.com/leowalker89/DataJobs/blob/main/BS4_Indeed_functional.py 

In [17]:
def web_scrape_api_call(url_to_scrape, api_key=None, timeout=60):
    '''
    This function requests a page through the webscrapingapi.com endpoint
    and hands back the raw response.
    input:
        url_to_scrape (str): address of the page to retrieve
        api_key (str, optional): key sent to the API. When omitted, the
            WEBSCRAPINGAPI_KEY environment variable is used, falling back to
            the 'insert_api_key' placeholder
        timeout (float, optional): seconds to wait for the API before
            requests raises a Timeout; None waits indefinitely
    output:
        response: the requests response object returned by the API call
    '''
    if api_key is None:
        # Prefer the environment so a real key never has to live in the notebook.
        api_key = os.environ.get('WEBSCRAPINGAPI_KEY', 'insert_api_key')
    url = "https://api.webscrapingapi.com/v1"
    params = {
        "api_key": api_key,
        "url": url_to_scrape,
    }
    # Without a timeout a stalled connection would block the whole scrape.
    response = requests.get(url, params=params, timeout=timeout)
    return response

In [18]:
def make_indeed_url(search_job, search_location):
    '''
    This function takes in 2 search parameters and inserts them into a
    ca.indeed.com url to search for jobs matching those parameters
    input:
        search_job (str): job title being searched for
        search_location (str): city, state/province being searched,
            e.g. "Toronto, ON"
    output:
        indeed_job_url (str): url to indeed jobs of the given parameters
    '''
    # Percent-encode every reserved character (not only spaces and commas) so
    # that titles containing '&', '+', '#' or '/' cannot corrupt the query string.
    job = quote(search_job, safe='')
    location = quote(search_location, safe='')
    # NOTE(review): the vjk value is a fixed token carried over from the
    # original search url; its meaning is not visible here - confirm it is
    # still required.
    indeed_job_url = f'https://ca.indeed.com/jobs?q={job}&l={location}&limit=100&vjk=a03b94ed80e9a099'
    return indeed_job_url

In [19]:
def _strip_new_badge(raw_title):
    '''
    Drop a leading "new" marker from a job title while leaving titles that
    merely begin with the letters n, e or w untouched.
    '''
    title = raw_title.lstrip('\n')
    if title.startswith('new'):
        remainder = title[len('new'):]
        first_char = remainder[:1]
        # Only treat "new" as a marker when it is separated from the title by
        # a newline or glued straight onto a capitalised/numeric title;
        # otherwise it is part of a word such as "newsletter".
        if first_char == '\n' or first_char.isupper() or first_char.isdigit():
            title = remainder.lstrip('\n')
    return title


def _card_text(job_meta, tag, css_class, default):
    '''
    Return the text of the first `tag` element with class `css_class` inside
    job_meta, or `default` when the lookup fails.
    '''
    try:
        return job_meta.find(tag, {'class': css_class}).get_text()
    except Exception:
        # Best-effort lookup: a missing element (find() returned None) or an
        # unusable job_meta falls back to the placeholder text.
        return default


def scrape_job_card(job_meta):
    '''
    This function takes in a job card element from an indeed.com results page
    and extracts the job title, company name, company location, and
    estimated salary. Any field that cannot be found is replaced by a
    placeholder string.
    input:
        job_meta: parsed html element exposing find(...) and get_text(),
            as produced by BeautifulSoup
    output:
        - job_title, str ('No job title found' when missing)
        - company_name, str ('No Company Name' when missing)
        - company_location, str ('No Location' when missing)
        - estimated_salary, str ('No Estimated Salary' when missing)
    '''
    job_title = _card_text(job_meta, 'h2', 'jobTitle', None)
    if job_title is None:
        job_title = 'No job title found'
    else:
        # str.lstrip('new\n') removes any leading n/e/w characters, which
        # mangled titles such as "eCommerce Analyst"; strip the marker only.
        job_title = _strip_new_badge(job_title)
    company_name = _card_text(job_meta, 'span', 'companyName', 'No Company Name')
    company_location = _card_text(job_meta, 'div', 'companyLocation', 'No Location')
    estimated_salary = _card_text(
        job_meta, 'div', 'metadata salary-snippet-container', 'No Estimated Salary')
    return job_title, company_name, company_location, estimated_salary

In [20]:
def scrape_job_description(job_desc_href):
    '''
    This function downloads a job posting page and extracts the text of the
    element whose id is 'jobDescriptionText'.
    input:
        job_desc_href (str): full url of the job posting page
    output:
        job_desc, str, with newlines replaced by spaces and carriage returns
        removed; can be long (the original notes an average of 3,000-7,000
        characters). 'No Job Description' is returned when the page cannot
        be fetched or the description element is missing.
    '''
    try:
        page = web_scrape_api_call(job_desc_href)
        soup = BeautifulSoup(page.content, 'html.parser')
        description_node = soup.find(id='jobDescriptionText')
        # find() returns None when the element is absent; the attribute
        # access below then raises and the placeholder is used instead.
        job_desc = description_node.text.replace('\n', ' ').replace('\r', '')
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) can still stop a long scrape.
        job_desc = 'No Job Description'
    return job_desc

In [21]:
def scrape_job_page_meta(job_page_html):
    '''
    This function takes in a html job search page and uses beautiful soup to
    extract each job's title, company name, company location, estimated
    salary and job description href, then uses that href to open the job
    description page and extract the job description. One row per job card
    is collected and returned as a pandas dataframe.
    input:
        job_page_html: response from the indeed search request; its .text
            attribute is parsed as html
    output:
        jobs_df: pandas dataframe with the columns job_title, company_name,
            company_location, est_salary, job_href and job_desc (empty when
            no job cards are found)
    '''
    page_soup = BeautifulSoup(job_page_html.text, 'lxml')
    df_columns = ['job_title', 'company_name', 'company_location', 'est_salary', 'job_href', 'job_desc']
    # Rows are gathered in a plain list and turned into a dataframe once:
    # DataFrame.append was removed in pandas 2.0 and copied the frame per row.
    job_records = []
    for job in page_soup.find_all('div', {"id": "mosaic-provider-jobcards"}):
        for href_post in job.find_all('a', href=True):
            # Only anchors that themselves contain another link are kept.
            # NOTE(review): presumably this singles out the job posting
            # links - verify against the current page markup.
            if not href_post.find('a', href=True):
                continue
            job_cards = href_post.find_all('div', {"class": "job_seen_beacon"})
            if not job_cards:
                # No card to record for this link, so skip the extra
                # description request entirely.
                continue
            job_desc_href = 'https://ca.indeed.com' + str(href_post['href'])
            job_desc = scrape_job_description(job_desc_href)
            for job_meta in job_cards:
                job_title, company_name, company_location, estimated_salary = scrape_job_card(job_meta)
                print(f'{job_title}, {job_desc_href}')
                job_records.append({'job_title': job_title,
                                    'company_name': company_name,
                                    'company_location': company_location,
                                    'est_salary': estimated_salary,
                                    'job_href': job_desc_href,
                                    'job_desc': job_desc})
    jobs_df = pd.DataFrame(job_records, columns=df_columns)
    return jobs_df



In [22]:
def job_loc_scrape_loop(job_list, loc_list):
    '''
    This function takes in a list of job titles and a list of locations.
    It loops through every job/location pair, creates a url to call and
    scrapes the job info from each results page.
    input:
        job_list: iterable of job title strings
        loc_list: iterable of location strings, e.g. "Toronto, ON"
    output:
        fin_df: pandas dataframe holding the rows from every search, with the
            columns job_title, company_name, company_location, est_salary,
            job_href and job_desc (empty when nothing was scraped)
    '''
    df_columnsfinal = ['job_title', 'company_name', 'company_location', 'est_salary', 'job_href', 'job_desc']
    # Per-search frames are collected and concatenated once at the end;
    # DataFrame.append was removed in pandas 2.0.
    result_frames = []
    for job in job_list:
        for loc in loc_list:
            print(f'Scraping: {job} {loc}')
            indeed_url = make_indeed_url(job, loc)
            try:
                indeed_response = web_scrape_api_call(indeed_url)
            except requests.RequestException as err:
                # One failed request should not throw away the results
                # already gathered; report it and move on.
                print(f'Request failed for {job} {loc}: {err}')
                continue
            result_df = scrape_job_page_meta(indeed_response)
            if not result_df.empty:
                result_frames.append(result_df)

    if not result_frames:
        # pd.concat raises on an empty list, so build the empty frame directly.
        return pd.DataFrame(columns=df_columnsfinal)
    fin_df = pd.concat(result_frames, ignore_index=True)
    return fin_df

In [23]:
# Job titles to search for.
job_list = [
    "Business Intelligence Manager",
    "BI Manager",
    "BI Analyst",
    "Data Analyst",
]

# Locations to search, written as "City, Province".
primary_city_state_list = [
    "Toronto, ON",
    "Ottawa, ON",
    "Vancouver, BC",
    "Kitchener-Waterloo, ON",
    "Montreal, QC",
    "Calgary, AB",
    "Victoria, BC",
    "Halifax, NS",
    "Quebec City, QC",
    "Hamilton, ON",
    "Burnaby, BC",
    "North Vancouver, BC",
    "New Westminster, BC",
    "Richmond, BC",
    "Delta, BC",
]

# Scrape every job title against every location (one search per pair).
final_output = job_loc_scrape_loop(job_list, primary_city_state_list)


Scraping: Business Intelligence Manager Toronto, ON
Scraping: Business Intelligence Manager Ottawa, ON
Scraping: Business Intelligence Manager Vancouver, BC
Business Intelligence Analyst - MP&A, https://ca.indeed.com/rc/clk?jk=7d03a28429817625&fccid=c899d1c8c8e6ff07&vjs=3
Solution Consultant, Business Intelligence, https://ca.indeed.com/company/Resolver/jobs/Solution-Consultant-340187b7173fdf92?fccid=d7c18b5ea9a454df&vjs=3
Senior Business Analyst, Business Intelligence, https://ca.indeed.com/rc/clk?jk=e82e0a11498e58d1&fccid=f66f721a44de3765&vjs=3
Financial Analyst, Business Intelligence, https://ca.indeed.com/company/Corza-Medical/jobs/Financial-Analyst-15a59864ccee6ba0?fccid=671342dca908b8b8&vjs=3
Business Intelligence (BI) Developer-Analyst, https://ca.indeed.com/rc/clk?jk=d9cf0c1748e22b22&fccid=e98b37c86f8eec83&vjs=3
Business Intelligence Engineer, https://ca.indeed.com/rc/clk?jk=466d74e262188c5d&fccid=fe2d21eef233e94a&vjs=3
Business Development Manager (Micromine Pitram), https://c

In [24]:
# Wrap the scrape result in a DataFrame (job_loc_scrape_loop already returns one).
final_output_df = pd.DataFrame(final_output)

In [25]:
# Save the scraped postings to jobsCND.csv, relative to the working directory.
# The row index is written as the first column (pandas default).
final_output_df.to_csv('jobsCND.csv')