In [200]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install cloudscraper
# %pip install beautifulsoup4

In [267]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import random
from urllib.parse import quote_plus, urlparse, ParseResult
import math
import time
import concurrent.futures

In [268]:
def get_headers():
    """
    Build a browser-like set of HTTP request headers.

    The User-Agent value is picked at random from a fixed pool on every call;
    every other header value is constant.

    Returns:
    dict: Header names mapped to their string values.
    """
    agent_pool = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
    )

    # Draw the agent first so the dictionary below is a plain literal.
    chosen_agent = random.choice(agent_pool)

    return {
        "Connection": "keep-alive",
        "Cache-Control": "max-age=0",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": chosen_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-User": "?1",
        "Sec-Fetch-Dest": "document",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
    }

In [274]:
def scrape_jobs(job_search):
    """
    Scrape job postings from Simply Hired for a given job search term.

    This function is a generator: it paginates through the search results and
    yields one dictionary per job card as it goes. Wrap the call in
    ``pd.DataFrame(...)`` (or ``list(...)``) to collect the results.

    The function makes use of the 'cloudscraper' and 'BeautifulSoup' libraries for
    making HTTP requests and parsing the HTML response respectively.

    To avoid getting blocked, the function sleeps for a random 3 to 6 seconds
    before requesting every page after the first.

    Parameters:
    job_search (str): The job search term. Spaces and special characters in the term
                      are URL-encoded.

    Yields:
    dict: One job post with the keys 'date_of_job_post', 'title', 'job_location',
          'company_name', 'job_link' and 'job_summary'. A value is None when the
          corresponding element is missing from the job card. Cards without a
          title link are skipped because no job URL can be built for them.

    Raises:
    ValueError: If the first results page does not contain a readable total job
                count (for example no results, or the request was blocked).
    """

    def text_or_none(tag):
        # BeautifulSoup's find() returns None when nothing matches.
        return tag.text if tag is not None else None

    # Simply Hired lists this many jobs per results page.
    jobs_per_page = 20

    # Encode job_search for URL (replace spaces with '+' and other special characters)
    job_search = quote_plus(job_search)

    # Base URL for Simply Hired
    base_url = "https://www.simplyhired.ca"

    # Construct URL with search query
    search_url = f"{base_url}/search?q={job_search}"

    # One scraper and one set of headers are reused for every results page
    scraper = cloudscraper.create_scraper()
    headers = get_headers()

    # Fetch and parse the first results page
    response = scraper.get(search_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # Find total job count and calculate the number of pages to scrape
    total_text = text_or_none(soup.find('span', {'class': 'posting-total'}))
    if total_text is None:
        raise ValueError(
            f"Could not find the total job count on {search_url}; "
            "the search may have returned no results or the request was blocked."
        )
    # Tolerate surrounding whitespace and thousands separators such as "1,234";
    # int() still raises ValueError on anything that is not a number.
    total_job_count = int(total_text.strip().replace(",", ""))
    job_pages = math.ceil(total_job_count / jobs_per_page)

    # Loop through all pages
    for page in range(1, job_pages + 1):
        # Print the page number
        print(f"Scraping page {page}...")

        # The first page was already fetched above; request the later ones here
        if page > 1:
            # Sleep for a random 3-6 seconds between requests to avoid being blocked
            time.sleep(random.uniform(3, 6))

            page_url = f"{base_url}/search?q={job_search}&pn={page}"
            response = scraper.get(page_url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")

        # Find the list of jobs on the page
        job_list = soup.find('ul', {'class': 'jobs'})
        if job_list is None:
            # Keep what has been yielded so far instead of failing the whole scrape
            print(f"No job list found on page {page}; stopping pagination.")
            break

        # Loop through all jobs on the page
        for job in job_list.find_all('div', {'class': 'SerpJob-jobCard'}):
            # The title link carries the job's path in its 'data-mdref' attribute
            title_tag = job.find('h3', {'class': 'jobposting-title'})
            anchor = title_tag.find('a') if title_tag is not None else None
            link = anchor.get('data-mdref') if anchor is not None else None
            if not link:
                continue

            # Remove query parameters from the job URL
            link = urlparse(link)._replace(query='').geturl()

            time_tag = job.find('time')

            # Yield the job details as a dictionary
            yield {
                'date_of_job_post': time_tag.get('datetime') if time_tag is not None else None,
                'title': title_tag.text,
                'job_location': text_or_none(job.find('span', {'class': 'jobposting-location'})),
                'company_name': text_or_none(job.find('span', {'class': 'jobposting-company'})),
                'job_link': f"{base_url}{link}?isp=0&q={job_search}",
                'job_summary': text_or_none(job.find('p', {'class': 'jobposting-snippet'})),
            }

In [240]:
# Keep the first row for each job link; reassigning (rather than inplace=True)
# follows the pandas recommendation and keeps the step chainable.
jobs_df = jobs_df.drop_duplicates(subset='job_link', keep="first")


In [241]:
# Count the rows that are exact duplicates (across all columns) of an earlier row.
jobs_df.duplicated().sum()

0

In [242]:
# Summarize jobs_df: index range, columns, non-null counts and dtypes.
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 507 entries, 0 to 611
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   date_of_job_post  507 non-null    object
 1   title             507 non-null    object
 2   job_location      507 non-null    object
 3   company_name      507 non-null    object
 4   job_link          507 non-null    object
 5   job_summary       507 non-null    object
dtypes: object(6)
memory usage: 27.7+ KB


In [243]:
# Write the job listings to CSV without the DataFrame index.
# NOTE: the 'data/' directory must already exist.
jobs_df.to_csv('data/data_scientist_jobs.csv', index=False)

In [252]:
# Remove the column-width limit so the job URLs are shown in full.
# This changes the pandas display option globally for the rest of the session.
pd.set_option('display.max_colwidth', None)
# Preview the first ten job links.
jobs_df['job_link'].head(10)

0    https://www.simplyhired.ca/job/Tiz9efu8Gbf2yqVK4oCuf-uH2etT7S4517IS2UMqTonAM9lHWqIABQ?isp=0&q=Data+Scientist
1    https://www.simplyhired.ca/job/gnMlpa9NCMhcAHjRRpuwL4pMsTR_w9p4V7KqY91sXoL_QspI1c_psQ?isp=0&q=Data+Scientist
2    https://www.simplyhired.ca/job/dnDqxJfbA0BNLi3s7qXBBqvflRIknt3mZHDE05ilVEpYat7yGyHm0A?isp=0&q=Data+Scientist
3    https://www.simplyhired.ca/job/VAesZ_CXrODXtd7s3vMg6A8kV2wZZBuurPvnoAwGhRjbmZ1ZjhuYTg?isp=0&q=Data+Scientist
4    https://www.simplyhired.ca/job/hz4vgNdlPpqaBUZjfQNZ041OYmCoR6OlY19d_nh0E81eMPJcj8OCUg?isp=0&q=Data+Scientist
5    https://www.simplyhired.ca/job/NfZT4eMIwgIGaN76LkgqnkgsIQEsykQo2txVKRw6tHWUM_5mlZ32Zw?isp=0&q=Data+Scientist
6    https://www.simplyhired.ca/job/eDV2-7kb1ayYcwuRv5D7ODlcDt0sHR8PkxdCyTajEOMEnbnOE6CuAg?isp=0&q=Data+Scientist
7    https://www.simplyhired.ca/job/NkJjM_dzhGkWQLkM7IdXV0aWj00h65IyiG-to5x-YfMqQPol04oo-g?isp=0&q=Data+Scientist
8    https://www.simplyhired.ca/job/y1lefm-wNMoxBUDzkGAPikKUwYc_gVVbI6oGzOEd7rckT7jpTfsC

In [285]:
def get_job_details(job_url):
    """
    Fetch a job posting page and extract additional details from it.

    Parameters:
    job_url (str): URL of the job posting page.

    Returns:
    dict: A dictionary with the keys
          'job_type' (str or None),
          'job_qualifications' (list of str, or None when there are none) and
          'job_description' (str or None).
          Every value is None when the page has no 'viewjob-content' container,
          so one unparseable page does not abort a batch of lookups.
    """
    # Create a new scraper for each job detail page
    scraper = cloudscraper.create_scraper()
    headers = get_headers()
    response = scraper.get(job_url, headers=headers)
    soup = BeautifulSoup(response.text, "html.parser")

    # Container holding the job description, job type, qualifications, etc.
    job_details = soup.find('div', {'class': 'viewjob-content'})
    if job_details is None:
        # find() returns None when the container is absent; fall back to the
        # same None defaults used for the individual fields below.
        return {
            'job_type': None,
            'job_qualifications': None,
            'job_description': None
        }

    # Job type
    job_type_tag = job_details.find('span', {'class': 'viewjob-jobType'})
    job_type = job_type_tag.text if job_type_tag else None  # Default to None if tag is not found

    # Job qualifications
    qualifications_tags = job_details.find_all('li', {'class': 'viewjob-qualification'})
    job_qualifications = [qual_tag.text for qual_tag in qualifications_tags] if qualifications_tags else None

    # Job description
    job_description_tag = job_details.find('div', {'data-testid': 'VJ-section-content-jobDescription'})
    job_description = job_description_tag.text if job_description_tag else None

    return {
        'job_type': job_type,
        'job_qualifications': job_qualifications,
        'job_description': job_description
    }


In [286]:
if __name__=="__main__":
    # Collect every search result into a DataFrame, one row per yielded job.
    job_data_generator = scrape_jobs("Data Scientist")
    jobs_df = pd.DataFrame(job_data_generator)

    # Fetch the detail page of every job concurrently; executor.map returns
    # the results in the same order as the links it was given.
    job_links = jobs_df['job_link'].tolist()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        detail_results = executor.map(get_job_details, job_links)
        job_details_list = [details for details in detail_results]

    # One row of details per job, in the same order as jobs_df
    jobs_details_df = pd.DataFrame(job_details_list)

    # Append the detail columns to the listing columns (aligned on the index)
    jobs_df = pd.concat([jobs_df, jobs_details_df], axis="columns")

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...


In [292]:
# Keep the first row for each job link; reassigning (rather than inplace=True)
# follows the pandas recommendation and keeps the step chainable.
jobs_df = jobs_df.drop_duplicates(subset='job_link', keep="first")


In [293]:
# Summarize jobs_df: index range, columns, non-null counts and dtypes.
jobs_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 508 entries, 0 to 596
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   date_of_job_post    508 non-null    object
 1   title               508 non-null    object
 2   job_location        508 non-null    object
 3   company_name        508 non-null    object
 4   job_link            508 non-null    object
 5   job_summary         508 non-null    object
 6   job_type            377 non-null    object
 7   job_qualifications  508 non-null    object
 8   job_description     508 non-null    object
dtypes: object(9)
memory usage: 39.7+ KB


In [290]:
 # Save the enriched DataFrame (listings plus scraped job details) as a CSV file.
# NOTE: this is the same path used by the earlier export cell, so it overwrites
# that file; the 'data/' directory must already exist.
jobs_df.to_csv('data/data_scientist_jobs.csv', index=False)

In [212]:
# # Indeed jobs
# count = window.mosaic.providerData['mosaic-provider-jobcards'].metaData.mosaicProviderJobCardsModel.tierSummaries[0].jobCount
# window.mosaic.providerData['mosaic-provider-jobcards'].metaData.mosaicProviderJobCardsModel.results


In [None]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import random
from urllib.parse import quote_plus, urlparse, ParseResult
import math
import time
import concurrent.futures

class SimplyHiredJobScraper:
    """
    Scraper for job listings on simplyhired.ca.

    NOTE(review): work in progress. ``scrape_jobs`` only prepares the search URL
    and an HTTP client, and ``get_job_details`` is an empty stub.
    """

    def __init__(self, job_roles, locations):
        """
        Remember what to search for.

        Parameters:
        job_roles: Job roles to search for (presumably a list of strings;
                   confirm against callers).
        locations: Locations to search in (presumably a list of strings;
                   confirm against callers).
        """
        self.base_url = "https://www.simplyhired.ca"
        self.locations = locations
        self.job_roles = job_roles

    @staticmethod
    def get_headers():
        """
        Build a browser-like set of HTTP request headers.

        The User-Agent value is picked at random from a fixed pool on every
        call; every other header value is constant.

        Returns:
        dict: Header names mapped to their string values.
        """
        agent_pool = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
        )

        # Draw the agent first so the dictionary below is a plain literal.
        chosen_agent = random.choice(agent_pool)

        return {
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": chosen_agent,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-User": "?1",
            "Sec-Fetch-Dest": "document",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
        }

    def scrape_jobs(self, job_role, location):
        """
        Prepare a search for one job role in one location.

        NOTE(review): unfinished. The search URL and the scraper are created
        but no request is made, and the method returns None.

        Parameters:
        job_role (str): Job role to search for; URL-encoded before use.
        location (str): Location to search in; URL-encoded before use.
        """
        # URL-encode both search terms (spaces become '+')
        encoded_role, encoded_location = quote_plus(job_role), quote_plus(location)

        # Search URL carrying the role as 'q' and the location as 'l'
        search_url = f"{self.base_url}/search?q={encoded_role}&l={encoded_location}"

        # HTTP client for the request; not used yet
        http_client = cloudscraper.create_scraper()

    def get_job_details(self, job_url):
        """Placeholder for fetching the details of one job posting; not implemented yet."""
        pass