In [12]:
import cloudscraper
from bs4 import BeautifulSoup
import pandas as pd
import random
from urllib.parse import quote_plus, urlparse, ParseResult
import math
import time
import concurrent.futures
import os

In [15]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr/research'

In [16]:
# Work from the project root so the relative 'artifacts/...' paths used below resolve.
# The guard keeps this cell idempotent: re-running it must not climb another level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [17]:
%pwd

'/Users/macbookpro/Desktop/pixi_hr_project/pixi_hr'

In [5]:
# !pip3 install cloudscraper
# !pip3 install bs4

In [24]:
class DataSourcing:
    """
    A class used to scrape job postings from SimplyHired based on specific job titles and locations.

    The class supports scraping multiple job titles across multiple locations and will only scrape new job postings
    not present in an existing dataset.

    Attributes:
    ----------
    base_url : str
        The base URL for SimplyHired job search (default is "https://www.simplyhired.ca").
    job_searches : list
        A list of job titles to search for (default is ["Data Scientist"]).
    locations : list
        A list of locations to search within (default is ["Toronto"]).
    job_search : str or None
        The job title currently being scraped by main(); None until main() runs.
    location : str or None
        The location currently being scraped by main(); None until main() runs.
    existing_job_links : list
        A list of job links already present in the dataset, used to prevent redundant scraping.

    Methods:
    -------
    _read_existing_jobs():
        Read the saved dataset from disk, or return None when nothing usable is there.
    _load_existing_job_links():
        Load job links from the existing dataset and return them.
    _get_headers():
        Generate random headers for web requests to mimic real browser requests.
    ensure_directory_exists(filepath):
        Ensure the directory for the given filepath exists. If not, create it.
    _construct_url(job_search, location, page=1):
        Construct the URL based on the job search query, location, and page number.
    scrape_jobs(job_search=None, location=None):
        Generator function that yields job data for each page and job, excluding jobs already present in the dataset.
    _get_soup(url):
        Fetch and parse the HTML content of a given URL.
    _extract_job_data(job, base_url="https://www.simplyhired.ca"):
        Extract essential job details from a BeautifulSoup job element and return as a dictionary.
    _get_job_details(job_url):
        Fetch and return additional job details for a given job URL.
    main():
        Main function to scrape job data, fetch job details, and save to CSV.
    """

    # Location of the persisted dataset, relative to the current working directory.
    DATASET_PATH = 'artifacts/sourcing/data_scientist_jobs.csv'

    # Page size used to turn the reported result count into a number of pages.
    JOBS_PER_PAGE = 20

    def __init__(self, base_url="https://www.simplyhired.ca", job_searches=None, locations=None):
        """
        Initialize the DataSourcing object.

        Parameters:
        - base_url (str): The base URL for the job search.
        - job_searches (list): List of job titles to search for. Defaults to ["Data Scientist"].
        - locations (list): List of locations to search within. Defaults to ["Toronto"].
        """
        self.base_url = base_url
        # None sentinels (instead of list literals in the signature) so instances never
        # share one mutable default; the inputs are copied for the same reason.
        self.job_searches = ["Data Scientist"] if job_searches is None else list(job_searches)
        self.locations = ["Toronto"] if locations is None else list(locations)
        # Set by main() for every (job title, location) combination it processes.
        self.job_search = None
        self.location = None
        self.existing_job_links = self._load_existing_job_links()

    def _read_existing_jobs(self):
        """Read the saved dataset from disk.

        Returns:
        - pandas.DataFrame or None: The saved dataset, or None when the file is missing or empty.
        """
        if not os.path.exists(self.DATASET_PATH):
            return None
        try:
            return pd.read_csv(self.DATASET_PATH)
        except pd.errors.EmptyDataError:
            # A zero-byte file holds no previous results.
            return None

    def _load_existing_job_links(self):
        """Load job links from the existing dataset and return them.

        Returns:
        - list: A list of existing job links (empty when there is no usable dataset).
        """
        existing_jobs_df = self._read_existing_jobs()
        if existing_jobs_df is None or 'job_link' not in existing_jobs_df.columns:
            return []
        return existing_jobs_df['job_link'].dropna().tolist()

    @staticmethod
    def _get_headers():
        """Generate random headers for web requests to mimic real browser requests.

        Returns:
        - dict: Dictionary of headers.
        """

        # A list of popular user-agents to mimic real browser requests.
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"
        ]

        headers = {
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": random.choice(user_agents),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-User": "?1",
            "Sec-Fetch-Dest": "document",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "en-GB,en-US;q=0.9,en;q=0.8",
        }

        return headers

    @staticmethod
    def ensure_directory_exists(filepath):
        """Ensure the directory for the given filepath exists. If not, create it.

        Parameters:
        - filepath (str): The path to check.
        """
        directory = os.path.dirname(filepath)
        # A bare filename has an empty dirname; os.makedirs("") would raise, and the
        # current directory already exists, so there is nothing to create.
        if directory:
            os.makedirs(directory, exist_ok=True)

    def _construct_url(self, job_search, location, page=1):
        """
        Construct the URL based on the job search query, location, and page number.

        Parameters:
        - job_search (str): The job title to search for.
        - location (str): The location to search within.
        - page (int): The page number.

        Returns:
        - str: Constructed URL.
        """
        job_query = quote_plus(job_search)
        location_query = quote_plus(location)
        # Tolerate a base URL given with a trailing slash.
        return f"{self.base_url.rstrip('/')}/search?q={job_query}&l={location_query}&pn={page}"

    def scrape_jobs(self, job_search=None, location=None):
        """
        Generator function that yields job data for each page and job, excluding jobs already present in the dataset.

        Parameters:
        - job_search (str): The job title to search for. Falls back to self.job_search when omitted.
        - location (str): The location to search within. Falls back to self.location when omitted.

        Yields:
        - dict: Dictionary containing job details.

        Raises:
        - ValueError: If no job title or location is given and none is set on the instance.
        """
        if job_search is None:
            job_search = getattr(self, 'job_search', None)
        if location is None:
            location = getattr(self, 'location', None)
        if job_search is None or location is None:
            raise ValueError(
                "scrape_jobs() needs a job_search and a location, either as arguments "
                "or via self.job_search / self.location"
            )

        soup = self._get_soup(self._construct_url(job_search, location))
        total_tag = soup.find('span', {'class': 'posting-total'})
        if total_tag is None:
            # No result counter on the page: nothing to paginate over.
            print(f"No job count found for '{job_search}' in '{location}'; skipping.")
            return
        total_job_count = int(total_tag.text.replace(',', ''))
        job_pages = math.ceil(total_job_count / self.JOBS_PER_PAGE)

        # Set membership is O(1); also tracks links yielded during this search so a
        # posting repeated on a later page is only reported once.
        seen_links = set(self.existing_job_links)

        for page in range(1, job_pages + 1):
            print(f"Scraping page {page}...")
            # The first request above already returned page 1 (pn defaults to 1).
            if page > 1:
                soup = self._get_soup(self._construct_url(job_search, location, page))
            job_list = soup.find('ul', {'class': 'jobs'})
            if job_list is None:
                continue
            for job in job_list.find_all('div', {'class': 'SerpJob-jobCard'}):
                job_data = self._extract_job_data(job, self.base_url)
                # Skip the job if it's already in the existing dataset or was already yielded
                if job_data['job_link'] in seen_links:
                    continue
                seen_links.add(job_data['job_link'])
                yield job_data

    def _get_soup(self, url):
        """Fetch and parse the HTML content of a given URL.

        A new scraper with freshly randomized headers is created for every call.

        Parameters:
        - url (str): The URL to fetch.

        Returns:
        - BeautifulSoup object: Parsed HTML content.
        """
        scraper = cloudscraper.create_scraper()
        headers = self._get_headers()
        response = scraper.get(url, headers=headers)
        return BeautifulSoup(response.text, "html.parser")

    @staticmethod
    def _extract_job_data(job, base_url="https://www.simplyhired.ca"):
        """Extract essential job details from a BeautifulSoup job element and return as a dictionary.

        The title element and its link are required; the remaining fields are set to
        None when their element is missing from the card.

        Parameters:
        - job (BeautifulSoup element): The job element to extract data from.
        - base_url (str): Site root prepended to the job's relative link.

        Returns:
        - dict: Dictionary containing essential job details.
        """
        title_tag = job.find('h3', {'class': 'jobposting-title'})
        title = title_tag.text
        link = title_tag.find('a').attrs['data-mdref']
        # Drop the query string so the same posting always maps to the same link.
        link = urlparse(link)._replace(query='').geturl()

        def text_of(name, css_class):
            tag = job.find(name, {'class': css_class})
            return tag.text if tag is not None else None

        time_tag = job.find('time')

        return {
            'date_of_job_post': time_tag.attrs.get('datetime') if time_tag is not None else None,
            'title': title,
            'job_location': text_of('span', 'jobposting-location'),
            'company_name': text_of('span', 'jobposting-company'),
            'job_link': f"{base_url.rstrip('/')}{link}",
            'job_summary': text_of('p', 'jobposting-snippet')
        }

    def _get_job_details(self, job_url):
        """Fetch and return additional job details for a given job URL.

        Parameters:
        - job_url (str): The URL of the job to fetch details for.

        Returns:
        - dict: Dictionary containing additional job details. Every value is None when
          the detail container is missing from the page.
        """
        # _get_soup creates a new scraper for each job detail page
        soup = self._get_soup(job_url)

        # Extract additional info, like job description, job type, qualifications, etc.
        job_details = soup.find('div', {'class': 'viewjob-content'})
        if job_details is None:
            return {
                'job_type': None,
                'job_qualifications': None,
                'job_description': None
            }

        # Job type (None if the tag is not found)
        job_type_tag = job_details.find('span', {'class': 'viewjob-jobType'})
        job_type = job_type_tag.text if job_type_tag else None

        # Job qualifications (None if there are none)
        qualifications_tags = job_details.find_all('li', {'class': 'viewjob-qualification'})
        job_qualifications = [qual_tag.text for qual_tag in qualifications_tags] if qualifications_tags else None

        # Job description (None if the tag is not found)
        job_description_tag = job_details.find('div', {'data-testid': 'VJ-section-content-jobDescription'})
        job_description = job_description_tag.text if job_description_tag else None

        return {
            'job_type': job_type,
            'job_qualifications': job_qualifications,
            'job_description': job_description
        }

    def main(self):
        """
        Main function to:
        1. Scrape job data.
        2. Fetch job details.
        3. Save to CSV, appending to existing data and ensuring no duplicates.

        When no new postings are found the dataset on disk is left untouched.
        """
        all_jobs = []

        # Iterate over each combination of job title and location
        for job_search in self.job_searches:
            for location in self.locations:
                self.job_search = job_search  # Update the current job search
                self.location = location      # Update the current location

                jobs_df = pd.DataFrame(self.scrape_jobs(job_search, location))
                if jobs_df.empty:
                    # Nothing new for this combination; an empty frame has no 'job_link' column.
                    continue

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    job_details_list = list(executor.map(self._get_job_details, jobs_df['job_link'].tolist()))

                jobs_details_df = pd.DataFrame(job_details_list)
                all_jobs.append(pd.concat([jobs_df, jobs_details_df], axis=1))

        if not all_jobs:
            # pd.concat([]) would raise; there is also nothing to write.
            print("No new job postings found; dataset left unchanged.")
            return

        # Combine all job data
        all_jobs_df = pd.concat(all_jobs, ignore_index=True)

        # Prepend existing data if there is any
        existing_jobs_df = self._read_existing_jobs()
        if existing_jobs_df is not None:
            all_jobs_df = pd.concat([existing_jobs_df, all_jobs_df], ignore_index=True)

        # The same posting can match several searches: keep its first occurrence.
        # Rows without a link are never treated as duplicates of each other.
        has_link = all_jobs_df['job_link'].notna()
        is_repeat = all_jobs_df.duplicated(subset='job_link', keep='first') & has_link
        all_jobs_df = all_jobs_df[~is_repeat].reset_index(drop=True)

        # Ensure the directory exists before saving the file
        self.ensure_directory_exists(self.DATASET_PATH)

        all_jobs_df.to_csv(self.DATASET_PATH, index=False)

        # Keep the in-memory view in sync so a second run skips what was just saved.
        self.existing_job_links = all_jobs_df['job_link'].dropna().tolist()

In [54]:
if __name__ == "__main__":
    # Every job title is searched in every city.
    job_titles = [
        "Data Scientist",
        "Data Engineer",
        "Machine Learning Engineer",
    ]
    cities = [
        "Toronto",
        "Vancouver",
        "Montreal",
    ]

    # Scrape all combinations and persist the results to the CSV dataset.
    data_sourcing = DataSourcing(locations=cities, job_searches=job_titles)
    data_sourcing.main()

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 1...
Scraping page 1...
Scraping page 1...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping p

In [44]:
# Load the transformed training data. The path is relative, so it resolves against the
# current working directory (changed with os.chdir("../") in an earlier cell).
df = pd.read_csv("artifacts/data_transformation/train_data.csv")

In [53]:
# NOTE: isna() yields booleans (never NaN), so count() returns the number of rows for
# every column. Use isna().sum() when the number of missing values per column is wanted.
df.isna().count()

date_of_job_post    1976
title               1976
job_location        1976
company_name        1976
job_link            1976
                    ... 
qual_xpath          1976
qual_xslt           1976
qual_xunit          1976
qual_zbrush         1976
qual_zookeeper      1976
Length: 556, dtype: int64

In [45]:
# Columns of the DataFrame that hold at least one NaN
nan_columns = df.columns[df.isna().any(axis=0)]

# Number of NaN values in each of those columns
nan_count = df.loc[:, nan_columns].isna().sum(axis=0)

# Show the per-column NaN counts
nan_count

date_of_job_post    1
title               1
job_location        1
company_name        1
job_link            1
                   ..
qual_xpath          1
qual_xslt           1
qual_xunit          1
qual_zbrush         1
qual_zookeeper      1
Length: 556, dtype: int64

In [46]:
# Keep every row that has a NaN in at least one column
rows_with_nan = df.loc[df.isna().any(axis="columns")]


In [42]:
rows_with_nan

Unnamed: 0,date_of_job_post,title,job_location,company_name,job_link,job_summary,job_type,job_description,date_of_job_post_temp,month_of_job_post,...,qual_woodworking,qual_wordpress,qual_workday,qual_writingskills,qual_xml,qual_xpath,qual_xslt,qual_xunit,qual_zbrush,qual_zookeeper
76,2023-08-13 01:03:15+00:00,169.0,22.0,727.0,https://www.simplyhired.ca/job/viByG-pZnRAAaKb...,Dans le cadre de ses ententes avec ses différe...,0.0,Dans le cadre de ses ententes avec ses différe...,2023-08-13 01:03:15+00:00,8.0,...,,,,,,,,,,
1161,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
