In [1]:
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
# parse filters

import json

def read_config(file_path):
    with open(file_path, 'r') as file:
        config = json.load(file)
    return config

In [3]:
# Template Class
from abc import ABC, abstractmethod

class JobsScraper(ABC):

    # apply filters method?
    # essentially just appends filters to the url
    # maybe appending jos should be here? Config maybe?
    
    @abstractmethod
    def __init__(self, job, config):
        self.job = job
        self.config = config

    @abstractmethod
    def parse_url(self):
        pass

    @abstractmethod
    def scrape_site(self):
        pass

    # add data to df translation function or not?

    def refine_data(self, df):
        # By default just returns the same dataframe for when refinement
        # is not needed but can be overriden by concrete classes
        return df

    @classmethod
    def get_jobs(cls, job, config):
        instance = cls(job, config)

        url = instance.parse_url()
        df = instance.scrape_site(url)
        df = instance.refine_data(df)
        return df

In [4]:
class JobStreetScrapper(JobsScraper):

    def __init__(self, job, config):
        super().__init__(job, config)
        self.base_url = config['url']['jobstreet']

    def parse_url(self):
        filters = self.config['filters']
        # jobstreet_config = config['jobstreet']
        url = f'{self.config['url']['jobstreet']}/{self.job}-jobs'

        if filters['location']:
            url = f'{url}/in-{filters['location']}'

        if filters['daterange']:
            url = f'{url}?daterange={filters['daterange']}'

        return url
    
    def scrape_site(self, url):
        job_listings = []
        page = 1
        jobs_collected = 0
        status_code = 200
        number_of_jobs = 10

        # while status_code == 200:
            # url = f'{url}page={page}'
        response = requests.get(url)
        status_code = response.status_code

        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract job information
        #TODO: Validation to check that job title, company name, and job link should always be present
        #TODO: check for "next" page
        for job_card in soup.find_all(attrs={"data-automation":"normalJob"}):
            job_title = getattr(job_card.find(attrs={"data-automation":"jobTitle"}), 'text', None)
            company_name = getattr(job_card.find(attrs={"data-automation":"jobCompany"}), 'text', None)
            salary = getattr(job_card.find(attrs={"data-automation":"jobSalary"}), 'text', None)
            job_link = job_card.find(attrs={"data-automation":"jobTitle"})['href']

            job_listings.append({
                'job_title': job_title,
                'company_name': company_name,
                'salary': salary,
                'job_link': f'{self.base_url}{job_link}'
            })
            
            # jobs_collected += 1
            # if jobs_collected >= number_of_jobs:
            #     break

        # page += 1

        # else:
        #     if status_code == 404:
        #         print('No more pages to scrape')

        df = pd.DataFrame(job_listings)
        return df


In [5]:
class LinkedInScrapper(JobsScraper):

    def __init__(self, job, config):
        super().__init__(job, config)
        self.base_url = config['url']['linkedin']

    def parse_url(self):
        url = f'{self.base_url}/jobs/search?keywords={self.job}'

        filters = self.config['filters']

        if filters['location']:
            url = f'{url}&location={filters['location']}'

        if filters['daterange']:
            # convert days to seconds to match linkedin url syntax
            daterange = int(filters['daterange']) * 24 * 60 * 60
            url = f'{url}&f_TPR=r{daterange}'

        return url
    
    def scrape_site(self, url):
        #TODO: Add validation to check if we're in the expected page
        number_of_jobs = 10 #temp for testing
        # url = f'https://www.linkedin.com/jobs/search?keywords={job}&location=Philippines&trk=public_jobs_jobs'
        browser = webdriver.Chrome()
        browser.get(url)
        time.sleep(2)

        elem = browser.find_element(By.TAG_NAME, "body")
        job_cards = []

        max_iterations = 100 # to avoid an infinite loop
        iteration = 0
        job_cards_len = 0

        while (iteration < max_iterations) or (len(job_cards) == job_cards_len):
            browser.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

            job_cards = elem.find_elements(By.CLASS_NAME, 'base-card')
            
            if len(job_cards) == 0:
                print('No Items found')


            WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
            job_cards_len = len(job_cards)

            iteration += 1

        # while len(job_cards)<=number_of_jobs:
        #     job_cards = elem.find_elements(By.CLASS_NAME, 'base-card')
        #     elem.send_keys(Keys.PAGE_DOWN)
        #     time.sleep(2)

        # job_cards = job_cards[:number_of_jobs] #temp cut while filters are not yet implemented
        job_cards = [job_card.get_attribute('outerHTML') for job_card in job_cards]
        print(job_cards)
        browser.quit()

        job_listings = []
        for job_card in job_cards:
            job_card = BeautifulSoup(job_card,'html.parser')
            job_title = job_card.find(class_='base-search-card__title').text
            company_name = getattr(job_card.find(attrs={'data-tracking-control-name':'public_jobs_jserp-result_job-search-card-subtitle'}), 'text', None)
            salary = getattr(job_card.find(class_='job-search-card__salary-info'), 'text', None)
            job_link = job_card.find('a')['href']

            job_listings.append({
                'job_title': job_title,
                'company_name': company_name,
                'salary': salary,
                'job_link': job_link
            })

        df = pd.DataFrame(job_listings)

        return df
    
    def refine_data(self, df):
        df = df.replace('\n ', '', regex=True)
        df['job_title'] = df['job_title'].str.strip()
        df['company_name'] = df['company_name'].str.strip()
        return df
    
    # add wait before retry to avoid getting flagged and putting too much load on server

In [6]:
job = 'software engineer'
config = read_config('config.json')

In [8]:
jobstreet_jobs = JobStreetScrapper.get_jobs(job, config)

In [9]:
jobstreet_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer,REED ELSEVIER SHARED SERVICES (PHILIPPINES) INC.,,https://www.jobstreet.com.ph/job/73061650?type...
1,Java Developer,Innovations Group,"₱110,000 – ₱150,000 per month",https://www.jobstreet.com.ph/job/73061012?type...
2,Technical Support/Application Support Speciali...,KMC Solutions,,https://www.jobstreet.com.ph/job/73061006?type...
3,Civil Design Engineer,Meralco Industrial Engineering Services Corpor...,,https://www.jobstreet.com.ph/job/73060978?type...
4,Front End Developer,Innovations Group,"₱90,000 – ₱130,000 per month",https://www.jobstreet.com.ph/job/73060994?type...
5,RPA Developer (Blue Prism),"Indra Philippines, Inc.",,https://www.jobstreet.com.ph/job/73061905?type...
6,Software Asset Management Analyst,REED ELSEVIER SHARED SERVICES (PHILIPPINES) INC.,,https://www.jobstreet.com.ph/job/73061200?type...
7,Senior Data Engineer,Starbucks Philippines,,https://www.jobstreet.com.ph/job/73061037?type...
8,Virtual Assistant (Web Designer) Temp WFH - Op...,Personiv,,https://www.jobstreet.com.ph/job/73061050?type...
9,Web Designer | Temp WFH | (Open for Fresh Grad...,Personiv,,https://www.jobstreet.com.ph/job/73061029?type...


In [10]:
linkedin_jobs = LinkedInScrapper.get_jobs(job, config)

break condidtion entered
['<div class="base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card job-search-card--active" data-entity-urn="urn:li:jobPosting:3807534836" data-search-id="1zGKQSHQPR5XovGwiR2/pQ==" data-tracking-id="/94i+adEbi6aw9zWv1EKVQ==" data-column="1" data-row="1" data-visible-time="1705431886154" data-largest-intersection-ratio="1">\n        \n\n        <a class="base-card__full-link absolute top-0 right-0 bottom-0 left-0 p-0 z-[2]" href="https://ph.linkedin.com/jobs/view/software-developer-c%23-net-nz-software-solutions-home-based-at-connectos-3807534836?refId=1zGKQSHQPR5XovGwiR2%2FpQ%3D%3D&amp;trackingId=%2F94i%2BadEbi6aw9zWv1EKVQ%3D%3D&amp;position=1&amp;pageNum=0&amp;trk=public_jobs_jserp-result_search-card" data-tracking-control-name="public_jobs_jserp-result_search-card" data-tracking-client-ingraph="" data-tracking-will-navigate="">\n          \n          <span class="sr-only">\n   

In [11]:
linkedin_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Developer - C#/.NET (NZ Software Solu...,ConnectOS,,https://ph.linkedin.com/jobs/view/software-dev...
1,"Front End Developer (AU Digital Health, Hybrid)",ConnectOS,,https://ph.linkedin.com/jobs/view/front-end-de...
2,Software Engineer,Dyson,,https://ph.linkedin.com/jobs/view/software-eng...
3,Full Stack Engineer,Sinch,,https://ph.linkedin.com/jobs/view/full-stack-e...
4,FULL STACK DEVELOPER,Outsource Accelerator,,https://ph.linkedin.com/jobs/view/full-stack-d...
5,Full stack Java Developer,Genpact,,https://ph.linkedin.com/jobs/view/full-stack-j...
6,Software Development Engineer in Test (HMO on ...,Genpact,,https://ph.linkedin.com/jobs/view/software-dev...
7,Junior DevOps Engineer,CoreBridge Solutions,,https://ph.linkedin.com/jobs/view/junior-devop...
8,Back-End Developer (Officer),East West Banking Corporation,,https://ph.linkedin.com/jobs/view/back-end-dev...
9,Full stack Developer,William Hill,,https://ph.linkedin.com/jobs/view/full-stack-d...


In [12]:
jobs_df = pd.concat([jobstreet_jobs, linkedin_jobs]).reset_index(drop=True)

In [13]:
jobs_df

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer,REED ELSEVIER SHARED SERVICES (PHILIPPINES) INC.,,https://www.jobstreet.com.ph/job/73061650?type...
1,Java Developer,Innovations Group,"₱110,000 – ₱150,000 per month",https://www.jobstreet.com.ph/job/73061012?type...
2,Technical Support/Application Support Speciali...,KMC Solutions,,https://www.jobstreet.com.ph/job/73061006?type...
3,Civil Design Engineer,Meralco Industrial Engineering Services Corpor...,,https://www.jobstreet.com.ph/job/73060978?type...
4,Front End Developer,Innovations Group,"₱90,000 – ₱130,000 per month",https://www.jobstreet.com.ph/job/73060994?type...
5,RPA Developer (Blue Prism),"Indra Philippines, Inc.",,https://www.jobstreet.com.ph/job/73061905?type...
6,Software Asset Management Analyst,REED ELSEVIER SHARED SERVICES (PHILIPPINES) INC.,,https://www.jobstreet.com.ph/job/73061200?type...
7,Senior Data Engineer,Starbucks Philippines,,https://www.jobstreet.com.ph/job/73061037?type...
8,Virtual Assistant (Web Designer) Temp WFH - Op...,Personiv,,https://www.jobstreet.com.ph/job/73061050?type...
9,Web Designer | Temp WFH | (Open for Fresh Grad...,Personiv,,https://www.jobstreet.com.ph/job/73061029?type...
