In [1]:
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# parse filters

import json

def read_config(file_path):
    """Load the scraper configuration from a JSON file.

    The file is decoded as UTF-8 explicitly so that non-ASCII values (for
    example location names) are read identically on every platform instead
    of depending on the locale's default encoding.

    Args:
        file_path: Path to the JSON configuration file.

    Returns:
        The decoded JSON content. The rest of this notebook indexes it as a
        dict with ``'url'`` and ``'filters'`` keys.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [3]:
# Search keyword used when building the job-search URLs below.
job = "software engineer"

In [None]:
# Prototype: build the LinkedIn search URL from the shared config.
# The config and its filters are loaded in this cell so that it runs on a
# fresh kernel instead of relying on names defined by later cells.
config = read_config("config.json")
filters = config["filters"]

url = f"{config['url']['linkedin']}/jobs/search?keywords={job}"

if filters["location"]:
    url = f"{url}&location={filters['location']}"

if filters["daterange"]:
    # convert days to seconds to match linkedin url syntax (f_TPR=r<seconds>)
    daterange = int(filters["daterange"]) * 24 * 60 * 60
    url = f"{url}&f_TPR=r{daterange}"

url

In [4]:
# Template Class
from abc import ABC, abstractmethod

class JobsScraper(ABC):
    """Template for site-specific job scrapers.

    Concrete scrapers implement ``parse_url`` and ``scrape_site``; they may
    also override ``refine_data``. ``get_jobs`` runs the shared pipeline:
    build the search URL, scrape it, then refine the scraped result.
    """

    # TODO: decide whether an "apply filters" step (which essentially just
    # appends filters to the URL) and appending the job keyword belong here
    # or in the config.

    def __init__(self, job, config):
        """Store the inputs shared by every scraper.

        This initializer is concrete (not abstract) so that subclasses which
        need no extra state can inherit it as-is.

        Args:
            job: Job keyword to search for.
            config: Parsed configuration; concrete scrapers read their base
                URL and filters from it.
        """
        self.job = job
        self.config = config

    @abstractmethod
    def parse_url(self):
        """Build and return the search URL for ``self.job``."""

    @abstractmethod
    def scrape_site(self, url):
        """Scrape the listings found at ``url``.

        Args:
            url: The search URL produced by ``parse_url``.

        Returns:
            The scraped listings; ``get_jobs`` passes this value straight to
            ``refine_data``.
        """

    # TODO: add a data-to-DataFrame translation hook, or not?

    def refine_data(self, df):
        """Post-process the scraped data.

        By default the input is returned unchanged, for scrapers that need no
        refinement; concrete classes can override this.
        """
        return df

    @classmethod
    def get_jobs(cls, job, config):
        """Run the full pipeline for one scraper class.

        Args:
            job: Job keyword to search for.
            config: Parsed configuration forwarded to the scraper.

        Returns:
            The output of ``refine_data`` applied to the scraped listings.
        """
        scraper = cls(job, config)
        url = scraper.parse_url()
        listings = scraper.scrape_site(url)
        return scraper.refine_data(listings)

In [5]:
class JobStreetScrapper(JobsScraper):
    """Scrape the first page of JobStreet search results for a job keyword."""

    # Seconds to wait for JobStreet before giving up on the request, so a
    # stalled connection cannot hang the notebook indefinitely.
    REQUEST_TIMEOUT = 30

    # Columns of the returned DataFrame; passing them explicitly keeps the
    # schema stable even when no listing is found.
    COLUMNS = ['job_title', 'company_name', 'salary', 'job_link']

    def __init__(self, job, config):
        """Store the inputs and the JobStreet base URL from the config.

        Args:
            job: Job keyword to search for.
            config: Parsed configuration; must provide
                ``config['url']['jobstreet']`` and ``config['filters']``.
        """
        super().__init__(job, config)
        self.base_url = config['url']['jobstreet']

    def parse_url(self):
        """Build the JobStreet search URL from the job keyword and filters.

        The ``location`` filter is appended as an ``/in-<location>`` path
        segment and the ``daterange`` filter as a ``daterange`` query
        parameter; a falsy filter value is skipped.

        Returns:
            The search URL as a string.
        """
        filters = self.config['filters']
        url = f"{self.base_url}/{self.job}-jobs"

        if filters['location']:
            url = f"{url}/in-{filters['location']}"

        if filters['daterange']:
            url = f"{url}?daterange={filters['daterange']}"

        return url

    def scrape_site(self, url):
        """Fetch ``url`` and extract one row per job card.

        Args:
            url: Search URL produced by ``parse_url``.

        Returns:
            A DataFrame with the columns in ``COLUMNS``. A field whose tag is
            missing from a card is stored as ``None``.

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        #TODO: check for "next" page and follow it; only the first page is scraped for now
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract job information
        #TODO: Validation to check that job title, company name, and job link should always be present
        job_listings = []
        for job_card in soup.find_all(attrs={"data-automation": "normalJob"}):
            # The title element carries both the text and the link, so look
            # it up once and tolerate its absence for both fields.
            title_tag = job_card.find(attrs={"data-automation": "jobTitle"})
            href = title_tag.get('href') if title_tag is not None else None

            job_listings.append({
                'job_title': getattr(title_tag, 'text', None),
                'company_name': getattr(job_card.find(attrs={"data-automation": "jobCompany"}), 'text', None),
                'salary': getattr(job_card.find(attrs={"data-automation": "jobSalary"}), 'text', None),
                # hrefs are appended to the site's base URL, as before
                'job_link': f"{self.base_url}{href}" if href else None,
            })

        return pd.DataFrame(job_listings, columns=self.COLUMNS)


In [None]:
class LinkedInScrapper(JobsScraper):
    """Scrape LinkedIn's public job search through a Selenium-driven Chrome.

    Implements the full ``JobsScraper`` template (``__init__``, ``parse_url``
    and ``scrape_site(url)``) so the class can be instantiated and driven by
    ``get_jobs(job, config)``.
    """

    # Fallbacks used when the config does not provide the value.
    DEFAULT_BASE_URL = 'https://www.linkedin.com'
    DEFAULT_LOCATION = 'Philippines'

    MAX_JOBS = 10  # temp for testing: cap on the number of cards collected
    MAX_SCROLLS = 20  # upper bound on PAGE_DOWN presses so the loop always ends
    SCROLL_PAUSE_SECONDS = 2  # wait for the page to load more cards

    # Columns of the returned DataFrame; passing them explicitly keeps the
    # schema stable (and refine_data working) when no card is found.
    COLUMNS = ['job_title', 'company_name', 'salary', 'job_link']

    def __init__(self, job, config):
        """Store the inputs and resolve the LinkedIn base URL.

        Args:
            job: Job keyword to search for.
            config: Parsed configuration dict. ``config['url']['linkedin']``
                and ``config['filters']`` are used when present.
        """
        super().__init__(job, config)
        self.base_url = config.get('url', {}).get('linkedin', self.DEFAULT_BASE_URL)

    def parse_url(self):
        """Build the LinkedIn job-search URL from the keyword and filters.

        Query values are URL-encoded. The ``daterange`` filter (in days) is
        converted to seconds to match LinkedIn's ``f_TPR=r<seconds>`` syntax.

        Returns:
            The search URL as a string.
        """
        filters = self.config.get('filters', {})
        params = {
            'keywords': self.job,
            'location': filters.get('location') or self.DEFAULT_LOCATION,
            'trk': 'public_jobs_jobs',
        }

        if filters.get('daterange'):
            seconds = int(filters['daterange']) * 24 * 60 * 60
            params['f_TPR'] = f"r{seconds}"

        return f"{self.base_url}/jobs/search?{urlencode(params)}"

    def scrape_site(self, url):
        """Open ``url`` in Chrome, scroll to load cards, and parse them.

        Args:
            url: Search URL produced by ``parse_url``.

        Returns:
            A DataFrame with the columns in ``COLUMNS`` holding at most
            ``MAX_JOBS`` rows.
        """
        #TODO: Add validation to check if we're in the expected page
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            time.sleep(self.SCROLL_PAUSE_SECONDS)

            body = browser.find_element(By.TAG_NAME, "body")
            job_cards = []

            # Scroll until enough cards are loaded, but give up after
            # MAX_SCROLLS attempts so a short result list cannot loop forever.
            for _ in range(self.MAX_SCROLLS):
                job_cards = body.find_elements(By.CLASS_NAME, 'base-card')
                if len(job_cards) >= self.MAX_JOBS:
                    break
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(self.SCROLL_PAUSE_SECONDS)

            # Copy the markup out before the browser closes; the elements
            # cannot be read once the session is gone.
            cards_html = [
                card.get_attribute('outerHTML')
                for card in job_cards[:self.MAX_JOBS]  #temp cut while filters are not yet implemented
            ]
        finally:
            # Always release the browser, even when scraping fails.
            browser.quit()

        job_listings = [self._parse_card(card_html) for card_html in cards_html]
        return pd.DataFrame(job_listings, columns=self.COLUMNS)

    @staticmethod
    def _parse_card(card_html):
        """Extract the listing fields from one job card's HTML.

        A field whose tag is missing is returned as ``None``.
        """
        card = BeautifulSoup(card_html, 'html.parser')
        title_tag = card.find(class_='base-search-card__title')
        company_tag = card.find(attrs={'data-tracking-control-name': 'public_jobs_jserp-result_job-search-card-subtitle'})
        salary_tag = card.find(class_='job-search-card__salary-info')
        link_tag = card.find('a', class_='base-card__full-link')

        return {
            'job_title': getattr(title_tag, 'text', None),
            'company_name': getattr(company_tag, 'text', None),
            'salary': getattr(salary_tag, 'text', None),
            'job_link': link_tag.get('href') if link_tag is not None else None,
        }

    def refine_data(self, df):
        """Remove the newline/indent residue from the scraped text.

        Every ``'\\n '`` match is removed from the frame, then the job title
        and company name are stripped of surrounding whitespace.

        Returns:
            A new, cleaned DataFrame; the input frame is not modified.
        """
        return df.replace('\n ', '', regex=True).assign(
            job_title=lambda frame: frame['job_title'].str.strip(),
            company_name=lambda frame: frame['company_name'].str.strip(),
        )

    # TODO: add wait before retry to avoid getting flagged and putting too much load on server

In [6]:
# Inputs for the scraper runs below: the search keyword and the parsed config.
job = "software engineer"
config = read_config("config.json")

In [7]:
# Run the JobStreet pipeline: build the URL, scrape it, refine the result.
jobstreet_jobs = JobStreetScrapper.get_jobs(job=job, config=config)

In [8]:
# Preview only the first rows rather than dumping the whole frame.
jobstreet_jobs.head(10)

Unnamed: 0,job_title,company_name,salary,job_link
0,Full Stack .Net Developer,Transworld Systems Customer Services Philippin...,"₱100,000 – ₱120,000 per month",https://www.jobstreet.com.ph/job/73028412?type...
1,Senior Dashboard Developer - World's Biggest S...,Concentrix Philippines,,https://www.jobstreet.com.ph/job/73029523?type...
2,Senior Back-End Developer,"Eisenbach Consulting, LLC",,https://www.jobstreet.com.ph/job/73029412?type...
3,Automation Quality Assurance Engineer (WFH - W...,"Ehrlich IT Services, Inc.","₱50,000 – ₱70,000 per month",https://www.jobstreet.com.ph/job/73028273?type...
4,Transfer & Payment Technology Lead (Officer),East West Banking Corporation,,https://www.jobstreet.com.ph/job/73029527?type...
5,Quality Test Analyst,REED ELSEVIER SHARED SERVICES (PHILIPPINES) INC.,,https://www.jobstreet.com.ph/job/73028269?type...
6,Tester,"Eisenbach Consulting, LLC",,https://www.jobstreet.com.ph/job/73029433?type...
7,Product Owner for Non-Application (Officer),East West Banking Corporation,,https://www.jobstreet.com.ph/job/73029548?type...
8,Scrum Master (Officer),East West Banking Corporation,,https://www.jobstreet.com.ph/job/73029153?type...
9,URGENT HIRING! Release Train Engineer,Tech Mahindra Limited,"₱80,000 – ₱100,000 per month",https://www.jobstreet.com.ph/job/73029798?type...


In [None]:
# get_jobs needs both the keyword and the config: it instantiates cls(job, config).
linkedin_jobs = LinkedInScrapper.get_jobs(job, config)
linkedin_jobs

In [None]:
# Stack both sources into one frame with a fresh 0..n-1 index.
jobs_df = pd.concat([jobstreet_jobs, linkedin_jobs], ignore_index=True)

In [None]:
# Display the combined JobStreet + LinkedIn listings.
jobs_df