In [1]:
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [2]:
# parse filters

import json

def read_config(file_path):
    """Load the scraper configuration from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON document. Callers in this notebook index the result
        like a dict keyed by site name.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # JSON text is UTF-8; pin the encoding so the result does not depend on
    # the platform's locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [10]:
# Search keyword interpolated into the search URLs and passed to the scrapers' get_jobs().
job = 'software engineer'

In [4]:
config = read_config('config.json')
# Note to self: have the filters as one and not separate.

# Build the JobStreet search URL from the configured base URL and filters.
# The f-strings are double-quoted so the single-quoted subscript keys inside
# them parse on every Python 3 version; reusing the enclosing quote character
# inside a replacement field is only valid syntax from Python 3.12 (PEP 701).
jobstreet_config = config['jobstreet']
url = f"{jobstreet_config['url']}/{job}-jobs"

# Filters are optional: a missing key is treated the same as an empty value.
if jobstreet_config.get('location'):
    url = f"{url}/in-{jobstreet_config['location']}"

if jobstreet_config.get('daterange'):
    url = f"{url}?daterange={jobstreet_config['daterange']}"

In [5]:
# Build the LinkedIn job-search URL from the configured base URL and filters.
# NOTE: this rebinds `url`, replacing whatever an earlier cell stored in it.
# The f-strings are double-quoted so the single-quoted subscript keys inside
# them parse on every Python 3 version; reusing the enclosing quote character
# inside a replacement field is only valid syntax from Python 3.12 (PEP 701).
linkedin_config = config['linkedin']

url = f"{linkedin_config['url']}/jobs/search?keywords={job}"

# Filters are optional: a missing key is treated the same as an empty value.
if linkedin_config.get('location'):
    url = f"{url}&location={linkedin_config['location']}"

if linkedin_config.get('daterange'):
    # convert days to seconds to match linkedin url syntax
    daterange = int(linkedin_config['daterange']) * 24 * 60 * 60
    url = f'{url}&f_TPR=r{daterange}'

In [6]:
# Template Class
from abc import ABC, abstractmethod

class JobsScraper(ABC):
    """Template for site-specific job scrapers.

    Concrete subclasses must implement ``scrape_site``; ``refine_data`` is an
    optional hook. ``get_jobs`` is the entry point that runs both in order.
    """

    # Open design questions:
    # - Add an "apply filters" method? It would essentially just append
    #   filters to the url.
    # - Maybe appending jobs should be here? Config maybe?
    # - Add a "data to df" translation function or not?

    @classmethod
    def get_jobs(cls, job):
        """Instantiate the scraper, scrape ``job`` and return the refined result."""
        scraper = cls()
        return scraper.refine_data(scraper.scrape_site(job))

    def refine_data(self, df):
        """Post-process the scraped dataframe.

        The default implementation hands the dataframe back untouched, for
        scrapers that need no refinement; concrete classes may override it.
        """
        return df

    @abstractmethod
    def scrape_site(self, job):
        """Scrape listings for ``job``; must be implemented by subclasses."""

In [7]:
class JobStreetScrapper(JobsScraper):
    """Scraper for JobStreet Philippines search-result pages."""

    BASE_URL = 'https://www.jobstreet.com.ph'
    MAX_JOBS = 10          # number of listings to collect
    REQUEST_TIMEOUT = 10   # seconds to wait for each page before giving up

    def scrape_site(self, job):
        """Collect up to ``MAX_JOBS`` listings for ``job``.

        Result pages are requested one after another until enough listings
        are collected, a page responds with a non-200 status, or a page
        contains no job cards.

        Args:
            job: Search keyword interpolated into the search URL.

        Returns:
            A DataFrame with one row per listing and the columns
            ``job_title``, ``company_name``, ``salary`` and ``job_link``
            (an empty DataFrame when nothing was found).

        Raises:
            requests.RequestException: If a page request fails or times out.
        """
        job_listings = []
        page = 1

        while len(job_listings) < self.MAX_JOBS:
            url = f'{self.BASE_URL}/en/job-search/{job}-jobs?page={page}'
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)

            # Do not try to parse error pages.
            if response.status_code != 200:
                if response.status_code == 404:
                    print('No more pages to scrape')
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.find_all(attrs={"data-automation": "normalJob"})
            if not job_cards:
                # A 200 response without any cards would otherwise keep the
                # loop requesting further pages forever.
                break

            # Only take as many cards as are still needed.
            remaining = self.MAX_JOBS - len(job_listings)
            for job_card in job_cards[:remaining]:
                job_listings.append(self._parse_job_card(job_card))

            page += 1

        return pd.DataFrame(job_listings)

    def _parse_job_card(self, job_card):
        """Extract the listing fields from one job card; missing parts become None."""
        #TODO: Validation to check that job title, company name, and job link should always be present
        title_tag = job_card.find(attrs={"data-automation": "jobTitle"})
        company_tag = job_card.find(attrs={"data-automation": "jobCompany"})
        salary_tag = job_card.find(attrs={"data-automation": "jobSalary"})

        # The link lives on the title element; tolerate a missing element or
        # a missing href instead of raising while subscripting None.
        href = title_tag.get('href') if title_tag is not None else None

        return {
            'job_title': getattr(title_tag, 'text', None),
            'company_name': getattr(company_tag, 'text', None),
            'salary': getattr(salary_tag, 'text', None),
            'job_link': f'{self.BASE_URL}{href}' if href else None,
        }


In [8]:
class LinkedInScrapper(JobsScraper):
    """Scraper for LinkedIn's public job-search page, driven through Selenium."""

    MAX_JOBS = 10      # temp for testing, while filters are not yet implemented
    MAX_SCROLLS = 30   # upper bound on PAGE_DOWN presses while waiting for cards
    SCROLL_PAUSE = 2   # seconds to wait after loading / scrolling
    COLUMNS = ['job_title', 'company_name', 'salary', 'job_link']

    def scrape_site(self, job):
        """Collect up to ``MAX_JOBS`` listings for ``job``.

        Opens the search page in Chrome, scrolls until enough job cards are
        present (or ``MAX_SCROLLS`` is reached), then parses the cards' HTML.

        Args:
            job: Search keyword interpolated into the search URL.

        Returns:
            A DataFrame with the columns listed in ``COLUMNS``; it has no
            rows when no job card was found.
        """
        #TODO: Add validation to check if we're in the expected page
        url = f'https://www.linkedin.com/jobs/search?keywords={job}&location=Philippines&trk=public_jobs_jobs'
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            time.sleep(self.SCROLL_PAUSE)

            body = browser.find_element(By.TAG_NAME, "body")
            job_cards = body.find_elements(By.CLASS_NAME, 'base-card')

            # Scroll until enough cards exist. The attempt cap keeps the loop
            # from spinning forever when the page never yields enough cards.
            scrolls = 0
            while len(job_cards) < self.MAX_JOBS and scrolls < self.MAX_SCROLLS:
                body.send_keys(Keys.PAGE_DOWN)
                time.sleep(self.SCROLL_PAUSE)
                job_cards = body.find_elements(By.CLASS_NAME, 'base-card')
                scrolls += 1

            # Copy the markup out before the browser (and its elements) go away.
            cards_html = [
                job_card.get_attribute('outerHTML')
                for job_card in job_cards[:self.MAX_JOBS]
            ]
        finally:
            # Always release the browser, even if loading or scrolling failed.
            browser.quit()

        job_listings = [self._parse_job_card(card_html) for card_html in cards_html]

        # Explicit columns keep the frame's shape stable when nothing was found.
        return pd.DataFrame(job_listings, columns=self.COLUMNS)

    def _parse_job_card(self, card_html):
        """Extract the listing fields from one card's HTML; missing parts become None."""
        job_card = BeautifulSoup(card_html, 'html.parser')
        title_tag = job_card.find(class_='base-search-card__title')
        company_tag = job_card.find(attrs={'data-tracking-control-name':'public_jobs_jserp-result_job-search-card-subtitle'})
        salary_tag = job_card.find(class_='job-search-card__salary-info')
        link_tag = job_card.find('a', class_='base-card__full-link')

        return {
            'job_title': getattr(title_tag, 'text', None),
            'company_name': getattr(company_tag, 'text', None),
            'salary': getattr(salary_tag, 'text', None),
            'job_link': link_tag.get('href') if link_tag is not None else None,
        }

    def refine_data(self, df):
        """Remove newline padding and surrounding whitespace from the scraped text.

        Returns the frame unchanged when it has no rows or columns.
        """
        if df.empty:
            return df

        df = df.replace('\n ', '', regex=True)
        df['job_title'] = df['job_title'].str.strip()
        df['company_name'] = df['company_name'].str.strip()
        return df

    # TODO: add wait before retry to avoid getting flagged and putting too much load on server

In [11]:
# Run the JobStreet scraper for the search keyword and display the resulting frame.
jobstreet_jobs = JobStreetScrapper.get_jobs(job)
jobstreet_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer/Programmer,"GF Micro Optics Philippines, Inc. (formerly NS...",,https://www.jobstreet.com.ph/job/72915163?type...
1,Software Engineer,FIRSTMAC OPERATIONS CENTER PTY LTD-PHILIPPINE ...,,https://www.jobstreet.com.ph/job/72864136?type...
2,Embedded Software Engineer (Fresh Graduates),Sercomm Philippines Inc.,,https://www.jobstreet.com.ph/job/72842712?type...
3,Software Test Engineer,Innovations Group,"₱70,000 – ₱100,000 per month",https://www.jobstreet.com.ph/job/72871061?type...
4,Lead Software Engineer - Full Stack,Time Access International,,https://www.jobstreet.com.ph/job/72868198?type...
5,Senior Java / ReactJS software engineer,Tangerpay,"₱150,000 per month",https://www.jobstreet.com.ph/job/72847960?type...
6,Pre-Sales Engineer - Software,"Phil-Data Business Systems, Inc.",,https://www.jobstreet.com.ph/job/72807495?type...
7,Software Developer 2,"Crystal Steel Fabricators Phils., Inc",,https://www.jobstreet.com.ph/job/72813656?type...
8,Junior Software Engineer,CRESS MIRAI INC.,"₱25,000 – ₱37,000 per month",https://www.jobstreet.com.ph/job/72863015?type...
9,Java Software Engineer,AVALOQ Philippines Operating Headquarters,,https://www.jobstreet.com.ph/job/72877492?type...


In [12]:
# Run the LinkedIn scraper for the search keyword and display the resulting frame.
linkedin_jobs = LinkedInScrapper.get_jobs(job)
linkedin_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer,Cebu Pacific Air,,https://ph.linkedin.com/jobs/view/software-eng...
1,Software Engineer | Makati,MedGrocer,,https://ph.linkedin.com/jobs/view/software-eng...
2,Web Developer,INQUIRER.net,,https://ph.linkedin.com/jobs/view/web-develope...
3,Web Developer,Talent Disruptors,,https://ph.linkedin.com/jobs/view/web-develope...
4,Junior Web Developer,Liss Solutions,"$14,400 - $21,600",https://ph.linkedin.com/jobs/view/junior-web-d...
5,Junior Software Developer,"REVLV Solutions, Inc.",,https://ph.linkedin.com/jobs/view/junior-softw...
6,Software Developer (Fresh Graduate - Manila),DXC Technology,,https://ph.linkedin.com/jobs/view/software-dev...
7,Software Engineer (Python),"Xurpas, Inc.","PHP720,000 - PHP1,200,000",https://ph.linkedin.com/jobs/view/software-eng...
8,Backend Developer (Permanent WFH),ConnectOS,,https://ph.linkedin.com/jobs/view/backend-deve...
9,Software Engineer (Typescript) - Remote,Whispir,,https://ph.linkedin.com/jobs/view/software-eng...


In [13]:
# Stack the per-site results into one frame with a fresh 0..n-1 index.
jobs_df = pd.concat([jobstreet_jobs, linkedin_jobs], ignore_index=True)

In [14]:
# Display the combined listings from both sites.
jobs_df

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer/Programmer,"GF Micro Optics Philippines, Inc. (formerly NS...",,https://www.jobstreet.com.ph/job/72915163?type...
1,Software Engineer,FIRSTMAC OPERATIONS CENTER PTY LTD-PHILIPPINE ...,,https://www.jobstreet.com.ph/job/72864136?type...
2,Embedded Software Engineer (Fresh Graduates),Sercomm Philippines Inc.,,https://www.jobstreet.com.ph/job/72842712?type...
3,Software Test Engineer,Innovations Group,"₱70,000 – ₱100,000 per month",https://www.jobstreet.com.ph/job/72871061?type...
4,Lead Software Engineer - Full Stack,Time Access International,,https://www.jobstreet.com.ph/job/72868198?type...
5,Senior Java / ReactJS software engineer,Tangerpay,"₱150,000 per month",https://www.jobstreet.com.ph/job/72847960?type...
6,Pre-Sales Engineer - Software,"Phil-Data Business Systems, Inc.",,https://www.jobstreet.com.ph/job/72807495?type...
7,Software Developer 2,"Crystal Steel Fabricators Phils., Inc",,https://www.jobstreet.com.ph/job/72813656?type...
8,Junior Software Engineer,CRESS MIRAI INC.,"₱25,000 – ₱37,000 per month",https://www.jobstreet.com.ph/job/72863015?type...
9,Java Software Engineer,AVALOQ Philippines Operating Headquarters,,https://www.jobstreet.com.ph/job/72877492?type...
