In [9]:
import pandas as pd
import requests
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [10]:
# Template Class
from abc import ABC, abstractmethod

class JobsScraper(ABC):
    """Template for site-specific job scrapers.

    Concrete subclasses implement ``scrape_site``; ``get_jobs`` runs the
    scrape step followed by the (optionally overridden) ``refine_data`` step.
    """

    # Open design questions:
    #   - Add an "apply filters" method? It would essentially just append
    #     filters to the URL.
    #   - Should appending jobs live here, or in a config?
    #   - Add a helper that translates scraped data into a DataFrame?

    @abstractmethod
    def scrape_site(self, job):
        """Scrape the site for ``job``; every concrete scraper must define this."""

    def refine_data(self, df):
        """Post-process the scraped data.

        The base implementation hands ``df`` back untouched, for scrapers
        that need no clean-up. Concrete classes may override it.
        """
        return df

    @classmethod
    def get_jobs(cls, job):
        """Instantiate the scraper, scrape ``job`` and return the refined result."""
        scraper = cls()
        return scraper.refine_data(scraper.scrape_site(job))

In [11]:
class JobStreetScrapper(JobsScraper):
    """Scrapes job listings from the jobstreet.com.ph search result pages."""

    def scrape_site(self, job, number_of_jobs=10):
        """Collect up to ``number_of_jobs`` listings for ``job``.

        Search result pages are requested one after another until enough
        listings are collected, a page answers with a non-200 status, or a
        page contains no job cards.

        Parameters
        ----------
        job : str
            Search term interpolated into the search URL.
        number_of_jobs : int, default 10
            Maximum number of listings to return.

        Returns
        -------
        pandas.DataFrame
            Columns ``job_title``, ``company_name``, ``salary`` and
            ``job_link``. The columns are present even when nothing is found.
        """
        jobstreet_url = 'https://www.jobstreet.com.ph'
        columns = ['job_title', 'company_name', 'salary', 'job_link']
        job_listings = []
        page = 1

        while len(job_listings) < number_of_jobs:
            url = f'{jobstreet_url}/en/job-search/{job}-jobs?page={page}'
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(url, timeout=30)

            # Do not parse error pages; stop as soon as the site says no.
            if response.status_code != 200:
                if response.status_code == 404:
                    print('No more pages to scrape')
                break

            soup = BeautifulSoup(response.text, 'html.parser')
            job_cards = soup.find_all(attrs={"data-automation": "normalJob"})
            if not job_cards:
                # A 200 page without any cards would otherwise loop forever,
                # because the collected count never grows.
                break

            for job_card in job_cards:
                title_tag = job_card.find(attrs={"data-automation": "jobTitle"})
                # The title element carries both the title and the link;
                # skip a card without it instead of crashing on ['href'].
                if title_tag is None or not title_tag.get('href'):
                    continue
                job_link = title_tag['href']

                # TODO: decide whether a missing company name should also
                # invalidate the card.
                company_name = getattr(job_card.find(attrs={"data-automation": "jobCompany"}), 'text', None)
                salary = getattr(job_card.find(attrs={"data-automation": "jobSalary"}), 'text', None)

                job_listings.append({
                    'job_title': title_tag.text,
                    'company_name': company_name,
                    'salary': salary,
                    'job_link': f'{jobstreet_url}{job_link}'
                })

                if len(job_listings) >= number_of_jobs:
                    break

            page += 1

        # Explicit columns keep the schema stable for an empty result.
        return pd.DataFrame(job_listings, columns=columns)


In [12]:
class LinkedInScrapper(JobsScraper):
    """Scrapes the public LinkedIn job search (location: Philippines) via Selenium."""

    def scrape_site(self, job, number_of_jobs=10, max_scrolls=30):
        """Collect up to ``number_of_jobs`` listings for ``job``.

        A Chrome session opens the search page and pages down until enough
        job cards are present or ``max_scrolls`` attempts are used up. The
        cards' HTML is then parsed with BeautifulSoup after the browser is
        closed.

        Parameters
        ----------
        job : str
            Search keywords interpolated into the search URL.
        number_of_jobs : int, default 10
            Maximum number of listings to return (temporary cut while
            filters are not yet implemented).
        max_scrolls : int, default 30
            Upper bound on PAGE_DOWN attempts, so a search with fewer results
            than requested cannot loop forever.

        Returns
        -------
        pandas.DataFrame
            Columns ``job_title``, ``company_name``, ``salary`` and
            ``job_link``. The columns are present even when nothing is found.
        """
        #TODO: Add validation to check if we're in the expected page
        url = f'https://www.linkedin.com/jobs/search?keywords={job}&location=Philippines&trk=public_jobs_jobs'
        browser = webdriver.Chrome()
        try:
            browser.get(url)
            time.sleep(2)

            elem = browser.find_element(By.TAG_NAME, "body")
            job_cards = elem.find_elements(By.CLASS_NAME, 'base-card')
            scrolls = 0

            # `<` (not `<=`): exactly number_of_jobs cards is already enough.
            while len(job_cards) < number_of_jobs and scrolls < max_scrolls:
                elem.send_keys(Keys.PAGE_DOWN)
                time.sleep(2)
                job_cards = elem.find_elements(By.CLASS_NAME, 'base-card')
                scrolls += 1

            # Grab the HTML while the session is alive; the elements become
            # unusable once the browser quits.
            cards_html = [
                job_card.get_attribute('outerHTML')
                for job_card in job_cards[:number_of_jobs]
            ]
        finally:
            # Always release the browser, even when a lookup above raises.
            browser.quit()

        job_listings = []
        for card_html in cards_html:
            job_card = BeautifulSoup(card_html, 'html.parser')
            title_tag = job_card.find(class_='base-search-card__title')
            link_tag = job_card.find('a', class_='base-card__full-link')
            # Skip cards without a title or link instead of crashing on them.
            if title_tag is None or link_tag is None or not link_tag.get('href'):
                continue

            company_name = getattr(
                job_card.find(attrs={'data-tracking-control-name': 'public_jobs_jserp-result_job-search-card-subtitle'}),
                'text',
                None,
            )
            salary = getattr(job_card.find(class_='job-search-card__salary-info'), 'text', None)

            job_listings.append({
                'job_title': title_tag.text,
                'company_name': company_name,
                'salary': salary,
                'job_link': link_tag['href']
            })

        # Explicit columns keep the schema stable for an empty result.
        return pd.DataFrame(
            job_listings,
            columns=['job_title', 'company_name', 'salary', 'job_link'],
        )

    def refine_data(self, df):
        """Strip the newline/indent residue left in the scraped text.

        Removes every ``'\\n '`` sequence from the frame and trims surrounding
        whitespace from ``job_title`` and ``company_name``. An empty frame is
        returned unchanged.
        """
        if df.empty:
            return df

        df = df.replace('\n ', '', regex=True)
        df['job_title'] = df['job_title'].str.strip()
        df['company_name'] = df['company_name'].str.strip()
        return df
    
    # add wait before retry to avoid getting flagged and putting too much load on server

In [6]:
# Search term handed to each scraper's get_jobs() in the cells below.
job = 'python engineer'

In [7]:
# Scrape JobStreet for the search term, then display the resulting DataFrame.
jobstreet_jobs = JobStreetScrapper.get_jobs(job)
jobstreet_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Cloud Engineer/DevOps (AWS/Python) - Alabang,"Stefanini Philippines, Inc.",,https://www.jobstreet.com.ph/job/72696348?type...
1,"Software Engineer- Python, React or Go (Homeba...",Outsourced Quality Assured Services Inc. (ISO ...,"₱120,000 – ₱140,000 per month",https://www.jobstreet.com.ph/job/72688981?type...
2,Software Engineer - Python (Homebased 1995155578),Outsourced Quality Assured Services Inc. (ISO ...,"₱80,000 – ₱120,000 per month",https://www.jobstreet.com.ph/job/72688201?type...
3,Senior Software Engineer (Python/Java/Golang),TREND MICRO INCORPORATED-PHILIPPINE BRANCH,,https://www.jobstreet.com.ph/job/72375113?type...
4,Senior Full Stack Engineer (Python & React) | ...,Emapta,,https://www.jobstreet.com.ph/job/72659751?type...
5,"Software Engineer (Python, Java, C++ or C#,SQL...","John Clements Consultants, Inc.",,https://www.jobstreet.com.ph/job/72521733?type...
6,Software Automation Engineer (Python/PowerShel...,Connext Global Solutions Inc,,https://www.jobstreet.com.ph/job/71971436?type...
7,Software Engineer (Python Full stack Development),ITRS (PHILS.) INC.,,https://www.jobstreet.com.ph/job/72320921?type...
8,Junior Python Software Engineer l Eastwood Site,MicroSourcing,,https://www.jobstreet.com.ph/job/72404757?type...
9,Senior Software Engineer - Python l Flexible W...,MicroSourcing,,https://www.jobstreet.com.ph/job/72404861?type...


In [13]:
# Scrape LinkedIn for the same search term (opens a Chrome session), then
# display the refined DataFrame.
linkedin_jobs = LinkedInScrapper.get_jobs(job)
linkedin_jobs

Unnamed: 0,job_title,company_name,salary,job_link
0,Software Engineer - Python (Junior-Senior),"VCC Link, Inc.","PHP600,000 - PHP1,200,000",https://ph.linkedin.com/jobs/view/software-eng...
1,Junior Python Data Engineer (FT),Xelure Technologies,"PHP40,000 - PHP75,000",https://ph.linkedin.com/jobs/view/junior-pytho...
2,Python Software Engineer,VISEO ASIA,,https://ph.linkedin.com/jobs/view/python-softw...
3,Python Engineer,Eclaro,,https://ph.linkedin.com/jobs/view/python-engin...
4,Python Software Engineer - WFH,ACCPRO INTERNATIONAL,,https://ph.linkedin.com/jobs/view/python-softw...
5,Sr. Python Software Engineer (Hybrid),Sprout Solutions,,https://ph.linkedin.com/jobs/view/sr-python-so...
6,Senior Python Backend Engineer,ProSource,,https://ph.linkedin.com/jobs/view/senior-pytho...
7,Python/Selenium QA Test Engineer (Remote- Phil...,DomainTools,,https://ph.linkedin.com/jobs/view/python-selen...
8,Python Data Engineer,Eastvantage,,https://ph.linkedin.com/jobs/view/python-data-...
9,Senior Python Software Engineer,Rimes,,https://ph.linkedin.com/jobs/view/senior-pytho...
