In [21]:
import csv
import json
import random
from typing import List
from urllib.parse import quote

import pandas
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver import DesiredCapabilities
from selenium.webdriver.common.by import By
from selenium.webdriver.common.proxy import Proxy
from selenium.webdriver.firefox.options import FirefoxProfile
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support.ui import WebDriverWait
from selenium_stealth import stealth
from tqdm import tqdm
from webdriver_manager.firefox import GeckoDriverManager




In [None]:
import sys
import warnings

# Silence every warning category, but only when no explicit warning options
# were supplied to the interpreter (sys.warnoptions is empty in that case).
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")

In [22]:
# Selenium Configuration
# NOTE(review): the proxy endpoint is hard-coded here — confirm it is a
# placeholder and not a real address that should live outside the notebook.
PROXY_HOST = "12.12.12.234"
PROXY_PORT = "1234"

# Firefox profile: proxy settings plus automation/devtools preferences.
profile = FirefoxProfile()
profile.set_preference("network.proxy.type", 1)
profile.set_preference("network.proxy.http", PROXY_HOST)
profile.set_preference("network.proxy.http_port", int(PROXY_PORT))
profile.set_preference("dom.webdriver.enabled", False)
profile.set_preference("useAutomationExtension", False)
profile.set_preference("devtools.jsonview.enabled", False)

desired = DesiredCapabilities.FIREFOX

# Firefox options: notification preferences and command-line arguments.
# NOTE(review): none of the webdriver.Firefox(...) calls visible in this
# notebook pass `options`, so these settings may never reach the browser —
# confirm whether that is intended.
options = Options()
options.set_preference("dom.webnotifications.serviceworker.enabled", True)
options.set_preference("dom.webnotifications.enabled", False)
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--incognito")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--disable-blink-features")
options.add_argument("--disable-blink-features=AutomationControlled")

# Shared geckodriver service used by the scraping helpers defined below.
service = FirefoxService(GeckoDriverManager().install())

In [52]:


def gather_job_category_links() -> List[str]:
    """Collect the category links shown on Upwork's "dev-it" category page.

    Starts a Firefox session with the module-level ``profile`` and ``desired``
    settings, reads the ``href`` of every ``//div[@role='listitem']/a`` anchor
    and passes the result through ``cleansing_job_category_links``.

    Returns:
        The filtered list of links. If anything raises along the way the
        exception is printed and an empty list is returned instead.
    """
    driver = None
    try:
        service = FirefoxService(GeckoDriverManager().install())
        driver = webdriver.Firefox(
            service=service,
            firefox_profile=profile,
            desired_capabilities=desired)

        driver.delete_all_cookies()

        driver.get("https://www.upwork.com/cat/dev-it")

        anchors = driver.find_elements(by=By.XPATH, value="//div[@role='listitem']/a")

        # get_attribute() yields None for an anchor without an href; skip those
        # so one bare anchor cannot make the whole scrape fail.
        hrefs = (anchor.get_attribute("href") for anchor in anchors)
        job_categories: List[str] = [href for href in hrefs if href]

        return cleansing_job_category_links(job_categories)
    except Exception as e:
        print(e)
        return []
    finally:
        # Runs on success and on failure, so an error part-way through does
        # not leave a Firefox/geckodriver session behind. quit() ends the
        # whole session rather than only closing the current window.
        if driver is not None:
            driver.quit()
    
def gather_skill_sets(link: str) -> List[object]:
    """Scrape the skills listed on a single Upwork category page.

    Args:
        link: URL of the page to open.

    Returns:
        A list of ``{"job": <title>, "skill": <skill>}`` dicts, passed through
        ``cleansing_skill_datasets``. The title is the page's
        ``h1[data-qa="title"]`` text with the "Hire the best " prefix removed.
        An empty list is returned when the page has no skills or an empty
        title, or when any exception occurs (the exception is printed).
    """
    driver = None
    try:
        driver = webdriver.Firefox(
            service=service,
            firefox_profile=profile,
            desired_capabilities=desired)

        driver.delete_all_cookies()

        driver.get(link)

        elements = driver.find_elements(by=By.XPATH, value="//ul[@data-qa=\"skills-container\"]/li")
        job_title = driver.find_element(by=By.XPATH, value="//h1[@data-qa=\"title\"]")

        # The title is identical for every skill on the page: read it once.
        title = job_title.text.replace("Hire the best ", "")

        skill_sets: List[object] = []
        # Test the title *text*; comparing the WebElement itself with "" is
        # always unequal and therefore never filtered anything.
        if title:
            skill_sets = [{"job": title, "skill": element.text} for element in elements]

        return cleansing_skill_datasets(skill_sets)
    except Exception as e:
        print(e)
        return []
    finally:
        # Runs on success and on failure, so an error part-way through does
        # not leave a Firefox/geckodriver session behind.
        if driver is not None:
            driver.quit()
    
def gather_skills_from_job_search(job_title: str):
    """Unfinished stub: only builds the job-search URL for ``job_title``.

    No page is fetched yet. On the normal path nothing is returned (``None``);
    if building the URL raises, the exception is printed and ``[]`` is returned.
    """
    try:
        query = job_title.replace(" ", "%20")
        search_url = f"https://www.upwork.com/nx/jobs/search/?q={query}&sort=recency"
    except Exception as err:
        print(err)
        return []
    # TODO: the scraping step that should consume `search_url` is not written.
    return None

def cleansing_skill_datasets(skills: List[object]) -> List[object]:
    """Drop records with an empty skill and de-duplicate the rest by skill.

    Args:
        skills: records that each provide a ``"skill"`` key.

    Returns:
        One record per distinct non-empty ``"skill"`` value, ordered by the
        first appearance of each skill. When a skill occurs more than once,
        the last record seen for it is the one that is kept.
    """
    non_empty = [item for item in skills if item["skill"]]
    # Keying a dict by skill collapses duplicates: a later record overwrites
    # the value while the key keeps its original position. The filtered list
    # is used here so that empty skills really are excluded from the result.
    unique_by_skill = {item["skill"]: item for item in non_empty}

    # Unpacking avoids relying on the `list` name, which notebook cells can rebind.
    return [*unique_by_skill.values()]

def cleansing_job_category_links(job_link: List[str]) -> List[str]:
    """Keep only the links that do not contain the substring "services".

    Args:
        job_link: candidate links; entries may be ``None`` or empty.

    Returns:
        The links without "services" in them, in their original order.
        ``None`` and empty entries are dropped rather than raising
        ``TypeError`` on the substring test.
    """
    return [x for x in job_link if x and "services" not in x]
    


In [None]:
# Executing: scrape every category page and pool the skill records.
job_categories = gather_job_category_links()
range_job_list = len(job_categories)

skill_sets: List[object] = []
for category_link in tqdm(job_categories, total=range_job_list):
    skill_sets.extend(gather_skill_sets(category_link))

In [None]:
# Persist the scraped records as JSON and as CSV.
# NOTE: the "extract/jobs-category" directory has to exist already; open()
# does not create it.
with open("extract/jobs-category/skill_extraction.json", "w", encoding="utf-8") as f:
    json.dump(skill_sets, f, ensure_ascii=False)

# newline="" is what the csv module expects for files it writes to (without it
# extra blank rows can appear on Windows). utf-8 matches the JSON export, so
# non-ASCII skill names do not depend on the platform's default encoding.
with open("extract/jobs-category/skill_extraction.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Job", "Skill"])
    writer.writerows([item["job"], item["skill"]] for item in skill_sets)

In [None]:
    
import math

def overall_page_number(job_title: str, jobs_per_page: int = 10) -> int:
    """Return how many result pages an Upwork job search spans.

    Opens the first result page for ``job_title``, reads the number shown in
    ``span[data-test="jobs-count"]/strong`` and divides it by the page size,
    rounding up.

    Args:
        job_title: search phrase; it is percent-encoded before being placed in
            the URL so characters such as ``#``, ``&`` or ``+`` stay part of
            the query.
        jobs_per_page: number of jobs assumed per result page (default 10).

    Returns:
        The page count, rounded up.

    Raises:
        Whatever Selenium raises when the page or the counter element cannot
        be read, and ``ValueError`` if the counter text is not an integer.
        The browser is shut down in either case.
    """
    title = quote(job_title, safe="")
    URI = f"https://www.upwork.com/nx/jobs/search/?q={title}&sort=recency&page=1"
    service = FirefoxService(GeckoDriverManager().install())
    driver = webdriver.Firefox(
        service=service,
        firefox_profile=profile,
        desired_capabilities=desired)

    try:
        driver.delete_all_cookies()

        driver.get(URI)

        job_numbers = driver.find_element(by=By.XPATH, value="//span[@data-test=\"jobs-count\"]/strong")
        # The counter is rendered with thousands separators, e.g. "1,234".
        job_count = int(job_numbers.text.replace(",", ""))
    finally:
        # Shut the browser down even when the lookup or the parsing fails.
        driver.quit()

    return math.ceil(job_count / jobs_per_page)

In [62]:
import time
def gather_skill_from_job_title(job_title: str, page_number: int, wait_seconds: float = 6) -> List[object]:
    """Collect the distinct skill badges shown in Upwork search results.

    Args:
        job_title: search phrase; it is percent-encoded before being placed in
            the URL.
        page_number: how many result pages to visit, starting from page 1.
        wait_seconds: pause after loading each page before the badges are read
            (default 6).

    Returns:
        One ``{"job": job_title, "skill": <badge text>}`` dict per distinct
        badge text, in the order the skills were first seen.

    Raises:
        Whatever Selenium raises while loading or reading a page; the browser
        opened for that page is shut down before the exception propagates.
    """
    title = quote(job_title, safe="")
    # Dict keys de-duplicate while keeping first-seen order, which makes the
    # result order deterministic (a set would iterate in arbitrary order).
    seen_skills = {}

    # overall_page_number requests the first result page as "page=1", so the
    # pages are walked as 1..page_number inclusive, not 0..page_number-1.
    for page in tqdm(range(1, page_number + 1)):
        URI = f"https://www.upwork.com/nx/jobs/search/?q={title}&sort=recency&page={page}"

        # A fresh browser is started for every page, as before.
        driver = webdriver.Firefox(
            service=service,
            firefox_profile=profile,
            desired_capabilities=desired)
        try:
            driver.delete_all_cookies()

            driver.get(URI)
            # Fixed pause before reading — presumably to let the results render.
            time.sleep(wait_seconds)
            badges = driver.find_elements(by=By.XPATH, value="//div[@class=\"up-skill-wrapper\"]/a[@class=\"up-skill-badge text-muted\"]")
            for badge in badges:
                seen_skills.setdefault(badge.text, None)
        finally:
            # Shut this page's browser down even when reading it fails.
            driver.quit()

    return [{"job": job_title, "skill": skill} for skill in seen_skills]
        


In [65]:
job_title = "QA Tester"
list = gather_skill_from_job_title(job_title= job_title, page_number= 2)



100%|██████████| 2/2 [00:27<00:00, 13.85s/it]


In [64]:
list

[{'job': 'QA Tester', 'skill': 'Jira'},
 {'job': 'QA Tester', 'skill': 'Software Quality Assurance'},
 {'job': 'QA Tester', 'skill': 'Performance Testing'},
 {'job': 'QA Tester', 'skill': 'Risk Analysis'},
 {'job': 'QA Tester', 'skill': 'User Acceptance Testing'},
 {'job': 'QA Tester', 'skill': 'Workday'},
 {'job': 'QA Tester', 'skill': 'Manual Testing'},
 {'job': 'QA Tester', 'skill': 'Mobile App Testing'},
 {'job': 'QA Tester', 'skill': 'Automated Testing'},
 {'job': 'QA Tester', 'skill': 'Usability Testing'},
 {'job': 'QA Tester', 'skill': 'End-to-End Testing'},
 {'job': 'QA Tester', 'skill': 'Web Testing'},
 {'job': 'QA Tester', 'skill': 'Functional Testing'},
 {'job': 'QA Tester', 'skill': 'JUnit'},
 {'job': 'QA Tester', 'skill': 'Apache JMeter'},
 {'job': 'QA Tester', 'skill': 'Test Case Design'},
 {'job': 'QA Tester', 'skill': 'Cucumber'},
 {'job': 'QA Tester', 'skill': 'Testing'},
 {'job': 'QA Tester', 'skill': 'Software Testing'},
 {'job': 'QA Tester', 'skill': 'Mobile App Dev