In [1]:
# imports
from selenium import webdriver

# Starting/Stopping Driver: can specify ports or location but not remote access
from selenium.webdriver.chrome.service import Service as ChromeService

# Manages Binaries needed for WebDriver without installing anything directly
from webdriver_manager.chrome import ChromeDriverManager

# Locator strategies (CSS selector, XPath, tag name, ...) passed to find_element/find_elements, similar in spirit to Beautiful Soup's find_all
from selenium.webdriver.common.by import By

# Try to establish wait times for the page to load
from selenium.webdriver.support.ui import WebDriverWait

# Wait for specific condition based on defined task: web elements, boolean are examples
from selenium.webdriver.support import expected_conditions as EC

# Keyboard key constants (arrow keys, delete, etc.) for sending key presses
from selenium.webdriver.common.keys import Keys

# Exceptions raised when an element is missing, has gone stale, or cannot be clicked
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import ElementClickInterceptedException


from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


import math
import time
import pandas as pd
from datetime import datetime

In [2]:
# Chrome bootstrap.
# webdriver_manager resolves the chromedriver binary and returns its path,
# which is wrapped in a Service object.
webdriver_service = Service(executable_path=ChromeDriverManager().install())

# Default Chrome options; handed to webdriver.Chrome in the final cell.
option = Options()

# Placeholder so the scraping functions can reference the module-level name
# `driver`; it is rebound to a real webdriver.Chrome instance in the final cell.
driver = 0

In [3]:
# Search configuration: URL templates and the job location.
# Placeholders are filled positionally: occupation slug, location, page number
# (and job id for the second template).
pagination_url = "https://www.occ.com.mx/empleos/de-{}/en-{}/?page={}"
pagination_url_id = "https://www.occ.com.mx/empleos/de-{}/en-{}/?page={}&jobid={}"
location = 'Mexico'

# Selenium selectors used to find elements in the page (XPath, CSS selector, class name).

# Absolute XPaths of the element holding the result count; the second one is
# tried when the first matches nothing.
xpath_resultados1 = "/html/body/main/div[4]/aside/div/div[1]/p"
xpath_resultados2 = "/html/body/div[1]/div[1]/div[7]/div/div[1]/div[1]/div[1]/p"
# NOTE(review): the name contains a typo ("resuultados") and is not referenced in the
# visible cells; it is kept as-is because renaming would break any other reference.
xpath_noresuultados = "/html/body/main/div[3]/div/div/div[1]/h1"

# Every <div> whose id starts with "jobcard".
cssSelector_Jobcard = "div[id^=jobcard]"

# Job description container: primary selector and fallback.
# Raw string: the "\:" sequences are CSS escapes for the literal ":" in the class
# names and must reach Selenium unchanged. In a normal string literal "\:" is an
# invalid Python escape sequence and triggers a warning on recent interpreters.
cssSelector_jobinfo = r"body > main > div.sm\:container.sm\:mx-auto.grid.grid-cols-12 > div > div > div.mb-8.break-words"
cssSelector_jobinfo2 = "#jobbody"

# Company-name selectors (not referenced in the visible cells).
cssSelector_JobcardCompany = "div[class=\"fresnel-container fresnel-greaterThanOrEqual-sm\"][class=\"fresnel-greaterThanOrEqual-sm\"]"
className_JobcardCompany2 = "fresnel-container fresnel-greaterThanOrEqual-sm"
className_JobcardCompany3 = "\"fresnel-greaterThanOrEqual-sm\""

# Title <h2> inside a job card; the placeholder is the card's HTML id.
cssSelector_jobcardTitle = "#{} > div > h2"

In [4]:
# Occupations to search.
# Each entry is a lowercase, space-separated occupation name. The final cell
# replaces the spaces with hyphens before the name is placed in the search URL
# and used as the prefix of the output CSV file name.
joblist_infocomm = [
    "ict sales professional",
    "marketing manager",
    "product analyst",
    "product manager",
    "product designer",
    "business intelligence professional",
    "infrastructure engineer",
    "computer systems analyst",
    "software infrastructure architect",
    "web developer",
    "software developer",
    "app developer",
    "user interface designer",
    "software engineer",
    "software architect",
    "software quality assurance analysts and testers",
    "embedded systems engineer",
    "web and digital interface designers",
    "database infrastructure engineer",
    "network architect",
    "database administrator",
    "database architect",
    "network and computer systems administrator",
    "artificial intelligence engineer",
    "machine learning engineer",
    "data science engineer",
    "data analyst",
    "data scientist",
    "artificial intelligence scientist",
    "data architect",
    "ict security specialist",
    "it security operations",
    "information security analyst",
    "product security and it security integration specialist",
    "product risk specialist",
    "security architect",
    "database support engineer",
    "data center operations engineer",
    "support systems engineer",
    "computer network support specialist"
]

In [5]:
def findJobElements(h2,decriptionList,jobTitleList,jobUrlList):
    """Record the title, URL and description of one job card.

    The text of ``h2[0]`` is appended to ``jobTitleList``. The element is then
    clicked and, after a short wait, the browser's current URL and the job
    description are read. The description is looked up with
    ``cssSelector_jobinfo`` first and with ``cssSelector_jobinfo2`` if the
    first selector matches nothing.

    Exactly one entry is appended to each of the three lists per call, so they
    stay index-aligned:

    * several matching description elements are joined with newlines into a
      single string;
    * no matching element yields an empty string;
    * a click that is intercepted, or an element that has gone stale, yields
      the marker ``"error"`` for both the URL and the description.

    Args:
        h2: non-empty list of title elements; only the first one is used.
        decriptionList: list receiving the description text (mutated in place).
        jobTitleList: list receiving the title text (mutated in place).
        jobUrlList: list receiving the job URL (mutated in place).

    Returns:
        The tuple ``(decriptionList, jobTitleList, jobUrlList)``.
    """
    title_element = h2[0]
    jobTitleList.append(title_element.text)
    try:
        title_element.click()
        # Give the page time to react to the click before reading anything
        # from it, including the URL.
        time.sleep(1.3)
        jobUrl = driver.current_url
        descripcion = driver.find_elements(by=By.CSS_SELECTOR, value=cssSelector_jobinfo)
        if not descripcion:
            time.sleep(0.5)
            descripcion = driver.find_elements(by=By.CSS_SELECTOR, value=cssSelector_jobinfo2)
        # One string per job, whatever the number of matched elements
        # ("" when nothing matched).
        descripText = "\n".join(element.text for element in descripcion)
    except (ElementClickInterceptedException, StaleElementReferenceException) as exc:
        print(f"Error: {type(exc).__name__}")
        jobUrl = "error"
        descripText = "error"
    # Appended together, after the try block, so a failure halfway through
    # cannot leave the lists with different lengths.
    jobUrlList.append(jobUrl)
    decriptionList.append(descripText)
    return decriptionList,jobTitleList,jobUrlList

In [6]:
def obtain_descriptions(jobsFoundList,jobsIds):
    """Collect title, description and URL for every job card of a page.

    For each ``(card, id)`` pair the title ``<h2>`` is looked up inside the
    card with ``cssSelector_jobcardTitle`` formatted with the card id. If that
    matches nothing, any ``<h2>`` inside the card is used instead. When a
    title element is found, ``findJobElements`` is called to click it and
    record the job information; otherwise a message is printed and the card
    is skipped.

    Args:
        jobsFoundList: job-card elements.
        jobsIds: HTML ids of the cards, in the same order as ``jobsFoundList``.
            Pairs are formed with ``zip``, so surplus items in the longer
            list are ignored.

    Returns:
        The tuple ``(titles, descriptions, urls)``.
    """
    decriptionList = []
    jobTitleList = []
    jobUrlList = []
    for jobcard, id_html in zip(jobsFoundList, jobsIds):
        # The id is only used to build the selector; it is not parsed any
        # further, so ids without a "-" are fine.
        h2 = jobcard.find_elements(by=By.CSS_SELECTOR, value=cssSelector_jobcardTitle.format(id_html))
        if not h2:
            time.sleep(0.5)
            h2 = jobcard.find_elements(by=By.TAG_NAME, value="h2")
        if h2:
            decriptionList, jobTitleList, jobUrlList = findJobElements(h2, decriptionList, jobTitleList, jobUrlList)
        else:
            print("No Job Title to click")
    return jobTitleList, decriptionList, jobUrlList

In [7]:
def prepareDataFrame(titlesList,urlsList,descriptionsList, occup, output_dir="CSVInfo2V2/"):
    """Write the scraped jobs of one occupation to a CSV file.

    The file is named ``<occup>-<YYYY-MM-DD>.csv`` (for example
    ``web-developer-2024-04-16.csv``) using today's date, and has the columns
    ``Job_Title``, ``Job_Description`` and ``Job_Url``.

    Note the parameter order: URLs come *before* descriptions.

    Args:
        titlesList: job titles.
        urlsList: job URLs.
        descriptionsList: job descriptions.
        occup: occupation name used as the file-name prefix.
        output_dir: directory the CSV is written to; created if missing.

    Raises:
        ValueError: raised by pandas if the three lists differ in length.
    """
    import os  # local import keeps this cell self-contained

    current_date = datetime.now().strftime("%Y-%m-%d")
    if output_dir:
        # to_csv does not create missing directories.
        os.makedirs(output_dir, exist_ok=True)
    csvFileName = os.path.join(output_dir, occup + "-" + current_date + ".csv")
    df = pd.DataFrame({'Job_Title': titlesList, 'Job_Description': descriptionsList, 'Job_Url': urlsList})
    df.to_csv(csvFileName, index=False)
    

In [8]:
def calculate_npags(num_resultados, max_resultados=100, resultados_por_pagina=20):
    """Return how many result pages the scraper should visit.

    The page count is the number of pages needed to show ``num_resultados``
    results, capped at the number of pages needed to show ``max_resultados``.

    Args:
        num_resultados: number of results reported by the search.
        max_resultados: upper bound on the number of results to scrape.
        resultados_por_pagina: results shown per page (defaults to 20).

    Returns:
        A non-negative integer page count (0 when there are no results).

    Raises:
        ValueError: if ``resultados_por_pagina`` is not positive.
    """
    if resultados_por_pagina <= 0:
        raise ValueError("resultados_por_pagina must be a positive number")
    pages_needed = math.ceil(num_resultados / resultados_por_pagina)
    pages_allowed = math.ceil(max_resultados / resultados_por_pagina)
    # Clamp at 0 so a negative input never produces a negative page count.
    return max(0, min(pages_needed, pages_allowed))


In [9]:
def _parse_result_count(text):
    """Return the leading integer of the results text, or 0 if it cannot be read.

    Only the first whitespace-separated token is considered; "," thousands
    separators are removed before conversion.
    """
    tokens = text.split()
    if not tokens:
        return 0
    try:
        return int(tokens[0].replace(",", ""))
    except ValueError:
        print(f"Could not parse result count from: {text!r}")
        return 0


def _collect_job_cards():
    """Return ``(cards, ids)`` for the job cards on the page currently loaded.

    A card whose id cannot be read because the element went stale is dropped
    from both lists, so the two lists always stay index-aligned.
    """
    cards = []
    card_ids = []
    for card in driver.find_elements(by=By.CSS_SELECTOR, value=cssSelector_Jobcard):
        try:
            card_id = card.get_attribute("id")
        except StaleElementReferenceException:
            print("ID No encontrado")
            continue
        cards.append(card)
        card_ids.append(card_id)
    return cards, card_ids


def scrap_job_infos(job_):
    """Scrape the job posts of one occupation and save them to a CSV file.

    The first results page for ``job_`` and the module-level ``location`` is
    opened and the number of results is read from it (two XPaths are tried).
    ``calculate_npags`` turns that number into a page count, capped at 100
    results. Every page is then visited, its job cards are collected and
    ``obtain_descriptions`` extracts title, description and URL of each card.
    The accumulated lists are passed to ``prepareDataFrame``.

    No file is written when the search reports no results, or when a
    single-page search shows no job cards.

    Args:
        job_: occupation as it appears in the URL (words joined by hyphens).

    Returns:
        The number of job descriptions collected.
    """
    driver.get(pagination_url.format(job_, location, 1))
    time.sleep(1.5)
    resultados = driver.find_elements(by=By.XPATH, value=xpath_resultados1)
    if not resultados:
        resultados = driver.find_elements(by=By.XPATH, value=xpath_resultados2)
    nResultados = _parse_result_count(resultados[0].text) if resultados else 0
    nPags = calculate_npags(nResultados, max_resultados=100)

    titles = []
    descriptions = []
    urls = []
    cards_seen = False
    for page in range(1, nPags + 1):
        if page > 1:
            # Page 1 is already loaded from the result-count lookup above.
            driver.get(pagination_url.format(job_, location, page))
            time.sleep(0.5)
        cards, card_ids = _collect_job_cards()
        cards_seen = cards_seen or bool(cards)
        page_titles, page_descriptions, page_urls = obtain_descriptions(cards, card_ids)
        titles.extend(page_titles)
        descriptions.extend(page_descriptions)
        urls.extend(page_urls)

    if nPags > 1 or cards_seen:
        # Keyword arguments: prepareDataFrame takes URLs *before* descriptions,
        # so positional passing in (titles, descriptions, urls) order would
        # swap the two CSV columns. Commas inside descriptions need no special
        # handling because pandas quotes such fields when writing the CSV.
        prepareDataFrame(titlesList=titles, urlsList=urls, descriptionsList=descriptions, occup=job_)
    return len(descriptions)

In [10]:
# Initialise the web driver, scrape every occupation in joblist_infocomm and
# report the total number of job posts extracted.
# The loop runs inside try/finally so the browser and the chromedriver process
# are shut down even if a scrape raises. quit() alone is enough: it closes every
# window and ends the session, so a preceding close() is redundant.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()),options=option)
totalJobs = 0
try:
    for job_ in joblist_infocomm:
        job_ = job_.replace(" ","-")  # the search URL uses hyphen-separated words
        total = scrap_job_infos(job_)
        totalJobs = totalJobs + total
finally:
    driver.quit()
print("\n")
print(f"\nTotal Job posts extracted: {totalJobs}")

Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException
Error: ElementClickInterceptedException


WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: chrome=124.0.6367.61)
Stacktrace:
0   chromedriver                        0x0000000104956934 chromedriver + 4368692
1   chromedriver                        0x000000010494edc8 chromedriver + 4337096
2   chromedriver                        0x0000000104572c04 chromedriver + 289796
3   chromedriver                        0x000000010455d230 chromedriver + 201264
4   chromedriver                        0x000000010455cf5c chromedriver + 200540
5   chromedriver                        0x000000010455ad20 chromedriver + 191776
6   chromedriver                        0x000000010455b6bc chromedriver + 194236
7   chromedriver                        0x000000010457516c chromedriver + 299372
8   chromedriver                        0x00000001045edc08 chromedriver + 793608
9   chromedriver                        0x00000001045ed5ec chromedriver + 792044
10  chromedriver                        0x00000001045a9ab4 chromedriver + 514740
11  chromedriver                        0x00000001045aa50c chromedriver + 517388
12  chromedriver                        0x000000010491ae50 chromedriver + 4124240
13  chromedriver                        0x000000010491fc40 chromedriver + 4144192
14  chromedriver                        0x0000000104900818 chromedriver + 4016152
15  chromedriver                        0x0000000104920570 chromedriver + 4146544
16  chromedriver                        0x00000001048f22cc chromedriver + 3957452
17  chromedriver                        0x000000010493feb8 chromedriver + 4275896
18  chromedriver                        0x0000000104940034 chromedriver + 4276276
19  chromedriver                        0x000000010494ea28 chromedriver + 4336168
20  libsystem_pthread.dylib             0x0000000186d4a034 _pthread_start + 136
21  libsystem_pthread.dylib             0x0000000186d44e3c thread_start + 8
