 # Webscrapping de vagas do Glassdoor com Selenium

## Importação de bibliotecas

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from IPython.display import clear_output


## Definição da função `get_job_info`

In [3]:
def get_job_info(keyword, location_name, num_pages):
    
    #seta opções de tela para o driver
    options = Options()
    options.add_argument("window-size=1024,768")
    options.add_argument("--window-position=1024,768")
    
    #inicia o driver
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
    driver.get("https://www.glassdoor.co.in/Job/Home/recentActivity.htm")
    
    #insere a palavra chave (ex.: Data Scientist)
    search_input = driver.find_element(By.ID, 'sc.keyword')
    search_input.send_keys(keyword)

    #insere a palavra localização da vaga (ex.: United States)
    location_input = driver.find_element(By.ID, 'sc.location')
    location_input.send_keys(Keys.CONTROL + 'a')
    time.sleep(2)
    location_input.send_keys(location_name)
    time.sleep(5)
    location_input.send_keys(Keys.ENTER)
    time.sleep(2)
    
    #inicializa as listas de atributos das vagas
    company_name = []
    job_title = []
    location = []
    job_description = []
    salary_estimate = []
    company_size = []
    company_type = []
    company_sector = []
    company_industry = []
    company_founded = []
    company_revenue = []

    #atribuindo 1 como índice da primeira página
    current_page = 1     
     
    time.sleep(3)
    
    while current_page <= num_pages:   
        
        done = False
        
        #iterador criado para escrever o xpath do salário
        i = 1

        while not done:
            job_cards = driver.find_elements(By.XPATH, "//article[@id='MainCol']//ul/li[@data-adv-type='GENERAL']")
            for card in job_cards:
                card.click()
                time.sleep(3)

                # Fecha a janela de requisição de login
                try:
                    driver.find_element(By.XPATH, ".//span[@class='SVGInline modal_closeIcon']").click()
                    time.sleep(1)
                except NoSuchElementException:
                    time.sleep(2)
                    pass

                #Expande as descrições em "Show More"
                opened=False
                tries=0

                while not opened and tries!=5:
                    try:
                        driver.find_element(By.XPATH,"//div[@class='css-t3xrds e856ufb4']").click()
                        time.sleep(1)
                        opened=True
                    except NoSuchElementException:
                        card.click()
                        print(str(current_page) + '#ERROR: no such element')
                        tries+=1
                        time.sleep(2)
                
                #Raspagem dos dados 
                try:
                    company_name.append(driver.find_element(By.XPATH, "//div[@class='css-87uc0g e1tk4kwz1']").text)
                except:
                    company_name.append("#N/A")
                    pass

                try:
                    job_title.append(driver.find_element(By.XPATH, "//div[@class='css-1vg6q84 e1tk4kwz4']").text)
                except:
                    job_title.append("#N/A")
                    pass

                try:
                    location.append(driver.find_element(By.XPATH, "//div[@class='css-56kyx5 e1tk4kwz5']").text)
                except:
                    location.append("#N/A")
                    pass

                try:
                    job_description.append(driver.find_element(By.XPATH, "//div[@id='JobDescriptionContainer']").text)
                except:
                    job_description.append("#N/A")
                    pass

                try:
                    salary_estimate.append(driver.find_element(By.XPATH, f"/html/body/div[2]/div/div/div/div/div[2]/section/article/div[1]/ul/li[{i}]/div[2]/div[3]/div[1]/span").text)

                except:
                    salary_estimate.append("#N/A")
                    pass
                
                try:
                    company_size.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Size']//following-sibling::*").text)
                except:
                    company_size.append("#N/A")
                    pass
                
                try:
                    company_type.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Type']//following-sibling::*").text)
                except:
                    company_type.append("#N/A")
                    pass
                    
                try:
                    company_sector.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Sector']//following-sibling::*").text)
                except:
                    company_sector.append("#N/A")
                    pass
                    
                try:
                    company_industry.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Industry']//following-sibling::*").text)
                except:
                    company_industry.append("#N/A")
                    pass
                    
                try:
                    company_founded.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Founded']//following-sibling::*").text)
                except:
                    company_founded.append("#N/A")
                    pass
                    
                try:
                    company_revenue.append(driver.find_element(By.XPATH, "//div[@id='CompanyContainer']//span[text()='Revenue']//following-sibling::*").text)
                except:
                    company_revenue.append("#N/A")
                    pass
                
                #debug simples dos dados raspados
                clear_output(wait=True)
                print('='*60)
                print(f'Página {current_page} - Vaga {i}')
                print('='*60)
                print(f'Job title: {job_title[i-1]}')
                print(f'Company name: {company_name[i-1]}')
                print(f'Salary estimate: {salary_estimate[i-1]}')
                print('\n')
                
                i += 1    
                    
                done = True
                
       # Para a próxima página:        
        if done:
            driver.find_element(By.XPATH, "//span[@alt='next-icon']").click()   
            current_page += 1
            time.sleep(4)

    driver.close()

    #criação do data frame
    df = pd.DataFrame({'company': company_name, 
    'job title': job_title,
    'location': location,
    'job description': job_description,
    'salary estimate': salary_estimate,
    'company_size': company_size,
    'company_type': company_type,
    'company_sector': company_sector,
    'company_industry' : company_industry,
    'company_founded' : company_founded,
    'company_revenue': company_revenue})
    
    df.to_csv(f'{keyword.replace(" ", "_")}_{location_name.replace(" ", "_")}.csv', index=False, mode='w+')

## Disponibilização do dataset em `.csv` 

In [4]:
get_job_info("Data Engineer", 'United States', 25)

Página 16 - Vaga 2
Job title: Data Engineer- Remote
Company name: YT Global Network
5.0
Salary estimate: $90.00 - $120.00 Per hour(Employer Est.)




StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
  (Session info: chrome=111.0.5563.147)
Stacktrace:
Backtrace:
	(No symbol) [0x0035DCE3]
	(No symbol) [0x002F39D1]
	(No symbol) [0x00204DA8]
	(No symbol) [0x002076FB]
	(No symbol) [0x002075D0]
	(No symbol) [0x00207850]
	(No symbol) [0x002316CD]
	(No symbol) [0x00228B4D]
	(No symbol) [0x0024AECC]
	(No symbol) [0x00228726]
	(No symbol) [0x0024B224]
	(No symbol) [0x0025D57C]
	(No symbol) [0x0024ACC6]
	(No symbol) [0x00226F68]
	(No symbol) [0x002280CD]
	GetHandleVerifier [0x005D3832+2506274]
	GetHandleVerifier [0x00609794+2727300]
	GetHandleVerifier [0x0060E36C+2746716]
	GetHandleVerifier [0x00406690+617600]
	(No symbol) [0x002FC712]
	(No symbol) [0x00301FF8]
	(No symbol) [0x003020DB]
	(No symbol) [0x0030C63B]
	BaseThreadInitThunk [0x752E00F9+25]
	RtlGetAppContainerNamedObjectPath [0x77467BBE+286]
	RtlGetAppContainerNamedObjectPath [0x77467B8E+238]
