In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException, StaleElementReferenceException
import pandas as pd
import time
import logging
from datetime import datetime

# Route timestamped INFO-and-above records to the project's log file
_LOGGING_SETTINGS = {
    "filename": '../logs/logs.log',
    "level": logging.INFO,
    "format": '%(asctime)s [%(levelname)s] %(message)s',
    "datefmt": '%Y-%m-%d %H:%M:%S',
}
logging.basicConfig(**_LOGGING_SETTINGS)

def get_filter_options(filter_id):
    """
    Collect the "value" attribute of every option in a dropdown filter.

    Parameters:
    filter_id (str): The HTML id of the <select> element to inspect.

    Returns:
    list: The option values, in page order, excluding any option whose
          value is the literal string "disabled".

    Note: the element is looked up through the module-level `driver`.
    """
    select_box = Select(driver.find_element(By.ID, filter_id))

    values = []
    for entry in select_box.options:
        # Skip placeholder entries flagged with the value "disabled"
        if entry.get_attribute("value") == "disabled":
            continue
        values.append(entry.get_attribute("value"))

    return values

def set_filter(filter_id, value):
    """
    Choose an option, by its "value" attribute, in a dropdown filter.

    Parameters:
    filter_id (str): The HTML id of the <select> element to change.
    value (str): The option value to select.

    Note: the element is looked up through the module-level `driver`.
    """
    Select(driver.find_element(By.ID, filter_id)).select_by_value(value)
    # Fixed one-second pause after changing the selection
    time.sleep(1)

def select_dropdown_option_startswith(dropdown, starting_text, settle_seconds=1):
    """
    Selects the first option in a dropdown whose visible text starts with a specified set of characters.

    Parameters:
    dropdown (Select): The dropdown element as a Selenium Select object.
    starting_text (str): The starting text to look for in each option's visible text.
    settle_seconds (float): How long to pause after a successful selection.
                            Defaults to 1 second.

    Returns:
    bool: True if an option was successfully selected, False otherwise
          (no matching option, or an error occurred while selecting).
    """
    try:
        for option in dropdown.options:
            # Read the text once: after selecting, the element may no longer
            # be safe to query again.
            label = option.text
            if label.startswith(starting_text):
                dropdown.select_by_visible_text(label)
                logging.info(f"Selected option: {label}")
                time.sleep(settle_seconds)
                # Stop at the first match so the "not found" warning below is
                # only logged when nothing was selected.
                return True
    except Exception as e:
        # Best effort: report the failure to the caller instead of raising
        logging.error(f"An error occurred while selecting the option: {e}")
        return False

    # If no match was found
    logging.warning(f"No option found starting with '{starting_text}' in the dropdown.")
    return False

def main():
    """
    Scrape the GPTW ranking tables and save the collected rows to CSV.

    Opens the ranking page in headless Firefox and dismisses the cookie popup
    when possible. For every value of the "filterByYear" dropdown it selects
    the type option starting with "Nacional ou" and the ranking option
    starting with "Latam", then walks the "filterByCut" values (skipping the
    first entry), runs the search and reads the rows of the "filterResult"
    table. The rows are written to '../data/gptw_ranking.csv'.

    The browser is always closed, even when scraping fails part-way; in that
    case the exception propagates and no CSV is written.
    """
    # get_filter_options() and set_filter() look up `driver` as a module-level
    # name, so the browser must be bound globally rather than as a local.
    global driver

    # Define url
    url = 'https://gptw.com.br/ranking/melhores-empresas/'
    # Configure webdriver
    options = webdriver.FirefoxOptions()
    options.add_argument("--headless")
    # Initialize webdriver
    driver = webdriver.Firefox(options=options)

    table_data = []
    try:
        driver.get(url)
        # Define wait (shared by every explicit wait below)
        wait = WebDriverWait(driver, 10)

        # Cookie clicker: best effort, scraping continues if the popup is absent
        try:
            dismiss_cookie_button = wait.until(
                EC.element_to_be_clickable((By.CSS_SELECTOR, "a.cc-btn.cc-DISMISS"))
            )
            dismiss_cookie_button.click()
            logging.info("Cookie popup dismissed.")
        except Exception as e:
            logging.warning(f"Could not dismiss cookie popup: {e}")

        year_options = get_filter_options("filterByYear")
        if year_options:
            set_filter("filterByYear", year_options[0])
        else:
            logging.warning("No year options found; nothing to scrape.")

        for year in year_options:
            set_filter("filterByYear", year)
            dropdown_type = Select(driver.find_element(By.ID, "filterByType"))
            select_dropdown_option_startswith(dropdown_type, "Nacional ou")
            dropdown_ranking = Select(driver.find_element(By.ID, "filterByRanking"))
            select_dropdown_option_startswith(dropdown_ranking, "Latam")

            cut_options = get_filter_options("filterByCut")
            # The first cut entry is skipped (presumably a placeholder — confirm)
            for cut in cut_options[1:]:
                set_filter("filterByCut", cut)
                driver.find_element(By.ID, "filterRanking").click()

                # Wait until the table is visible
                # NOTE(review): the table presumably stays visible after the
                # first search, so this may not guarantee that the rows belong
                # to the current filter combination — confirm.
                wait.until(
                    EC.visibility_of_element_located((By.ID, "filterResult"))
                )

                # Locate tbody within the table
                table_body = driver.find_element(By.CSS_SELECTOR, "#filterResult tbody")

                # Collect data from each row in tbody
                for row in table_body.find_elements(By.TAG_NAME, "tr"):
                    cells = row.find_elements(By.TAG_NAME, "td")
                    if len(cells) >= 5:  # Ensure there are enough cells in the row
                        table_data.append({
                            "posicao": cells[0].text,
                            "empresa": cells[1].text,
                            "funcionarios": cells[2].text,
                            "industria": cells[3].text,
                            "propriedade": cells[4].text,
                            "ano": year,
                            "corte": cut,
                        })
    finally:
        # Close the driver even if scraping raised, so no browser is leaked
        driver.quit()

    # Convert to DataFrame and save to CSV
    df = pd.DataFrame(table_data)
    df.to_csv('../data/gptw_ranking.csv', index=False)

# Run the scraper only when this file is executed directly, not when imported
if __name__ == "__main__": 
    main()

In [10]:
# Preview the "empresa" column converted to lowercase.
# NOTE(review): `df` is not defined at module level in the visible code
# (main() keeps it local) — presumably loaded in another notebook cell; confirm.
df.empresa.str.lower()

0                                                elektro
1                                              transbank
2                                                  gazin
3                     sama s. a. - minerações associadas
4      liderman asesoría, seguridad y vigilancia asev...
                             ...                        
427                                               e-core
428                                              visagio
429                                                gazin
430                fundimisa - fundicao e usinagem ltda.
431                       supera farma laboratorios s.a.
Name: empresa, Length: 432, dtype: object