In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

from bs4 import BeautifulSoup
import requests
from parsel import Selector
import re
import polars as pl
import time
import os

In [2]:
def get_names(selector) -> list[str]:
    """Return the names captured from the position-card name elements.

    Elements whose ``class`` attribute contains the
    ``PositionCard_name__iERDX`` token are selected with XPath, and the
    selection's ``.re()`` is used to capture the text between the opening
    and closing ``<span>`` tags.
    """
    name_nodes_xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "PositionCard_name__iERDX", " " ))]'
    name_pattern = r'<span class="PositionCard_name__iERDX">(.*?)</span>'

    name_nodes = selector.xpath(name_nodes_xpath)
    return name_nodes.re(name_pattern)


def get_titles(selector) -> list[str]:
    """Return the role captured from each position card, minus board roles.

    Elements whose ``class`` attribute contains the
    ``PositionCard_role__XNUly`` token are selected with XPath, the text
    between the ``<div>`` tags is captured with ``.re()``, and every
    captured role containing the substring ``"Board Member"`` is dropped.
    """
    role_nodes = selector.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "PositionCard_role__XNUly", " " ))]'
    )
    captured_roles = role_nodes.re(
        r'<div class="PositionCard_role__XNUly">(.*?)</div>'
    )
    return list(filter(lambda role: "Board Member" not in role, captured_roles))


def get_image_results(selector) -> dict[str, str]:
    """Map each name found in an image's ``alt`` text to that image's ``src``.

    Elements whose ``class`` attribute contains the ``hLdhge`` token are
    selected and serialised with ``.getall()``. For each serialised element:

    * ``url`` is the value captured by ``src="([^"]*)"``;
    * ``names`` is the value captured by ``alt="([^"]*)"``, split on
      ``"'s profile picture"``, keeping the first piece.

    Returns:
        A dict built by zipping the ``names`` column with the ``url`` column.
        An element without a matching ``alt`` or ``src`` attribute
        contributes a null (``None``) key or value, and a repeated name
        keeps the last URL seen for it.
    """
    image_markup = selector.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "hLdhge", " " ))]'
    ).getall()

    # Pin the column to a string dtype. When nothing matches, `image_markup`
    # is an empty list and polars has no values to infer a dtype from; the
    # `.str` expressions below need a string column to operate on.
    result = pl.DataFrame(
        {"image_results": image_markup},
        schema={"image_results": pl.String},
    ).with_columns(
        pl.col("image_results").str.extract(r'src="([^"]*)"').alias("url"),
        pl.col("image_results")
        .str.extract(r'alt="([^"]*)"')
        .str.split("'s profile picture")
        .list.first()
        .alias("names"),
    )

    return dict(zip(result["names"], result["url"]))


def get_direct_superior_name(selector) -> str:
    """Return the first text captured from the ``dVvVox`` elements.

    Elements whose ``class`` attribute contains the ``dVvVox`` token are
    selected with XPath and ``.re()`` captures whatever sits between a ``>``
    and the next ``<``. Only the first capture is returned.

    Raises:
        IndexError: if the regex captures nothing.
    """
    superior_nodes = selector.xpath(
        '//*[contains(concat( " ", @class, " " ), concat( " ", "dVvVox", " " ))]'
    )
    captured = superior_nodes.re(r">(.*?)<")
    return captured[0]

In [3]:
def get_text_selector(text: str):
    """Build a ``Selector`` from ``text`` so it can be queried by the helpers."""
    parsed = Selector(text)
    return parsed

In [4]:
def collect_dataset_text(text: str, chart_status: int) -> tuple[pl.DataFrame, str]:
    """Turn one pasted HTML fragment into a table of people plus their superior.

    Args:
        text: HTML markup to parse.
        chart_status: Value stored verbatim in the ``chart_status`` column.

    Returns:
        A ``(frame, direct_superior)`` tuple. ``frame`` has the columns
        ``name``, ``title``, ``image_src`` (the URL looked up by name, or
        ``"no_match"`` when the name has no entry), ``reports_to`` (the
        direct superior, repeated on every row) and ``chart_status``.
    """
    selector = get_text_selector(text)
    direct_superior = get_direct_superior_name(selector)

    people = pl.DataFrame(
        {"name": get_names(selector), "title": get_titles(selector)}
    )

    # Look every name up in the name -> image URL mapping.
    image_src = (
        pl.col("name")
        .replace_strict(get_image_results(selector), default="no_match")
        .alias("image_src")
    )
    reports_to = pl.lit(direct_superior).alias("reports_to")
    status = pl.lit(chart_status).alias("chart_status")

    return people.with_columns(image_src, reports_to, status), direct_superior

In [5]:
def save_collected_data(data: pl.DataFrame, chart_status: int, path: str):
    """Append ``data`` to the CSV file at ``path``.

    The frame is converted to pandas and written with ``mode="a+"`` and no
    index column. The header row is written only when ``chart_status`` is
    not greater than 1.
    """
    write_header = not (chart_status > 1)
    data.to_pandas().to_csv(path, header=write_header, index=False, mode="a+")

In [6]:
def run_data_collection(chart_status: int, path: str):
    """Prompt for an HTML fragment, parse it and append the rows to ``path``.

    Args:
        chart_status: Passed through to the parsing and saving steps.
        path: CSV file the collected rows are appended to.
    """
    # input() already returns a str, so no conversion is needed.
    html_fragment = input("Input html element: ")
    collected, direct_superior = collect_dataset_text(html_fragment, chart_status)
    save_collected_data(collected, chart_status, path)

    summary = f"Success - Data collected for -> chart status: {chart_status} with superior: {direct_superior}"
    print(summary)

In [269]:
# Destination CSV for the collected rows. The original machine-specific
# location stays the default; set the ORG_CHART_CSV environment variable to
# write somewhere else without editing this cell.
# NOTE(review): the file name says "apple" while the login cell targets
# /org/google — confirm this is the intended file.
org_chart_csv = os.getenv(
    "ORG_CHART_CSV",
    "C:/Users/gilnr/OneDrive/Ambiente de Trabalho/Learning/Work/Companies/org_chart_data_apple.csv",
)

# chart_status is edited by hand between runs; values above 1 append without
# writing a header row.
run_data_collection(
    chart_status=5,
    path=org_chart_csv,
)

Success - Data collected for -> chart status: 5 with superior: Lily Peng


## Automate Data Collection

In [None]:
class Browser:
    """Small convenience wrapper around a Selenium-driven Chrome session."""

    def __init__(self, timeout: float = 10.0):
        """Start Chrome with a driver binary resolved by webdriver-manager.

        Args:
            timeout: Maximum number of seconds to wait for an element to
                become available before interacting with it.
        """
        # Seconds to wait for an element before giving up.
        self.timeout = timeout
        # `install()` returns the driver path as a string, while the
        # `service` argument expects a Service object wrapping that path.
        self.browser = webdriver.Chrome(
            service=Service(ChromeDriverManager().install())
        )

    def open_page(self, url: str):
        """Navigate the browser to ``url``."""
        self.browser.get(url)

    def close_browser(self):
        """End the browser session."""
        # quit() ends the whole session and its driver process; close() would
        # only close the current window and leave the driver running.
        self.browser.quit()

    def _wait_for(self, condition):
        """Block until ``condition`` holds or ``self.timeout`` seconds pass."""
        return WebDriverWait(self.browser, self.timeout).until(condition)

    def add_input(self, by: str, value: str, text: str):
        """Type ``text`` into the element located by ``(by, value)``.

        Raises:
            selenium.common.exceptions.TimeoutException: if the element does
                not appear within ``self.timeout`` seconds.
        """
        field = self._wait_for(EC.presence_of_element_located((by, value)))
        field.send_keys(text)
        time.sleep(1)  # short pause between actions, kept from the original pacing

    def click_button(self, by: str, value: str):
        """Click the element located by ``(by, value)``.

        Raises:
            selenium.common.exceptions.TimeoutException: if the element does
                not become clickable within ``self.timeout`` seconds.
        """
        button = self._wait_for(EC.element_to_be_clickable((by, value)))
        button.click()
        time.sleep(1)  # short pause between actions, kept from the original pacing

In [None]:
def login_org_webside(
    browser: "Browser",
    url: str,
):
    """Open the login page at ``url`` and submit the stored credentials.

    The e-mail address and password are read from the ``EMAIL`` and
    ``PASSWORD`` environment variables.

    Args:
        browser: Browser wrapper used to drive the page.
        url: Address of the login page.

    Raises:
        RuntimeError: if ``EMAIL`` or ``PASSWORD`` is unset or empty. This is
            checked before the page is opened, so a missing credential is
            reported by name instead of being passed on as ``None``.
    """
    credentials = {name: os.getenv(name) for name in ("EMAIL", "PASSWORD")}
    missing = [name for name, value in credentials.items() if not value]
    if missing:
        raise RuntimeError(
            "Missing environment variable(s) required for login: "
            + ", ".join(missing)
        )

    browser.open_page(url)
    time.sleep(3)  # fixed pause after navigation, before the form is filled in

    browser.add_input(by=By.NAME, value="email", text=credentials["EMAIL"])
    browser.add_input(by=By.NAME, value="password", text=credentials["PASSWORD"])
    # The button carries four classes. By.CLASS_NAME accepts a single class
    # name only, so the compound value is expressed as a CSS selector that
    # requires all four classes on the same element.
    # NOTE(review): these look like generated class names and may change when
    # the site is rebuilt — confirm the selector is still valid.
    browser.click_button(
        by=By.CSS_SELECTOR, value=".sc-d3ca6972-0.sc-2e5943b7-0.kEyLlR.bQvdov"
    )

In [None]:
# Start a Chrome session, then open the login page and submit the credentials.
browser = Browser()
login_org_webside(
    browser=browser,
    url="https://theorg.com/login?next=/org/google",
)