# Selenium Test

In [None]:
# IMPORTS
from collections.abc import Generator, Iterator
from datetime import datetime as dt
import difflib
from io import BytesIO
from math import ceil
import numpy as np
from pathlib import Path
from PIL import Image
import requests
from time import sleep, time
from urllib.parse import urlparse, parse_qs, unquote

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import (
    StaleElementReferenceException,
    TimeoutException,
)

## 1. Initialization

In [None]:
# CONSTANTS
# Base DuckDuckGo search endpoint; the query text is appended directly after "q=".
SEARCH_URL: str = "https://duckduckgo.com/?q="


# Shared NumPy random generator (unseeded, so draws differ between runs).
NORMAL_GENERATOR: np.random.Generator = np.random.default_rng()

# Default mean, standard deviation and sample count of the normal
# distribution sampled by _get_random().
NORMAL_MEAN: float = 0.5
NORMAL_STD: float = 0.3
NORMAL_SIZE: int = 1


# WebElement search dictionary where key is a title pointing to a vector with:
# [By name, By value, #empirical possible, #actual possible]
# The search loops use index 3 as the minimum number of unique elements to
# collect and fall back to index 2 when index 3 is 0.
# NOTE(review): "ol_1" and "ol_2" use the same locator (tag name "ol") and the
# same index-3 value, so they produce the same search — confirm this is intended.
ELEMENT_DICT: dict[str, list[str | int]] = {
    # "html": [By.TAG_NAME, "html", 1, 1],
    "body": [By.TAG_NAME, "body", 1, 1],
    "web": [By.ID, "web_content_wrapper", 1, 1],
    # "xpath_1": [By.XPATH, "//*[@id=\"react-layout\"]", 1, 1],
    "xpath": [By.XPATH, "//*", 5188, 5188],
    "section": [By.TAG_NAME, "section", 21, 6],
    "ol_1": [By.TAG_NAME, "ol", 47, 9],
    "ol_2": [By.TAG_NAME, "ol", 126, 9],
    "figure": [By.TAG_NAME, "figure", 117, 100],
    # "xpath_2": [By.XPATH, "/html/body/*", 900, 0],
    # "img": [By.TAG_NAME, "img", 28800, 900],
}


# Elapsed-time checkpoints in seconds, from 0 to 432000 s (120 hours) inclusive:
# one series every hour (3600 s) and one every ten minutes (600 s). Each series
# has a parallel list of "already reached" flags, all initially False.
CHECKPOINT_1H = range(0, 432001, 3600)
FLAG_CHECKPOINT_1H = [False] * len(CHECKPOINT_1H)
CHECKPOINT_10M = range(0, 432001, 600)
FLAG_CHECKPOINTS_10M = [False] * len(CHECKPOINT_10M)

In [None]:
# HELPERS
def _get_search_url(query: str) -> str:
    """
    Build the DuckDuckGo image-search URL for `query`.

    `query` is inserted verbatim (no URL-encoding is applied here), followed
    by the "&iar=images" parameter.
    """
    image_results_suffix = "&iar=images"
    return SEARCH_URL + query + image_results_suffix


def _get_random(
    *,
    mean: float = NORMAL_MEAN,
    std: float = NORMAL_STD,
    size: int = NORMAL_SIZE,
) -> float:
    """
    Draw a random, strictly positive jitter value from a normal distribution.

    Args:
        mean: Mean of the normal distribution.
        std: Standard deviation of the normal distribution.
        size: Number of samples requested from the generator. Only the first
            sample is used; values below 1 are treated as 1 so there is
            always a sample to return.

    Returns:
        The first sample as a float, raised to 0.0011 if it is smaller, so
        the result is never zero or negative.
    """
    # NumPy needs an integer sample count; at least one sample must exist
    # because the first one is indexed below.
    sample_count = max(int(size), 1)
    samples = NORMAL_GENERATOR.normal(loc=mean, scale=std, size=sample_count)
    return max(float(samples[0]), 0.0011)


def _log(t0: int, by_tag: int, by_val: int, max_val: int, curr_n: int, attempt_n: int, total_n: int, checkp: str) -> None:
    tm = time()
    stp = f"[{str(dt.now()).split(".")[0]}]"
    msg = (
        f"{stp} Attempt {attempt_n}/{total_n} in: {by_tag}={by_val} with at least {max_val} elements.\n"
        f"{checkp} Total Time in {by_tag}={by_val} is {tm - t0:.2f}s | Current Elements: {curr_n}.\n"
        f"{"-"*110}\n"
    )
    print(msg, end="")


def _safeproof_click(driver, element_or_list, timeout=10):
    # 1. Determine if it's a list or single element
    target = element_or_list[0] if isinstance(element_or_list, list) else element_or_list
    
    try:
        # 2. Wait until the element is actually clickable
        wait = WebDriverWait(driver, timeout)
        wait.until(EC.element_to_be_clickable(target))
        
        # 3. Try standard Selenium click (mimics real user)
        target.click()
    except Exception:
        # 4. Fallback: JavaScript click (bypasses overlays/visibility issues)
        driver.execute_script("arguments[0].click();", target)


def _click_on_xpath(xpath: str) -> None:
    """
    Click every element matching `xpath`, using the notebook-level `driver`.

    Clicking is best-effort: an element whose click raises is skipped and the
    remaining matches are still tried. Nothing happens when nothing matches.
    """
    for candidate in driver.find_elements(By.XPATH, xpath):
        try:
            candidate.click()
        except Exception:
            # Ignore this element and move on to the next match.
            continue

    
# Remove empty elements and keep only unique elements
def _unique_remove_empty(
    element_list: list[WebElement],
) -> list[WebElement]:
    """
    - Removes empty inner lists
    - Removes stale/broken WebElements
    - Deduplicates WebElements globally (by element.id)
    - Preserves order and grouping
    """

    seen_ids: set[str] = set()
    cleaned: list[WebElement] = []

    try:
        for el in element_list:
            if el is None:
                continue

            try:
                el_id = el.id  # touching .id validates the element
            except StaleElementReferenceException as ein:
                continue

            if el_id in seen_ids:
                continue

            seen_ids.add(el_id)
            cleaned.append(el)

    except Exception as eout:
        cleaned = {x for x in element_list if x}
        cleaned = list(cleaned)

    return cleaned


def _remove_translate(web_element: str, base_url: str) -> str:
    """
    Extracts and decodes the real image URL from a DuckDuckGo image wrapper URL.
    - Ignores exact base_url matching (robust to variations)
    - Decodes percent-encoded URLs
    """
    
    # 1. Parse and query
    parsed = urlparse(web_element)
    query = parse_qs(parsed.query)
    
    # 2. DuckDuckGo image links usually store the real URL in `iai`
    if "iai" not in query:
    
        # 2.1.1. Normalize spaces encoded as '+'
        normalized = web_element.replace("+", " ")
    
        # 2.1.2. Attempt direct removal if web_element starts with base_url
        if normalized.startswith(base_url):
            remainder = normalized[len(base_url):]
        else:
            # 2.1.3. Fuzzy match (threshold ≈ 90%)
            ratio = difflib.SequenceMatcher(None, normalized[:len(base_url)], base_url).ratio()
            if ratio >= 0.90:
                remainder = normalized[len(base_url):]
            else:
                # 2.1.4. If no good match, fall back:
                #    Find the first encoded "http" that is not the main one.
                idx = normalized.find("http", 5)  # skip the initial "http"
                if idx == -1:
                    return ""  # nothing usable found
                remainder = normalized[idx:]
    
        # 2.1.5. Strip leading junk (&, ?, etc.)
        while remainder.startswith(("&", "?", "=")):
            remainder = remainder[1:]
    
        # 2.1.6. Decode URL-encoded parts
        decoded_url = unquote(remainder)
    
    else:
    
        # 2.2.1. Parse_qs returns lists
        encoded_url = query["iai"][0]
    
        # 2.2.2. Decode percent-encoding
        decoded_url = unquote(encoded_url)
    
    # 3. Remove trailing ?
    if "?" in decoded_url[-20:]:
        decoded_url = "?".join(decoded_url.split("?")[:-1])
    
    return decoded_url

In [None]:
# INPUTS

# Search terms, already joined with "+" so they can be placed in the URL as-is.
query: str = "Olafur+Dari+Olafsson"
# Absolute, machine-specific path; not referenced in the visible part of the notebook.
output_err_file: Path = Path("/Users/egg/Desktop/pics/notebooks/stderr.txt")

# Chrome options (left at their defaults) and a driver service whose
# executable path comes from ChromeDriverManager().install().
options: Options = Options()
service: Service = Service(ChromeDriverManager().install())

# Browser window size as (width, height); the height is also used as the
# scroll distance in the scrolling cell.
nav_window_size: tuple[int, int] = (1920, 1080)
# Base number of seconds added to the WebDriverWait timeouts.
waiter_total_time_base: int = 20
# Short and long base pauses in seconds; a random jitter from _get_random()
# is added each time they are used.
sleep_min_base: float = 0.1
sleep_max_base: int = 2
# Number of outer and inner retry passes in the search loops.
max_global_tries: int = 2
max_local_tries: int = 5

# Per-run checkpoint flag lists, one flag per checkpoint.
# NOTE(review): the first length comes from FLAG_CHECKPOINT_1H and the second
# from CHECKPOINT_10M; both give the intended sizes but the style is mixed.
flag_checkpoint_1h = [False] * len(FLAG_CHECKPOINT_1H)
flag_checkpoints_10m = [False] * len(CHECKPOINT_10M)

In [None]:
# INITIALIZE

# Input
# Full image-search URL for the configured query.
search_url = _get_search_url(query)

# Driver Main
driver = webdriver.Chrome(service=service, options=options)

# Create window to maximize chance
# NOTE(review): set_window_size is called right after maximize_window, so the
# explicit size is the one requested last — confirm maximizing is still needed.
driver.maximize_window()
driver.set_window_size(*nav_window_size)

# Navigate to the URL, then reload it once, pausing one second after each step.
driver.get(search_url)
sleep(1)
driver.refresh()
sleep(1)

print(f"Search URL = {search_url}")

In [None]:
# Scroll the page down by 100 px. ActionChains only queues actions; nothing
# reaches the browser until perform() is called.
actions = ActionChains(driver, duration=100)
actions.scroll_by_amount(delta_x=0, delta_y=100).perform()
sleep(sleep_min_base + _get_random())

In [None]:


# Queue 1000 small downward scroll steps and run them as one action chain.
# Each step is ceil(1 + jitter) pixels; the jitter is at least 0.0011, so
# every step is at least 2 px.
actions = ActionChains(driver, duration=150)
for i in range(0, 1000):
    actions.scroll_by_amount(delta_x=0, delta_y = ceil(1 + _get_random()))
actions.perform()

## 2. Pre-Switches (clicks)

In [None]:
# Switch on Location Search

def click_location_button() -> None:
    """
    Turn on the first switch on the page that is currently off.

    Looks up every element with role="switch" through the notebook-level
    `driver`. The first one whose `ariaChecked` property is "false" and whose
    click succeeds ends the search. A switch whose click raises is skipped
    without pausing; any other switch is followed by a short random pause.
    """
    for switch in driver.find_elements(By.CSS_SELECTOR, "[role='switch']"):
        if switch.get_property("ariaChecked") == "false":
            try:
                switch.click()
            except Exception:
                # Click failed: try the next switch straight away.
                continue
            # Click succeeded: stop searching.
            break

        sleep(sleep_min_base + _get_random())


click_location_button()

In [None]:
# Select Location to USA and Image Size to Large

def select_location_dropdown(
    *,
    target_text_list: list[str] = [
        "US (English)",
        "US",
        "USA",
        "Estados Unidos (inglês)",
        "Estados Unidos",
        "United States (english)",
        "United States"
    ]
) -> None:
    """
    Pick one option in each of two navigation dropdowns via fixed XPaths.

    For each dropdown the function pauses, clicks the dropdown opener, pauses
    briefly, clicks one hard-coded option, and pauses again. According to the
    original comments, the first pair changes the locale to US (English) and
    the second sets the image size to Large.

    Args:
        target_text_list: Accepted for interface compatibility; this
            implementation does not read it.
    """
    dropdown_steps = (
        # Locale dropdown opener -> locale option
        (
            '//*[@id="react-layout"]/div/div[2]/div/nav/div/ul/li[1]/div/div[1]',
            '//*[@id="react-layout"]/div/div[2]/div/nav/div/ul/li[1]/div/div[2]/div[2]/div[63]/div/div/div/span[2]',
        ),
        # Image-size dropdown opener -> size option
        (
            '//*[@id="react-layout"]/div/div[2]/div/nav/div/ul/li[5]/div/div[1]',
            '//*[@id="react-layout"]/div/div[2]/div/nav/div/ul/li[5]/div/div[2]/div/div[4]/div',
        ),
    )

    for opener_xpath, option_xpath in dropdown_steps:
        sleep(sleep_max_base + _get_random())
        _click_on_xpath(opener_xpath)
        sleep(sleep_min_base + _get_random())
        _click_on_xpath(option_xpath)
        sleep(sleep_max_base + _get_random())


select_location_dropdown()

## 3. Search

In [None]:
# HTML

# Find all <html> elements
# Shared explicit wait; its timeout is the long base pause plus the base wait
# time plus a random jitter, in seconds. The search loop below reuses it.
waiter = WebDriverWait(driver, sleep_max_base+waiter_total_time_base+_get_random())
# Seed list for the search loop: all elements with tag name "html".
curr_element_list: list[WebElement] = waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, "html")))

# Print the number of <html> elements found
print(f"Found {len(curr_element_list)} html elements")

In [None]:
# Main Loop

# For each ELEMENT_DICT entry, repeatedly look up all elements matching its
# locator through `waiter` and accumulate the unique ones in
# `curr_element_list` until at least `minimum_unique_elements` are collected
# or all tries are used. The result of one entry becomes `prev_element_list`
# for the next; after the last entry `curr_element_list` holds its matches.
# NOTE(review): the lookup goes through `waiter` (the driver), not through
# `web_element`, so every pass over `prev_element_list` repeats the same search.
for counter, (element_key, element_value) in enumerate(ELEMENT_DICT.items(), start=1):

    t0 = time()
    flag_condition_met: bool = False
    
    by_tag = element_value[0]
    by_val = element_value[1]
    max_val = element_value[3]
 
    # Fall back to the third entry when the fourth is 0.
    if max_val == 0:
        max_val = element_value[2]

    # if counter%10 == 0:
    #     print(f"{"#"*120}")
    #     print(f"[STARTED] by_tag = {by_tag} | by_val = {by_val} | max_val = {max_val}")
    
    # NOTE(review): when the previous entry collected nothing this branch is
    # skipped, so `prev_element_list` and `minimum_unique_elements` keep their
    # values from the earlier entry — confirm this is intended.
    if curr_element_list:
        prev_element_list: list[WebElement] = curr_element_list.copy()
        curr_element_list: list[WebElement] = []
        flag_condition_met: bool = False
        # Targets of 3 or fewer are relaxed to "at least one element".
        minimum_unique_elements: int = max_val if max_val > 3 else 1
        # Reset the checkpoint flags in place.
        flag_checkpoint_1h[:] = [False] * len(flag_checkpoint_1h)
        flag_checkpoints_10m[:] = [False] * len(flag_checkpoints_10m)

    # _log(t0, by_tag, by_val, max_val, len(curr_element_list), counter, max_global_tries, "[Before-Loop]")
    
    for glonen in range(max_global_tries):

        if flag_condition_met:
            break

        for web_element in prev_element_list:

            if flag_condition_met:
                break

            counter_hour: int = 0
            counter_minute: int = 0
            total_time_elapsed: int = 0
            section_element_list: list[WebElement] = []
            # `glonen` is zero-based, so the first attempt prints as 0.
            print(f"Loading Attempt {glonen} of {max_global_tries}: .", end="")
            for _ in range(max_local_tries):

                if len(curr_element_list) >= minimum_unique_elements:
                    flag_condition_met = True
                    break
                
                # A timeout or stale reference leaves `section_element_list`
                # empty, which counts as a failed try below.
                try:
                    section_element_list = waiter.until(EC.presence_of_all_elements_located((by_tag, by_val)))

                except StaleElementReferenceException as sere:
                    pass
                except TimeoutException as toe:
                    pass

                if not section_element_list:
                    continue

                # Merge the new matches and drop duplicates.
                curr_element_list.extend(section_element_list)
                curr_element_list: list[WebElement] = _unique_remove_empty(curr_element_list)
                # sleep(sleep_min_base+_get_random())

                if section_element_list:
                    section_element_list: list[WebElement] = []

                if len(curr_element_list) >= minimum_unique_elements:
                    flag_condition_met = True
                    break
                    
                t1 = time()
                # NOTE(review): this adds the full time since t0 on every try,
                # so the total grows faster than wall-clock time — confirm
                # whether a plain assignment was intended.
                total_time_elapsed += (t1-t0)

                # The checkpoint flags below only feed the commented-out
                # progress prints; they have no other visible effect.
                if not flag_checkpoints_10m[counter_minute] and total_time_elapsed >= CHECKPOINT_10M[counter_minute]:
                    # print(".", end="")
                    flag_checkpoints_10m[counter_minute] = True
                    counter_minute += 1

                    
                if not flag_checkpoint_1h[counter_hour] and total_time_elapsed >= CHECKPOINT_1H[counter_hour]:
                    # print(f"|{(total_time_elapsed/3600):.2f}|", end="")
                    flag_checkpoint_1h[counter_hour] = True
                    counter_hour += 1

                sleep(sleep_min_base + _get_random())
            
            # print(".\n", end="")
            # _log(t0, by_tag, by_val, max_val, len(curr_element_list), counter, max_local_tries, "[After-Loop]")
            if flag_condition_met or len(curr_element_list) >= minimum_unique_elements:
                flag_condition_met = True
                break
        
        # print(len(curr_element_list), minimum_unique_elements)
        if flag_condition_met or len(curr_element_list) >= minimum_unique_elements:
            flag_condition_met = True
            break

        # _log(t0, by_tag, by_val, max_val, len(curr_element_list), counter, max_global_tries, "[End-Loop]")

In [None]:
# Creation of URI List

import difflib
from urllib.parse import urlparse, parse_qs, unquote

# Click each collected element and record the `baseURI` property it reports
# afterwards. Elements that cannot be clicked or read are skipped.
uri_list: list[str] = []
for web_element in curr_element_list:
    # Reset per element so a failure never re-appends the previous URI.
    uri = None
    try:
        web_element.click()
        uri = web_element.get_property("baseURI")
    except Exception as e:
        # Best-effort: clicks can fail for many reasons (stale, covered or
        # non-interactable elements); report the failure and move on.
        print(f"Skipping element: {type(e).__name__}")
    sleep(sleep_min_base + _get_random())
    if uri:
        uri_list.append(uri)

In [None]:
# Get links

# Turn every recorded wrapper URI into the image URL it embeds
# (entries for which nothing can be extracted come back as "").
image_link_list: list[str] = []

for uri in uri_list:
    image_link = _remove_translate(uri, search_url)
    image_link_list.append(image_link)

print(f"Image Link list has {len(image_link_list)} elements")
# Show at most the first ten links, one per line.
# The nested double quotes inside this f-string require Python 3.12+.
print(f"# Sample Head:\n{"\n".join(image_link_list[:min(len(image_link_list), 10)])}")

## 4. Scrolling

In [None]:
# Scroll down for more images

sleep(sleep_min_base + _get_random())
# driver.refresh()
sleep(sleep_min_base + _get_random())

# Scroll down two window heights, pausing after each step. A fresh chain is
# built for every step and performed immediately: ActionChains only queues
# actions, so without perform() nothing is sent to the browser.
for _ in range(2):
    actions = ActionChains(driver)
    actions.scroll_by_amount(delta_x=0, delta_y=nav_window_size[1]).perform()
    sleep(sleep_min_base + _get_random())


## 5. Novel Search

In [None]:
# HTML

# Find all <html> elements
# Rebuild the shared explicit wait (long base pause + base wait time + random
# jitter, in seconds) and re-seed the search with the "html" elements.
waiter = WebDriverWait(driver, sleep_max_base+waiter_total_time_base+_get_random())
curr_element_list: list[WebElement] = waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, "html")))

# Print the number of <html> elements found
print(f"Found {len(curr_element_list)} html elements")

In [None]:
# Second pass of the element search, run after scrolling. It has the same
# structure as the main loop, with three visible differences: the previous
# elements are walked in reverse, there is no early exit at the top of the
# inner loops, and reaching the target inside the try loop uses `continue`
# rather than `break`.
# NOTE(review): as in the main loop, `web_element` is not used by the lookup.
for counter, (element_key, element_value) in enumerate(ELEMENT_DICT.items(), start=1):

    t0 = time()
    flag_condition_met: bool = False
    
    by_tag = element_value[0]
    by_val = element_value[1]
    max_val = element_value[3]
 
    # Fall back to the third entry when the fourth is 0.
    if max_val == 0:
        max_val = element_value[2]

    # if counter%10 == 0:
    #     print(f"{"#"*120}")
    #     print(f"[STARTED] by_tag = {by_tag} | by_val = {by_val} | max_val = {max_val}")
    
    # NOTE(review): skipped when the previous entry collected nothing, in
    # which case the values from the earlier entry are reused — confirm.
    if curr_element_list:
        prev_element_list: list[WebElement] = curr_element_list.copy()
        curr_element_list: list[WebElement] = []
        flag_condition_met: bool = False
        # Targets of 3 or fewer are relaxed to "at least one element".
        minimum_unique_elements: int = max_val if max_val > 3 else 1
        flag_checkpoint_1h[:] = [False] * len(flag_checkpoint_1h)
        flag_checkpoints_10m[:] = [False] * len(flag_checkpoints_10m)

    # _log(t0, by_tag, by_val, max_val, len(curr_element_list), counter, max_global_tries, "[Before-Loop]")
    
    for glonen in range(max_global_tries):

        if flag_condition_met:
            break

        for web_element in prev_element_list[::-1]:

            counter_hour: int = 0
            counter_minute: int = 0
            total_time_elapsed: int = 0
            section_element_list: list[WebElement] = []
            # `glonen` is zero-based, so the first attempt prints as 0.
            print(f"Loading Attempt {glonen} of {max_global_tries}: .", end="")
            for _ in range(max_local_tries):
                
                # A timeout or stale reference leaves `section_element_list`
                # empty, which counts as a failed try below.
                try:
                    section_element_list = waiter.until(EC.presence_of_all_elements_located((by_tag, by_val)))

                except StaleElementReferenceException as sere:
                    pass
                except TimeoutException as toe:
                    pass

                if not section_element_list:
                    continue

                # Merge the new matches and drop duplicates.
                curr_element_list.extend(section_element_list)
                curr_element_list: list[WebElement] = _unique_remove_empty(curr_element_list)
                # sleep(sleep_min_base+_get_random())

                if section_element_list:
                    section_element_list: list[WebElement] = []

                # NOTE(review): `continue` keeps using up the remaining tries
                # (each one repeats the lookup) even though the target has
                # been reached; the main loop uses `break` here — confirm.
                if len(curr_element_list) >= minimum_unique_elements:
                    flag_condition_met = True
                    continue
                    
                t1 = time()
                # NOTE(review): adds the full time since t0 on every try, so
                # the total grows faster than wall-clock time — confirm.
                total_time_elapsed += (t1-t0)

                # The checkpoint flags only feed the commented-out prints.
                if not flag_checkpoints_10m[counter_minute] and total_time_elapsed >= CHECKPOINT_10M[counter_minute]:
                    # print(".", end="")
                    flag_checkpoints_10m[counter_minute] = True
                    counter_minute += 1

                    
                if not flag_checkpoint_1h[counter_hour] and total_time_elapsed >= CHECKPOINT_1H[counter_hour]:
                    # print(f"|{(total_time_elapsed/3600):.2f}|", end="")
                    flag_checkpoint_1h[counter_hour] = True
                    counter_hour += 1

                sleep(sleep_min_base + _get_random())
            
            # print(".\n", end="")
            # _log(t0, by_tag, by_val, max_val, len(curr_element_list), counter, max_local_tries, "[After-Loop]")
            if flag_condition_met or len(curr_element_list) >= minimum_unique_elements:
                flag_condition_met = True
                break
        
        # print(len(curr_element_list), minimum_unique_elements)
        if flag_condition_met or len(curr_element_list) >= minimum_unique_elements:
            flag_condition_met = True
            break

In [None]:
# Getting baseURI property
# Click each element found after scrolling and record the `baseURI` property
# it reports afterwards. Elements that cannot be clicked or read are skipped.
new_uri_list: list[str] = []
for web_element in curr_element_list:
    # Reset per element so a failure never re-appends the previous URI.
    uri = None
    try:
        web_element.click()
        uri = web_element.get_property("baseURI")
    except Exception as e:
        # Best-effort: report the failure and move on to the next element.
        print(f"Skipping element: {type(e).__name__}")
    sleep(sleep_min_base + _get_random())
    if uri:
        new_uri_list.append(uri)

In [None]:
# Final of URI List

# Extract the image links for the URIs collected after scrolling and add them
# to the links from the first pass.
new_image_link_list: list[str] = []

for uri in new_uri_list:
    image_link = _remove_translate(uri, search_url)
    new_image_link_list.append(image_link)

image_link_list.extend(new_image_link_list)

# Merge the URIs of both passes. dict.fromkeys removes duplicates like a set
# but keeps first-seen order, so the printed sample is stable between runs.
final_uri_list: list[str] = list(dict.fromkeys(uri_list + new_uri_list))

print(f"Final URI list has {len(final_uri_list)} elements")
# Show at most the first ten URIs, one per line.
sample_head = "\n".join(final_uri_list[:10])
print(f"# Sample Head:\n{sample_head}")

In [None]:
# Close the WebDriver. Cells below that still use `driver` or previously found
# elements need a live session and will not work after this call.
driver.quit()

# Old Code 1

In [None]:
# web_element = curr_element_list[0]
# my_click = web_element.click()
# print(my_click)

# Click each collected element and record the `baseURI` property it reports
# afterwards. Elements that cannot be clicked or read are skipped.
uri_list: list[str] = []
for web_element in curr_element_list:
    # Reset per element so a failure never re-appends the previous URI.
    uri = None
    try:
        web_element.click()
        uri = web_element.get_property("baseURI")
    except Exception as e:
        print(f"Skipping element: {type(e).__name__}")
    sleep(sleep_min_base + _get_random())
    if uri:
        uri_list.append(uri)


print(f"There are {len(uri_list)} ULRs in uri_list")

# Final Links 1
image_link_list: list[str] = []

for uri in uri_list:
    # The search URL is the prefix to strip; annotations are not allowed in a
    # call, so the original `base_url: str` argument was a syntax error.
    image_link = _remove_translate(uri, search_url)
    image_link_list.append(image_link)

print(f"There are {len(image_link_list)} ULRs in image_link_list")
# Guard the sample print so an empty result does not raise IndexError.
if image_link_list:
    print(f"Example: {image_link_list[0]}")

In [None]:
from urllib.parse import unquote
import difflib
from urllib.parse import urlparse, parse_qs, unquote

# Pick the second recorded URI as the sample that the following debug cells
# take apart step by step, and show it.
base_url = search_url
web_element = uri_list[1]
print(web_element)

In [None]:
# 1. Parse: split the sample URI (`web_element`) into its URL components.
parsed = urlparse(web_element)
print(parsed)

In [None]:
# 2. Query: turn the query string into a dict of parameter name -> value list.
# NOTE: this rebinds the notebook-level name `query`, which held the search
# text set in the INPUTS cell.
query = parse_qs(parsed.query)
print(query)

In [None]:
# DuckDuckGo image links usually store the real URL in `iai`
# `a` is True when that parameter is MISSING from the parsed query.
a = "iai" not in query
print(a)

In [None]:
# 3. Parse_qs returns lists, so take the first `iai` value.
#    Raises KeyError when the sample has no `iai` parameter.
encoded_url = query["iai"][0]
print(encoded_url)

In [None]:
# 4. Decode percent-encoding
# NOTE(review): parse_qs already percent-decodes its values, so this is a
# second decoding pass — confirm it is wanted.
decoded_url = unquote(encoded_url)
print(decoded_url)

### Separator

In [None]:
# 1. Normalize spaces encoded as '+': every "+" in the sample URI becomes " ".
normalized = web_element.replace("+", " ")
print(normalized)

In [None]:
# 2. Attempt direct removal if web_element starts with base_url
# NOTE(review): `normalized` has its "+" replaced by spaces while `base_url`
# does not, so this is False whenever `base_url` contains "+".
a = normalized.startswith(base_url)
print(a)

In [None]:
# Drop the first len(base_url) characters, keeping what follows the prefix.
remainder = normalized[len(base_url):]
print(remainder)

In [None]:
# 3. Fuzzy match (threshold ≈ 90%): compare the first len(base_url) characters
#    of the normalized URI with base_url; `b` is True at a ratio of 0.90 or more.
ratio = difflib.SequenceMatcher(None, normalized[:len(base_url)], base_url).ratio()
b = ratio >= 0.90
print(ratio)
print(b)

In [None]:
# Same slice as for the exact-prefix case: drop the first len(base_url) characters.
remainder = normalized[len(base_url):]
print(remainder)

In [None]:
# 4. If no good match, fall back:
#    Find the first encoded "http" that is not the main one.
#    `idx` is -1 when no further "http" exists.
idx = normalized.find("http", 5)  # skip the initial "http"
print(idx)

In [None]:
# Keep everything from the position found above.
# NOTE: with idx == -1 this keeps only the last character.
remainder = normalized[idx:]
print(remainder)

In [None]:
# 5. Strip leading junk: remove "&", "?" and "=" characters from the start,
#    one at a time, until the text starts with something else.
while remainder.startswith(("&", "?", "=")):
    remainder = remainder[1:]
print(remainder)

In [None]:
# 6. Decode URL-encoded parts
decoded = unquote(remainder)
# If a "?" occurs within the last 20 characters, drop the last "?" and
# everything after it.
if "?" in decoded[-20:]:
    decoded = "?".join(decoded.split("?")[:-1])
print(decoded)

# Old Code 2

In [None]:
# Click each collected element and record the `baseURI` property it reports
# afterwards. Elements that cannot be clicked or read are skipped.
uri_list: list[str] = []
for web_element in curr_element_list:
    # Reset per element so a failure never re-appends the previous URI.
    uri = None
    try:
        web_element.click()
        uri = web_element.get_property("baseURI")
    except Exception as e:
        print(f"Skipping element: {type(e).__name__}")
    sleep(sleep_min_base + _get_random())
    if uri:
        uri_list.append(uri)


print(f"There are {len(uri_list)} ULRs in uri_list")

# Final Links 1
image_link_list: list[str] = []

for uri in uri_list:
    # The search URL is the prefix to strip; annotations are not allowed in a
    # call, so the original `base_url: str` argument was a syntax error.
    image_link = _remove_translate(uri, search_url)
    image_link_list.append(image_link)

print(f"There are {len(image_link_list)} ULRs in image_link_list")
# Guard the sample print so an empty result does not raise IndexError.
if image_link_list:
    print(f"Example: {image_link_list[0]}")

In [None]:
def select_location_dropdown(
    *,
    target_text_list: list[str] | None = None,
) -> None:
    """
    Click the first dropdown option whose text matches a target label.

    Searches every element with data-testid='dropdown-options' for <span>
    elements, compares each span's `outerText` (trimmed, case-insensitive)
    with the target labels, and clicks the first match. The click is tried
    with JavaScript first, then natively on the span, then natively on each
    ancestor in turn. The function always ends with a short random pause.

    This definition replaces any earlier `select_location_dropdown` in the
    notebook namespace.

    Args:
        target_text_list: Labels accepted as a match. Defaults to a built-in
            list of US / United States spellings.
    """
    if target_text_list is None:
        target_text_list = [
            "US (English)",
            "US",
            "USA",
            "Estados Unidos (inglês)",
            "Estados Unidos",
            "United States (english)",
            "United States",
        ]
    wanted = {text.strip().lower() for text in target_text_list}

    def _walk_ancestors(start):
        """Yield `start` and then each parent, stopping at the first non-element."""
        node = start
        while isinstance(node, WebElement):
            yield node
            try:
                node = node.get_property("parentNode")
            except Exception:
                # The element can no longer be queried; stop walking.
                return

    def _click_with_fallbacks(span_element) -> bool:
        """Try a JS click, then native clicks up the ancestor chain."""
        try:
            driver.execute_script("arguments[0].click();", span_element)
            return True
        except Exception:
            pass
        for candidate in _walk_ancestors(span_element):
            try:
                candidate.click()
                return True
            except Exception:
                # Not clickable here; try the next ancestor.
                continue
        return False

    def _select_first_match() -> bool:
        """Scan the dropdown containers and click the first matching span."""
        div_element_list = driver.find_elements(By.XPATH, "//*[@data-testid='dropdown-options']")

        for idx1, div_element in enumerate(div_element_list, start=1):

            print(f"# Div {idx1} of {len(div_element_list)}: {div_element}")
            span_element_list = div_element.find_elements(By.TAG_NAME, "span")

            for idx2, span_element in enumerate(span_element_list, start=1):

                outer_text: str = span_element.get_property("outerText")
                print(f"    # Span {idx2} of {len(span_element_list)} has outer_text = {outer_text} | type = {type(outer_text)}")

                if not outer_text or outer_text.strip().lower() not in wanted:
                    continue
                if _click_with_fallbacks(span_element):
                    return True

        return False

    _select_first_match()
    sleep(1.0 + _get_random())


select_location_dropdown()

In [None]:
#for _ in range(0, 7):
#element.get_property("parentNode")
#div_element_list = driver.find_elements(By.XPATH, "//*[@data-testid='dropdown-options']")

# print(len(div_element_list))
# div_children_list_1 = div_element_list[0].get_property("childNodes")
# print(len(div_children_list_1))
# div_children_list_2 = div_children_list_1[0].get_property("childNodes")
# print(len(div_children_list_2))
# div_children_list_3 = div_children_list_2[0].get_property("childNodes")
# print(len(div_children_list_3))

# classname_0 = div_element_list[0].get_property("className")
# classname_1 = div_children_list_1[0].get_property("className")
# classname_2 = div_children_list_2[0].get_property("className")
# classname_3 = div_children_list_3[0].get_property("className")

# print(f"classname_0: |{classname_0}|")
# print(f"classname_1: |{classname_1}|")
# print(f"classname_2: |{classname_2}|")
# print(f"classname_3: |{classname_3}|")

# def select_location_dropdown(
#     *,
#     target_text_list: list[str] = [
#         "US (English)",
#         "US",
#         "USA",
#         "Estados Unidos (inglês)",
#         "Estados Unidos",
#         "United States (english)",
#         "United States"
#     ]
# ) -> None:
#     """
#     Placeholder
#     """

#     div_element_list = driver.find_elements(By.XPATH, "//*[@data-testid='dropdown-options']")
    
#     for current_element in walk_dfs(div_element_list):

#         class_name = current_element.get_property("className")
#         outer_text = current_element.get_property("outerText")

#         if outer_text:
#                 for target_text in target_text_list:
#                     if target_text.strip().lower() == outer_text.strip().lower():


#         if idx >= len(prev_element_list):
            
                        
# #for element in 
# #print(len(span_element_list))

In [None]:
# BODY
# For every element in `html_elements`, collect the list of its <body> descendants.
# NOTE(review): `html_elements` is not defined anywhere in the visible
# notebook — confirm which cell is expected to create it.
body_element_list = []

for e in html_elements:

    # Find all <body> elements
    body_elements = e.find_elements(By.TAG_NAME, "body")
    # body_elements = waiter.until(e.find_elements((By.TAG_NAME, "body")))
    body_element_list.append(body_elements)
    sleep(1)

# Print the number of result lists collected (one per parent searched, not
# one per <body> element found).
print(f"Found {len(body_element_list)} body elements")

In [None]:
# DIV ID = "web_content_wrapper"
# For every <body> element collected above, collect the list of its
# descendants with id "web_content_wrapper".
divwebcont_element_list = []

for sublist in body_element_list:
    for e in sublist:

        # Find all <div id="web_content_wrapper"> elements
        divwebcont_elements = e.find_elements(By.ID, "web_content_wrapper")
        # divwebcont_elements = waiter.until(e.find_elements((By.ID, "web_content_wrapper")))
        divwebcont_element_list.append(divwebcont_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(divwebcont_element_list)} div ID=web_content_wrapper elements")

In [None]:
# DIV XPATH = "'//*[@id="react-layout"]'"
# For every wrapper element collected above, collect the elements matching
# the XPath //*[@id="react-layout"].
# NOTE(review): an XPath starting with "//" is evaluated from the document
# root even when called on an element, so the parent `e` probably does not
# narrow the search — confirm.
divwebxpath_element_list = []

for sublist in divwebcont_element_list:
    for e in sublist:

        # Find all elements with id "react-layout"
        divwebxpath_elements = e.find_elements(By.XPATH, '//*[@id="react-layout"]')
        # divwebxpath_elements = waiter.until(e.find_elements((By.XPATH, '//*[@id="react-layout"]')))
        divwebxpath_element_list.append(divwebxpath_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(divwebxpath_element_list)} div XPATH=//*[@id='react-layout'] elements")

In [None]:
# Show the raw nested list of react-layout matches for inspection.
print(divwebxpath_element_list)

In [None]:
# DIV XPATH = "//*[@id='react-layout']/div/div[2]/div/div[2]"
# NOTE(review): the heading and the print label name the XPath above, but the
# active lookup uses "//*", which matches every element; the specific XPath
# only appears in the commented-out line — confirm which one is intended.
divwebxpathreact_element_list = []

for sublist in divwebxpath_element_list:
    for e in sublist:
        
        # Find all elements (XPath "//*")
        divwebxpathreact_elements = e.find_elements(By.XPATH, "//*")
        # divwebxpathreact_elements = waiter.until(e.find_elements((By.XPATH, "//*[@id='react-layout']/div/div[2]/div/div[2]")))
        divwebxpathreact_element_list.append(divwebxpathreact_elements)
        sleep(3)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(divwebxpathreact_element_list)} div XPATH=//*[@id='react-layout']/div/div[2]/div/div[2] elements")

In [None]:
# SECTION
# For every element collected above, collect the list of its <section>
# descendants. One second is slept per parent element.
section_element_list = []

for sublist in divwebxpathreact_element_list:
    for e in sublist:
        
        # Find all <section> elements
        section_elements = e.find_elements(By.TAG_NAME, "section")
        # section_elements = waiter.until(e.find_elements((By.TAG_NAME, "section")))
        section_element_list.append(section_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched,
# including empty ones).
print(f"Found {len(section_element_list)} section elements")

In [None]:
# Drop the empty result lists, then show how many remain and the first five.
section_element_list = [e for e in section_element_list if e]
print(f"Found {len(section_element_list)} section elements")
print(section_element_list[:5])

In [None]:
# OL 1
# For every <section> element kept above, collect the list of its <ol> descendants.
ol_element_list_1 = []

for sublist in section_element_list:
    for e in sublist:
    
        # Find all <ol> elements
        ol_elements = e.find_elements(By.TAG_NAME, "ol")
        # ol_elements = waiter.until(e.find_elements((By.TAG_NAME, "ol")))
        ol_element_list_1.append(ol_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(ol_element_list_1)} ol1 elements")

In [None]:
# Drop the empty result lists, then show how many remain and the first five.
# The label names the ol1 level (it previously said "section").
ol_element_list_1 = [e for e in ol_element_list_1 if e]
print(f"Found {len(ol_element_list_1)} ol1 elements")
print(ol_element_list_1[:5])

In [None]:
# OL 2
# For every <ol> element kept above, collect the list of its nested <ol> descendants.
ol_element_list_2 = []

for sublist in ol_element_list_1:
    for e in sublist:

        # Find all <ol> elements
        ol_elements = e.find_elements(By.TAG_NAME, "ol")
        # ol_elements = waiter.until(e.find_elements((By.TAG_NAME, "ol")))
        ol_element_list_2.append(ol_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(ol_element_list_2)} ol2 elements")

In [None]:
# Drop the empty result lists, then show how many remain and the first five.
# The label names the ol2 level (it previously said "section").
ol_element_list_2 = [e for e in ol_element_list_2 if e]
print(f"Found {len(ol_element_list_2)} ol2 elements")
print(ol_element_list_2[:5])

In [None]:
# FIGURE
# For every nested <ol> element kept above, collect the list of its <figure> descendants.
figure_element_list = []

for sublist in ol_element_list_2:
    for e in sublist:

        # Find all <figure> elements
        figure_elements = e.find_elements(By.TAG_NAME, "figure")
        # figure_elements = waiter.until(e.find_elements((By.TAG_NAME, "figure")))
        figure_element_list.append(figure_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(figure_element_list)} figure elements")

In [None]:
# Drop the empty result lists, then show how many remain and the first one.
# The label names the figure level (it previously said "section"), and the
# redundant second [:1] slice is removed.
figure_element_list = [e for e in figure_element_list if e]
print(f"Found {len(figure_element_list)} figure elements")
print(figure_element_list[:1])

In [None]:
# DIV XPATH = "/html/body/div[2]/div[6]/div[4]/div/div[2]/div/div[2]/section/ol/li[1]/ol/li[1]/figure/div[1]"
# NOTE(review): the heading names a specific path, but the active lookup uses
# "/html/body/*". That is an absolute XPath, so it probably returns the
# children of <body> regardless of which <figure> it is called on — confirm.
divwebxpathreact_body_li_element_list = []

for sublist in figure_element_list:
    for e in sublist:

        # Find all elements matching the XPath "/html/body/*"
        divwebxpathreact_body_li_elements = e.find_elements(By.XPATH, "/html/body/*")
        # divwebxpathreact_body_li_elements = waiter.until(e.find_elements((By.XPATH, "/html/body/div[2]/div[6]/div[4]/div/div[2]/div/div[2]/section/ol/li[1]/ol/li[1]/figure/div[1]")))
        divwebxpathreact_body_li_element_list.append(divwebxpathreact_body_li_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(divwebxpathreact_body_li_element_list)} div XPATH=/html/body/div/ol/li/figure/div elements")

In [None]:
# IMG
# For every element collected above, collect the list of its <img> descendants.
img_element_list = []

for sublist in divwebxpathreact_body_li_element_list:
    for e in sublist:

        # Find all <img> elements
        img_elements = e.find_elements(By.TAG_NAME, "img")
        # img_elements = waiter.until(e.find_elements((By.TAG_NAME, "img")))
        img_element_list.append(img_elements)
        sleep(1)

# Print the number of result lists collected (one per parent searched).
print(f"Found {len(img_element_list)} img elements")

In [None]:
# Drop the empty result lists, then show how many remain and the first one.
# The second [:1] slice is redundant: it slices a list that already has at
# most one item.
img_element_list = [e for e in img_element_list if e]
print(f"Found {len(img_element_list)} img elements")
print(img_element_list[:1][:1])

In [None]:
# LINKS
# "https://duckduckgo.com/?q=Olafur+Dari+Olafsson&iar=images&iai=https%3A%2F%2Fknightedgemedia.com%2Fwp-content%2Fuploads%2F2025%2F12%2Folafur-darri-olafsson-max-parker-god-of-war-banner.jpg"
# "https://knightedgemedia.com/wp-content/uploads/2025/12/olafur-darri-olafsson-max-parker-god-of-war-banner.jpg"

link_list = []

for sublist in img_element_list:
    for e in sublist:

        # `baseURI` is a DOM property, not a CSS selector: used as a selector
        # it would only match <baseURI> tags, so read the property instead.
        link = e.get_property("baseURI")
        # img_elements = waiter.until(e.find_elements((By.TAG_NAME, "img")))
        link_list.append(link)
        sleep(1)

# Print the number of baseURI values collected (one per <img> element).
print(f"Found {len(link_list)} links with baseURI elements")

In [None]:
# Drop the empty entries, then show how many remain and the first one.
# The label names links (it previously said "img elements").
link_list = [e for e in link_list if e]
print(f"Found {len(link_list)} links with baseURI elements")
print(link_list[:1])

In [None]:
# Check any name
def print_element(element, output_path: Path, mode):
    """Write the identifying attributes of *element* to *output_path*.

    Every combination of the prefixes ``""``, ``"tag_"``, ``"item_"`` with the
    suffixes ``"name"``, ``"id"``, ``"class"``, ``"style"`` is looked up twice:
    through ``element.get_attribute(name)`` and through ``getattr`` on the
    element object itself. For each name:

    * neither lookup truthy -> the name is skipped;
    * both truthy and equal -> that value is written once;
    * both truthy and different -> the pair ``(attribute, python_attr)``;
    * only one truthy -> that one.

    Each kept name is written as ``# name: value`` followed by a 50-dash rule,
    and the report ends with a 100-character ``=`` rule.

    Args:
        element: Object exposing ``get_attribute(name)``.
        output_path: File to write the report to.
        mode: Mode string passed to ``open`` (e.g. ``"w"`` or ``"a"``).
    """
    prefixes = ("", "tag_", "item_")
    suffixes = ("name", "id", "class", "style")

    # Collect first and open the file afterwards: a failing element lookup no
    # longer leaves a truncated file behind when mode is "w".
    found = {}
    for prefix in prefixes:
        for suffix in suffixes:
            name = prefix + suffix
            attr_value = element.get_attribute(name) or None
            obj_value = getattr(element, name, None)
            if attr_value and obj_value:
                found[name] = (
                    attr_value
                    if attr_value == obj_value
                    else (attr_value, obj_value)
                )
            elif attr_value or obj_value:
                found[name] = attr_value or obj_value

    # Plain concatenation instead of same-quote nesting inside f-strings,
    # which is a SyntaxError before Python 3.12.
    lines = []
    for name, value in found.items():
        lines.append(f"# {name}: {value}\n")
        lines.append("-" * 50 + "\n")
    lines.append("\n" + "=" * 100 + "\n\n")

    with open(output_path, mode, encoding="utf-8") as file:
        file.writelines(lines)

In [None]:
# Download Large Images
def download_large_images(driver, waiter, min_width, min_height):
    """Download every <img> on the current page that exceeds the given size.

    Waits (through *waiter*) until <img> elements are present, fetches each
    element's ``src`` with ``requests`` and saves the ones whose width AND
    height are strictly greater than the thresholds as JPEG files in the
    current working directory. Failures on one image are printed and do not
    stop the remaining downloads.

    Args:
        driver: Unused; kept so existing calls keep working.
        waiter: Wait object whose ``until`` is given the presence condition.
        min_width: Exclusive minimum width in pixels.
        min_height: Exclusive minimum height in pixels.
    """
    images = waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, "img")))

    for index, img in enumerate(images):
        # Best-effort per image: report the problem and move on to the next.
        try:
            src = img.get_attribute("src")
            if not src:
                print("Image has no src attribute")
                continue

            # A timeout keeps one unresponsive host from hanging the whole
            # run. The body is read in full anyway, so stream=True is dropped.
            response = requests.get(src, timeout=30)
            response.raise_for_status()  # HTTPError for 4xx / 5xx responses
            image = Image.open(BytesIO(response.content))
            width, height = image.size

            if width > min_width and height > min_height:
                print(f"Downloading: {src}")
                # The index keeps same-sized images from overwriting each
                # other; JPEG cannot store alpha/palette modes, so convert.
                image.convert("RGB").save(f"image_{index}_{width}x{height}.jpg")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

# for element in clickable_elements:
#     element_name = element.get_attribute("name") or element.get_attribute("id") or element.tag_name
#     print(f"Clicked element: {element_name}")
#     element.click()
#     WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "img"))) # wait for at least one img tag
#     download_large_images(driver, waiter, 200, 200)  # example dimensions; waiter is the second positional argument
#     driver.back() # go back to the main page

In [None]:
# Cheats to work in headless mode
# Scroll to the bottom of the page, back to the top, then reload.
# NOTE(review): presumably meant to trigger lazily loaded content - confirm.
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
sleep(1)
driver.execute_script("window.scrollTo(0, 0);")
sleep(2)
driver.refresh()

# Cheats to work in headless mode
# Second variant: move the pointer by relative offsets, then reload again.
# NOTE(review): `actions` is not defined in this cell - presumably an
# ActionChains bound to `driver`; confirm. Also check whether the Python
# ActionChains keeps already-performed actions queued between perform() calls
# (reset_actions() may be needed so the first move is not replayed).
actions.move_by_offset(100, 100).perform()
sleep(1)
actions.move_by_offset(-50, -50).perform()
sleep(1)
driver.refresh()

In [None]:
# Find all the <img> elements by tag name
img_elements = driver.find_elements(By.TAG_NAME, "img")

# Print the number of <img> elements found
print(f"Found {len(img_elements)} img elements")

# Same lookup through a CSS selector. "nodeName: IMG" is how a DOM inspector
# displays the nodeName property; it is not valid CSS selector syntax and makes
# the lookup fail. The type selector for image elements is simply "img".
img_elements = driver.find_elements(By.CSS_SELECTOR, "img")

# Print the number of <img> elements found
print(f"Found {len(img_elements)} img elements")


In [None]:
# Collect every ordered list (<ol>) currently present in the page
ol_elements = driver.find_elements(by=By.TAG_NAME, value="ol")

# Waiting variant, kept for reference (the *all* condition returns a list, so
# len() below would still work):
# ol_elements = waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, "ol")))

# Report how many <ol> elements were located
print(f"Found {len(ol_elements)} ol elements")

In [None]:
# Print every entry of img_elements (the element object itself, not its
# attributes). The commented call below would write the element's identifying
# attributes to output_err_file instead.
for element in img_elements:
    print(element)
    #print_element(element, output_err_file, mode="w")

In [None]:
# Optionally, print the text content of each <ol> element
# NOTE: the loop variable `ol` stays defined after the loop and is referenced
# by the next cell, so keep its name.
for ol in ol_elements:
    # Element object first, then its textContent attribute
    print(ol)
    value = ol.get_attribute("textContent")
    # print_element(element, output_err_file, mode="w")
    print(value)
    # Five-second pause between lists
    # NOTE(review): presumably to read the output at leisure - confirm.
    sleep(5)

In [None]:
# Looking for clickeable items
with open(output_err_file, "w", encoding="utf-8") as file:
    for ol1 in ol1_elements:
        try:
            # Find clickable elements *within* the current <ol>
            # This assumes the clickable elements are <a> tags; adjust if needed
            # Add a wait for the a elements to appear
            ol2_elements = waiter.until(ol.find_elements((By.TAG_NAME, "ol")))

            for ol2 
            
            for element in clickable_elements:
                try:

                    # Check any name
                    ename = element.get_attribute("name") or None
                    eid = element.get_attribute("id") or None
                    etag_name = element.get_attribute("tag_name") or None
                    tag_name = getattr(element, "tag_name", None)
                    file.write(f"{"#"*30} {ename} | {eid} | {etag_name} | {tag_name}\n")
                    
                    #Click on the element
                    element.click()

                    # Go back to the main page
                    waiter.until(driver.back())
                    
                    #Re-find the ol elements, as the page might have changed
                    ol_elements = waiter.until(EC.presence_of_all_elements_located((By.TAG_NAME, "ol")))
                    break #break to the parent loop.
                except Exception as e:
                    stp = f"[{str(dt.now()).split(".")[0]}] "
                    file.write(f"{stp}Could not click element: {e}\n")
                    file.write(f"{"-"*100}\n\n")
                    continue
    
        except Exception as e:
            stp = f"[{str(dt.now()).split(".")[0]}] "
            file.write(f"{stp}Could not find elements in ol: {e}\n")
            file.write(f"{"-"*100}\n\n")
            continue

In [None]:
# Timestamp prefix for log lines, e.g. "[2025-01-31 12:34:56] ".
# A datetime format spec yields the same text as str(dt.now()) with the
# fractional seconds cut off, without nesting double quotes inside the
# f-string (a SyntaxError before Python 3.12).
stp = f"[{dt.now():%Y-%m-%d %H:%M:%S}] "
print(stp)

In [None]:
# time is already imported by the notebook's import cell; re-imported here so
# the cell can be run on its own.
from time import time

# Measure how long a randomized pause actually takes.
t1 = time()
# NOTE(review): _get_random() is defined elsewhere in the notebook -
# presumably returns a small random number of seconds; verify.
sleep(1+_get_random())
t2 = time()
# Elapsed wall-clock time in seconds
print(t2-t1)