In [50]:
import requests as req
from bs4 import BeautifulSoup
import unicodedata
import re
import tiktoken

In [None]:
BASE_URL = 'https://www.dol.gov'

In [68]:
def get_html(url):
    """Fetch HTML content from a given URL."""
    response = req.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    return response.text

def search_for_link(html):
    """Parse HTML and extract relevant data."""
    data = []
    for link in html.find_all('a', href=True):
        data.append({
            'text': link.get_text(strip=True),
            'url': BASE_URL + link['href'] if link['href'].startswith('/') else link['href']
        })
    
    return data

def adjust_text(soup, link):
    """Substitui footnotes <a href="#..."> pelo texto do rodapé sem o número."""
    if not link:
        return soup.get_text(" ", strip=True)

    url = link[0]['url']
    href_html = get_html(url)
    href_soup = BeautifulSoup(href_html, "html.parser")

    if '#' in url:
        footnote_id = url.split('#')[-1]
        div = href_soup.find('div', {'id': footnote_id})
        if div:
            for sup in div.find_all("sup"):
                sup.decompose()
            footnote_text = div.get_text(strip=True)

            for a in soup.find_all("a", href=True):
                if footnote_id in a["href"]:
                    a.replace_with(" " + footnote_text + " ")

    return soup.get_text(" ", strip=True)


def normalize_text(text):
    """Normaliza caracteres, remove espaços extras e caracteres de controle."""
    text = unicodedata.normalize("NFKC", text)
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    text = " ".join(text.split())
    return text.strip()


def chunk_text(text, max_tokens=512, overlap=50):
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(text)
    
    chunks = []
    if len(tokens)>max_tokens:
        
        for i in range(0, len(tokens), max_tokens - overlap):
            chunk = tokens[i:i+max_tokens]
            chunks.append(enc.decode(chunk))
    
    return chunks

#def embed_texts(chunks, model="models/embedding-001"):
    """Gera embeddings com Gemini para uma lista de chunks."""
    embeddings = []
    for chunk in chunks:
        response = genai.embed_content(model=model, content=chunk)
        embeddings.append(response["embedding"])
    return embeddings




__Get HTML__

In [None]:
html = get_html(f"{BASE_URL}/agencies/whd/minimum-wage/state")
soup = BeautifulSoup(html, "html.parser")

cat = soup.find('div', {"id": "states"})
state = cat.find_all('div')

countries = {}

for state_name in state:
    if state_name.get('id').lower() == 'as': # America Samoa is a special case
        continue
    link = search_for_link(state_name)

    # pega os irmãos após o <h2> (já como soup)
    siblings = state_name.h2.find_next_siblings()
    temp_soup = BeautifulSoup("".join(str(s) for s in siblings), "html.parser")

    # 1. Substitui footnotes
    clean_text = adjust_text(temp_soup, link)

    # 2. Normaliza texto
    norm_text = normalize_text(clean_text)

    # 3. Chunking
    chunks = chunk_text(norm_text)
    
    # 4. Geração de embeddings
   # embeddings = embed_texts(chunks, model="models/embedding-001")

    # 5. Guardar tudo no dicionário
    countries[state_name.h2.text.capitalize()] = {
        'acronym': state_name.get('id').lower(),
        'text': norm_text,       # texto completo normalizado
        'chunks': chunks,        # lista de chunks
        #'embeddings': embeddings,# embeddings de cada chunk
        'link': link
    }

In [70]:
countries

{'Alabama': {'acronym': 'al',
  'text': 'No state minimum wage law. Employers subject to the Fair Labor Standards Act must pay the current Federal minimum wage of $7.25 per hour.',
  'chunks': [],
  'href': []},
 'Alaska': {'acronym': 'ak',
  'text': "Basic Minimum Rate (per hour): $13.00 Premium Pay After Designated Hours The overtime premium rate is one and one-half times the employee's regular rate, unless otherwise specified. : Daily - 8, Weekly - 40 Under a voluntary flexible work hour plan approved by the Alaska Department of Labor, a 10 hour day, 40 hour workweek may be instituted with premium pay after 10 hours a day. The premium overtime pay requirement on either a daily or weekly basis is not applicable to employers of fewer than 4 employees. The minimum wage is adjusted annually based on a set formula.",
  'chunks': [],
  'href': [{'text': '1',
    'url': 'https://www.dol.gov/agencies/whd/minimum-wage/state#footnote'}]},
 'American samoa': {'acronym': 'as',
  'text': 'Americ