In [1]:
import requests as req
from bs4 import BeautifulSoup
import unicodedata
import re
import tiktoken
from google import genai
from dotenv import load_dotenv
import os
import uuid
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Scheme + host that search_for_link prepends to site-relative hrefs.
BASE_URL = "https://www.dol.gov"

# Load variables from the local .env file into the process environment.
load_dotenv()

# Google GenAI client, authenticated with the key read from the environment.
api_key = os.environ.get("GOOGLE_API_KEY")
client = genai.Client(api_key=api_key)

In [4]:
def get_html(url, timeout=30):
    """Fetch the body of ``url`` with an HTTP GET and return it as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this an unresponsive
            host would block the notebook indefinitely.

    Returns:
        The decoded response body (``response.text``).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = req.get(url, timeout=timeout)
    response.raise_for_status()  # Turn 4xx/5xx answers into exceptions
    return response.text

def search_for_link(html):
    """Collect every hyperlink found under a parsed HTML node.

    ``html`` must expose ``find_all`` (e.g. a BeautifulSoup tag). Each ``<a>``
    that carries an ``href`` yields a dict with its stripped text under
    ``'text'`` and its target under ``'url'``. Targets starting with ``/`` get
    ``BASE_URL`` prepended; every other target is kept verbatim.
    """
    def resolve(href):
        # Only site-relative paths are rewritten; fragments and absolute URLs pass through.
        if href.startswith('/'):
            return BASE_URL + href
        return href

    return [
        {'text': anchor.get_text(strip=True), 'url': resolve(anchor['href'])}
        for anchor in html.find_all('a', href=True)
    ]

def adjust_text(soup, link):
    """Replace footnote anchors in ``soup`` with the footnote text (without its number).

    Every entry of ``link`` whose URL carries a ``#fragment`` is treated as a
    footnote reference: the referenced page is downloaded (once per page), the
    ``<div id="fragment">`` is looked up, its ``<sup>`` markers are dropped and
    the remaining text replaces each ``<a>`` in ``soup`` that points at the
    same fragment.

    Args:
        soup: Parsed HTML fragment to clean. It is modified in place.
        link: List of ``{'text': ..., 'url': ...}`` dicts, as returned by
            ``search_for_link``. May be empty.

    Returns:
        The text of ``soup`` with pieces joined by single spaces.

    Raises:
        Whatever ``get_html`` raises if a footnote page cannot be downloaded.
    """
    if not link:
        return soup.get_text(" ", strip=True)

    pages = {}       # page URL -> parsed page, so each page is fetched once
    handled = set()  # footnote URLs that were already processed

    for entry in link:
        url = entry['url']
        # Links without a fragment are ordinary links, not footnotes: no fetch needed.
        if '#' not in url or url in handled:
            continue
        handled.add(url)

        page_url, _, footnote_id = url.rpartition('#')
        if not page_url or not footnote_id:
            # A bare "#id" (no page part) or an empty fragment cannot be fetched.
            continue

        if page_url not in pages:
            pages[page_url] = BeautifulSoup(get_html(page_url), "html.parser")

        div = pages[page_url].find('div', {'id': footnote_id})
        if div is None:
            continue

        # Drop the superscript footnote number so only the note text remains.
        for sup in div.find_all("sup"):
            sup.decompose()
        footnote_text = div.get_text(strip=True)

        for a in soup.find_all("a", href=True):
            href = a["href"]
            # Compare the whole fragment: a substring test would let "foot1"
            # also match "#foot10".
            if '#' in href and href.rpartition('#')[2] == footnote_id:
                a.replace_with(" " + footnote_text + " ")

    return soup.get_text(" ", strip=True)


def normalize_text(text):
    """Normalize characters, collapse whitespace and drop control characters.

    Steps:
      1. Apply Unicode NFKC normalization.
      2. Turn every whitespace character into a plain space. Tabs and newlines
         belong to Unicode category ``Cc``; converting them first keeps the
         words around them separated instead of gluing them together when
         control characters are removed.
      3. Remove the remaining characters whose category starts with ``C``.
      4. Collapse runs of spaces and trim both ends.

    Args:
        text: String to clean.

    Returns:
        The cleaned, single-spaced string.
    """
    text = unicodedata.normalize("NFKC", text)

    kept = []
    for ch in text:
        if ch.isspace():
            kept.append(" ")
        elif unicodedata.category(ch)[0] != "C":
            kept.append(ch)

    return " ".join("".join(kept).split())




__Get HTML__

## Check whether Guam, the Northern Mariana Islands, Puerto Rico, the Virgin Islands, and American Samoa need to be included

In [5]:
# Download the page listing state minimum-wage laws and build one document per state.
page_url = f"{BASE_URL}/agencies/whd/minimum-wage/state"
html = get_html(page_url)
soup = BeautifulSoup(html, "html.parser")

cat = soup.find('div', {"id": "states"})
if cat is None:
    raise RuntimeError('No <div id="states"> found on the page; its layout may have changed.')
state = cat.find_all('div')

docs = {}

for state_name in state:
    # A div may lack an id attribute; treat that as an empty id instead of crashing.
    state_id = (state_name.get('id') or '').lower()
    if state_id == 'as':  # American Samoa is a special case
        continue
    # find_all('div') also returns nested divs; only divs with a heading are states.
    if state_name.h2 is None:
        continue
    state_label = state_name.h2.text

    link = search_for_link(state_name)

    # Take the siblings that follow the <h2> and re-parse them as their own soup
    siblings = state_name.h2.find_next_siblings()
    temp_soup = BeautifulSoup("".join(str(s) for s in siblings), "html.parser")

    # 1. Replace footnote anchors with the footnote text
    clean_text = adjust_text(temp_soup, link)

    # 2. Normalize the text
    norm_text = normalize_text(clean_text)
    # NOTE(review): "mimum" is a typo for "minimum"; left untouched here because
    # this string is part of the indexed document text.
    norm_text = f"For the state of the {state_label} the laws of mimum wage is: " + norm_text

    # Deterministic id (same state -> same id), so re-running the notebook
    # overwrites the stored vector instead of adding a duplicate under a new
    # random id.
    doc_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{page_url}#{state_id or state_label}"))
    docs[doc_id] = {"text": norm_text, "metadata": {"state": state_label, 'text': norm_text}}

In [6]:
import pinecone
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [7]:
# Connect to Pinecone with the key read from the environment.
api_key = os.getenv("PINECONE_API")
pc = Pinecone(api_key=api_key)

# Name of the index to create when missing. It was spelled 'firtsindex', which
# made this cell check and create a different index from the 'firstindex' one
# that is opened with pc.Index(...) afterwards.
index_name = 'firstindex'
indexes = pc.list_indexes()
index_names = [existing.name for existing in indexes]
if index_name not in index_names:
    # NOTE(review): dimension must equal the size of the vectors that are
    # upserted; presumably 384 matches the embedding model loaded later — confirm.
    pc.create_index(
        name=index_name,
        dimension=384,
        spec=ServerlessSpec(cloud='aws', region='us-east-1'),
    )


In [8]:
# Handle to the Pinecone index that the cells below upsert into and query.
# NOTE(review): this literal must match the name used by the index-creation cell.
index = pc.Index('firstindex')
# Sentence-embedding model used to encode both the documents and the queries.
# NOTE(review): its output size presumably matches the index dimension — confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [45]:
def add_to_pinecone(doc, model, index, batch_size=100):
    """Embed documents and upsert them into a Pinecone index in batches.

    Args:
        doc: Mapping ``doc_id -> {"text": str, "metadata": dict}``.
        model: Encoder exposing ``encode(list_of_texts)`` whose result has a
            ``tolist()`` method returning one vector per text.
        index: Index object exposing ``upsert(vectors=...)``.
        batch_size: Maximum number of vectors sent per upsert request. Batching
            avoids one network round trip per document.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    items = list(doc.items())
    # An empty mapping produces no batches, so no (empty) upsert is ever sent.
    for start in range(0, len(items), batch_size):
        chunk = items[start:start + batch_size]
        embeddings = model.encode([entry['text'] for _, entry in chunk]).tolist()

        vectors = [
            {"id": doc_id, "values": values, "metadata": entry["metadata"]}
            for (doc_id, entry), values in zip(chunk, embeddings)
        ]
        index.upsert(vectors=vectors)
# Index every scraped state document.
add_to_pinecone(doc=docs, model=model, index=index)

In [9]:
def semantic_search(index, query, n_results):
    """Return the ``n_results`` nearest matches for ``query`` from ``index``.

    The query is embedded with the module-level ``model`` and sent to
    ``index.query`` with metadata included in the response.
    """
    encoded_batch = model.encode([query]).tolist()
    return index.query(
        vector=encoded_batch[0],
        top_k=n_results,
        include_metadata=True,
    )



In [13]:
from groq import Groq
# Groq client used by generate_response; the key comes from the environment.
groq_api_key = os.environ.get("GROQ_API")
client_groq = Groq(api_key=groq_api_key)

In [10]:

def get_context(results):
    """Join the stored text of every match into one context string.

    Reads ``results['matches']`` and takes ``['metadata']['text']`` from each
    match, separating consecutive passages with a blank line.
    """
    passages = []
    for hit in results['matches']:
        passages.append(hit['metadata']['text'])
    return "\n\n".join(passages)

def get_prompt(context: str, query: str):
    """Build the user prompt sent to the chat model.

    The retrieved ``context`` is placed under "Context from documents:" and the
    ``query`` after "Human:"; the text ends with "Assistant:" for the model to
    continue.

    NOTE(review): the template mentions a conversation history, but only
    ``context`` and ``query`` are interpolated.
    """
    # Lines are spelled out one by one so the template's whitespace is explicit.
    return (
        "Based on the following context and conversation history, \n"
        "        please provide a relevant and contextual response. \n"
        "        If the answer cannot be derived from the context, only use the conversation history \n"
        '        or say "I cannot answer this based on the provided information."\n'
        "\n"
        "        Context from documents:\n"
        f"        {context}\n"
        "\n"
        f"        Human: {query}\n"
        "\n"
        "        Assistant:"
    )

def generate_response(query: str, context: str):
    """Answer ``query`` from ``context`` with Groq's ``llama-3.1-8b-instant``.

    The prompt comes from ``get_prompt``. The completion is requested in
    streaming mode and the streamed pieces are concatenated into one string.

    Returns:
        The generated answer, or ``"Error generating response: <message>"``
        when the API call or the stream raises.
    """
    user_prompt = get_prompt(context, query)
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that answers questions about the minimal wage on USA"},
        {"role": "user", "content": user_prompt},
    ]

    try:
        stream = client_groq.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=chat_messages,
            temperature=1,
            max_tokens=1024,
            top_p=1,
            stream=True,
        )
        # A chunk's delta may carry no content; count it as an empty string.
        return "".join(part.choices[0].delta.content or "" for part in stream)
    except Exception as e:
        return f"Error generating response: {e!s}"


In [11]:
def rag_query(pinecone_index, query: str, n_chunks: int = 2):
    """Run retrieval-augmented generation for ``query``.

    Looks up the ``n_chunks`` closest entries in ``pinecone_index``, joins
    their text into a context, and returns the answer generated from it.
    """
    matches = semantic_search(pinecone_index, query, n_chunks)
    return generate_response(query, get_context(matches))


In [16]:
# Example question answered through the Pinecone + Groq pipeline.
query = 'What is the California minimal wage? Have any extra points? Bullet points'
response = rag_query(pinecone_index=index, query=query)

print('Pergunta: ', query)
print('Reposta: ', response)

Pergunta:  What is the California minimal wage? Have any extra points? Bullet points
Reposta:  The basic minimum wage in California is $16.50 per hour. Additionally, there are the following premium pay rules for overtime hours worked:

* Daily:
  • Any work in excess of 8 hours in a workday is paid at 1.5 times the regular rate of pay (time and a half)
  • Any work in excess of 12 hours in a day is paid at double the regular rate of pay
* Weekly:
  • Any work in excess of 40 hours in a week is paid at 1.5 times the regular rate of pay (time and a half)
  • The first 8 hours worked on the seventh day of a workweek are paid at 1.5 times the regular rate of pay (time and a half)
  • Any work in excess of 8 hours on the seventh day of a workweek is paid at double the regular rate of pay


In [17]:
from dotenv import load_dotenv
import os

# Populate the process environment from the local .env file.
load_dotenv()

# Sanity check: report whether the key is present without printing its value.
print("OPENAI_API_KEY loaded:", "OPENAI_API_KEY" in os.environ)

# --- LightRAG setup ---
from lightrag import LightRAG
from lightrag.llm.openai import gpt_4o_mini_complete, openai_embed
from lightrag.kg.shared_storage import initialize_pipeline_status

rag = LightRAG(
    working_dir="data/",
    embedding_func=openai_embed,
    llm_model_func=gpt_4o_mini_complete,
)

# Initialize storages
await rag.initialize_storages()
await initialize_pipeline_status()

# # Insert your docs
for id, state in docs.items():
    await rag.ainsert(state['text'])


INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': 'data/vdb_entities.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': 'data/vdb_relationships.json'} 0 data
INFO:nano-vectordb:Init {'embedding_dim': 1536, 'metric': 'cosine', 'storage_file': 'data/vdb_chunks.json'} 0 data
Rerank is enabled but no rerank_model_func provided. Reranking will be skipped.


OPENAI_API_KEY loaded: True


INFO:openai._base_client:Retrying request to /embeddings in 0.393398 seconds


In [19]:
from lightrag import QueryParam
# Ask the LightRAG instance a question; QueryParam(mode="mix") selects the retrieval mode.
# NOTE(review): what "mix" combines is defined by LightRAG and not visible here — check its docs.
result = await rag.aquery("What is the biggest minimal wage?", param=QueryParam(mode="mix"))
# Bare expression so the notebook displays the returned answer.
result

Rerank is enabled but no rerank model is configured. Please set up a rerank model or set enable_rerank=False in query parameters.


"### Overview of Minimum Wage Rates\n\nThe highest minimum wage in the United States is currently set in the **District of Columbia**, where it stands at **$17.95** per hour. This rate reflects the jurisdiction's commitment to ensuring fair compensation for workers.\n\n### State Comparisons\n\nFollowing the District of Columbia, **California** has one of the highest state minimum wages at **$16.50** per hour, joined closely by **Washington** at **$16.66** per hour. **Connecticut** also maintains a significant minimum wage at **$16.35** per hour, while **Oregon** sets its standard rate at **$15.05**, with higher rates for specific areas like the Portland Metro Area at **$16.30** per hour.\n\n### Additional Context\n\nDifferent states have varying rates influenced by local economic conditions and labor policies. As such, the minimum wage shows significant diversity across the country, emphasizing the ongoing efforts to adjust for economic changes and living costs.\n\n### References\n- [K

In [31]:
from pyvis.network import Network

# Create a pyvis network configured for notebook rendering.
# NOTE(review): no nodes or edges are added before show(), so the generated page
# contains an empty graph — presumably the LightRAG graph was meant to be loaded
# here first; confirm.
net = Network(notebook=True, height="750px", width="100%", cdn_resources='remote')
# Write the visualization to grafo_lightrag.html and display it in the notebook.
net.show("grafo_lightrag.html")


grafo_lightrag.html
