In [1]:
import requests as req
from bs4 import BeautifulSoup
import unicodedata
import re
import tiktoken
from google import genai
from dotenv import load_dotenv
import os
import uuid
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
BASE_URL = 'https://www.dol.gov'

load_dotenv()
 
api_key = os.getenv("GOOGLE_API_KEY")

client = genai.Client(api_key=api_key)

In [3]:
def get_html(url):
    """Fetch HTML content from a given URL."""
    response = req.get(url)
    response.raise_for_status()  # Raise an error for bad responses
    return response.text

def search_for_link(html):
    """Parse HTML and extract relevant data."""
    data = []
    for link in html.find_all('a', href=True):
        data.append({
            'text': link.get_text(strip=True),
            'url': BASE_URL + link['href'] if link['href'].startswith('/') else link['href']
        })
    
    return data

def adjust_text(soup, link):
    """Substitui footnotes <a href="#..."> pelo texto do rodapé sem o número."""
    if not link:
        return soup.get_text(" ", strip=True)

    url = link[0]['url']
    href_html = get_html(url)
    href_soup = BeautifulSoup(href_html, "html.parser")

    if '#' in url:
        footnote_id = url.split('#')[-1]
        div = href_soup.find('div', {'id': footnote_id})
        if div:
            for sup in div.find_all("sup"):
                sup.decompose()
            footnote_text = div.get_text(strip=True)

            for a in soup.find_all("a", href=True):
                if footnote_id in a["href"]:
                    a.replace_with(" " + footnote_text + " ")

    return soup.get_text(" ", strip=True)


def normalize_text(text):
    """Normaliza caracteres, remove espaços extras e caracteres de controle."""
    text = unicodedata.normalize("NFKC", text)
    text = "".join(ch for ch in text if unicodedata.category(ch)[0] != "C")
    text = " ".join(text.split())
    return text.strip()




__Get HTML__

## Check if Guam, Northen Mariana Island, Puerto Rico, Virgin Island, America Samoa need to be 

In [43]:
html = get_html(f"{BASE_URL}/agencies/whd/minimum-wage/state")
soup = BeautifulSoup(html, "html.parser")

cat = soup.find('div', {"id": "states"})
state = cat.find_all('div')

docs = {}

for state_name in state:
    if state_name.get('id').lower() == 'as': # America Samoa is a special case
        continue
    link = search_for_link(state_name)

    # pega os irmãos após o <h2> (já como soup)
    siblings = state_name.h2.find_next_siblings()
    temp_soup = BeautifulSoup("".join(str(s) for s in siblings), "html.parser")

    # 1. Substitui footnotes
    clean_text = adjust_text(temp_soup, link)

    # 2. Normaliza texto
    norm_text = normalize_text(clean_text)
    norm_text = f"For the state of the {state_name.h2.text} the laws of mimum wage is: " + norm_text
    doc_id = str(uuid.uuid4())
    docs[doc_id] = {"text": norm_text, "metadata": {"state": state_name.h2.text, 'text':norm_text}}
   
    
docs

{'348ced6f-e86c-43ee-9613-123ebd415cf9': {'text': 'For the state of the Alabama the laws of mimum wage is: No state minimum wage law. Employers subject to the Fair Labor Standards Act must pay the current Federal minimum wage of $7.25 per hour.',
  'metadata': {'state': 'Alabama',
   'text': 'For the state of the Alabama the laws of mimum wage is: No state minimum wage law. Employers subject to the Fair Labor Standards Act must pay the current Federal minimum wage of $7.25 per hour.'}},
 'f5a2c308-f379-4e97-a9b0-8fc217747916': {'text': "For the state of the Alaska the laws of mimum wage is: Basic Minimum Rate (per hour): $13.00 Premium Pay After Designated Hours The overtime premium rate is one and one-half times the employee's regular rate, unless otherwise specified. : Daily - 8, Weekly - 40 Under a voluntary flexible work hour plan approved by the Alaska Department of Labor, a 10 hour day, 40 hour workweek may be instituted with premium pay after 10 hours a day. The premium overtime

In [5]:
import pinecone
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

In [6]:
api_key = os.getenv("PINECONE_API")
pc = Pinecone(api_key=api_key)
index_name = 'firtsindex'
indexes = pc.list_indexes()
index_names = [index.name for index in indexes]
if index_name not in index_names:
    pc.create_index(name=index_name,
                dimension=384,
                spec=ServerlessSpec(cloud='aws',region='us-east-1'),
                )


In [7]:
index = pc.Index('firstindex')
model = SentenceTransformer('all-MiniLM-L6-v2')

In [45]:
def add_to_pinecone(doc, model, index):
    for id, state in doc.items():
        embeddings = model.encode(state['text']).tolist()

        pinecone_data = {
            "id": id,
            "values": embeddings,
            "metadata": state["metadata"]
        }

        index.upsert(vectors=[pinecone_data])
add_to_pinecone(docs,model, index)

In [33]:
def semantic_search(index, query, n_results):
    query_embedding = model.encode([query]).tolist()[0]
    results = index.query(vector=query_embedding,
                          top_k=n_results,
                          include_metadata=True)
    return results



In [29]:
from groq import Groq
groq_api_key = os.getenv("GROQ_API")
client_groq = Groq(api_key=groq_api_key)

In [53]:
query = 'What is the minimum wage for Arizona?'

def get_context(results):
    return "\n\n".join([match['metadata']['text'] for match in results['matches']])

def get_prompt(context: str, query: str):
    prompt = f"""Based on the following context and conversation history, 
        please provide a relevant and contextual response. 
        If the answer cannot be derived from the context, only use the conversation history 
        or say "I cannot answer this based on the provided information."

        Context from documents:
        {context}

        Human: {query}

        Assistant:"""

    return prompt

def generate_response(query: str, context: str):
    """Generate a response using Groq's Llama-3.1-8b-instant with conversation history"""
    # Construct the prompt
    prompt = get_prompt(context, query)

    try:
        # Create the chat completion request
        completion = client_groq.chat.completions.create(
            model="llama-3.1-8b-instant",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions."},
                {"role": "user", "content": prompt}
            ],
            temperature=1,
            max_tokens=1024,
            top_p=1,
            stream=True
        )


        # Process and return the streamed response
        response_content = []
        for chunk in completion:
            content = chunk.choices[0].delta.content or ""
            response_content.append(content)

        # Combine the response into a single string
        response = "".join(response_content)
        return response

    except Exception as e:
        return f"Error generating response: {str(e)}"


In [49]:
def rag_query(pinecone_index, query: str, n_chunks: int = 2):
    """Perform RAG query: retrieve relevant chunks and generate an answer using Pinecone."""
    # Retrieve relevant chunks using Pinecone
    results = semantic_search(pinecone_index, query, n_chunks)
    context = get_context(results)

    # Generate response using Groq
    response = generate_response(query, context)

    return response


In [58]:
query = 'What is the minimal wage for California? Have any extra points?'
response = rag_query(index, query)

print('Pergunta: ', query)

print('Reposta: ', response)

Pergunta:  What is the minimal wage for California? Have any extra points?
Reposta:  The basic minimum wage in California is $16.50 per hour. Additionally, for any work in excess of eight hours in one workday, in excess of 40 hours in one workweek, or in the first eight hours worked on the seventh day of work in any one workweek, the rate shall be at one and one-half times the regular rate of pay (time and a half). For any work in excess of 12 hours in one day or in excess of eight hours on any seventh day of a workweek, the rate shall be paid no less than twice the regular rate of pay (double time).

To break it down:

- Up to 8 hours in a workday: $16.50
- 8-12 hours in a workday (time and a half): $16.50 * 1.5 = $24.75
- Over 12 hours in a workday (double time): $16.50 * 2 = $33.00

- Up to 40 hours in a workweek: $16.50
- 40-48 hours in a workweek (time and a half): $16.50 * 1.5 = $24.75
- Over 48 hours in a workweek (double time): $16.50 * 2 = $33.00

- Seventh day of the workweek