In [None]:
import regex as re
import os
from typing import List, Dict
import requests
import json

from pprint import pprint

# API key for the Groq chat-completions endpoint, read from the `groq_key`
# environment variable. `os.getenv` returns None when the variable is unset,
# so requests sent later would carry an invalid "Bearer None" header.
GROQ_API_KEY = os.getenv('groq_key')

# Diary text file to analyse. Set the DIARY_FILE_PATH environment variable to
# run the notebook on another machine; the original location stays the default.
FILE_PATH = os.getenv(
    'DIARY_FILE_PATH',
    r'C:\Users\fuedj\Documents\Code\RAG_Dr_Voss\llm-case-study-main\src\parte1_another.txt',
)

In [None]:
def load_txt(file_path: str) -> str:
    """Return the full contents of the UTF-8 text file at ``file_path``."""
    with open(file_path, encoding='utf-8') as source:
        contents = source.read()
    return contents

def chunk_text(text: str, chunk_size: int = 800) -> List[str]:
    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters."""
    pieces: List[str] = []
    for start in range(0, len(text), chunk_size):
        stop = start + chunk_size
        pieces.append(text[start:stop])
    return pieces

# Read the whole diary file into memory once; the day/paragraph chunking cell
# further down reuses `text`.
text = load_txt(FILE_PATH)

# Naive fixed-size split (800 characters per chunk by default).
# NOTE: `chunks` is rebound to the day/paragraph-based chunks in a later cell.
chunks = chunk_text(text)

In [None]:
def split_large_chunk(chunk: str, max_size: int = 8000) -> list[str]:
    """Split ``chunk`` into pieces of at most ``max_size`` characters.

    Each cut is made after the last '.' found within the first ``max_size``
    characters; if there is none, at the last space; if there is neither, the
    text is cut hard at exactly ``max_size`` characters. Pieces are stripped of
    surrounding whitespace and empty pieces are never returned.

    Args:
        chunk: Text to split.
        max_size: Maximum length of each returned piece; must be >= 1.

    Returns:
        The pieces in their original order. A ``chunk`` that already fits is
        returned unchanged as a one-element list; an empty ``chunk`` gives ``[]``.

    Raises:
        ValueError: If ``max_size`` is smaller than 1.
    """
    if max_size < 1:
        # A non-positive limit can never be satisfied and could loop forever.
        raise ValueError("max_size must be a positive integer")

    parts = []
    while len(chunk) > max_size:
        cut_index = chunk.rfind('.', 0, max_size)
        if cut_index == -1:
            cut_index = chunk.rfind(' ', 0, max_size)
        if cut_index == -1:
            # No period or space: hard cut. The slice below is inclusive of
            # cut_index, so max_size - 1 yields exactly max_size characters.
            cut_index = max_size - 1
        piece = chunk[:cut_index + 1].strip()
        if piece:  # a cut on a leading space would otherwise add an empty piece
            parts.append(piece)
        chunk = chunk[cut_index + 1:].strip()
    if chunk:
        parts.append(chunk)
    return parts

def _split_day_into_chunks(day_text: str, max_chunk_size: int) -> list[str]:
    """Split one day's text on blank lines, breaking paragraphs longer than ``max_chunk_size``."""
    pieces: list[str] = []
    for paragraph in day_text.strip().split('\n\n'):
        paragraph = paragraph.strip()
        if not paragraph:
            continue
        if len(paragraph) > max_chunk_size:
            # Pass the limit explicitly: the splitter's own default is far
            # larger than the threshold that triggers the split.
            pieces.extend(split_large_chunk(paragraph, max_size=max_chunk_size))
        else:
            pieces.append(paragraph)
    return pieces


def chunk_diary_by_day_and_paragraph(diary_text: str, max_chunk_size: int = 800) -> list[str]:
    """Chunk a diary by day header, then by paragraph, then by size.

    A new day starts at every line matching ``"<n>[st|nd|rd|th] Day of <Month>
    18xx - <text>"``. Text before the first header is treated as its own group.
    Inside each day, paragraphs are separated by a blank line, and paragraphs
    longer than ``max_chunk_size`` are split further with ``split_large_chunk``.
    The same rule is applied to every day, not only to the last one.

    Args:
        diary_text: Full diary text.
        max_chunk_size: Maximum length of a returned chunk, in characters.

    Returns:
        Non-empty chunks in document order.
    """
    date_pattern = r"(\d{1,2})(?:st|nd|rd|th)? Day of ([A-Za-z]+) (18\d{2}) - ([A-Za-z\s]+)"
    chunks: list[str] = []
    current_day_lines: list[str] = []

    for line in diary_text.splitlines():
        if re.match(date_pattern, line) and current_day_lines:
            # A new day begins: flush everything collected for the previous one.
            chunks.extend(_split_day_into_chunks("\n".join(current_day_lines), max_chunk_size))
            current_day_lines = []
        current_day_lines.append(line)

    # Flush the last day (or the whole text if no header was found).
    if current_day_lines:
        chunks.extend(_split_day_into_chunks("\n".join(current_day_lines), max_chunk_size))

    return [chunk for chunk in chunks if chunk]

# Re-chunk the diary by day headers and blank-line-separated paragraphs.
list_chunk = chunk_diary_by_day_and_paragraph(text)

# Rebind `chunks` so the cells below work on these chunks instead of the
# fixed-size ones produced by `chunk_text` earlier.
chunks = list_chunk
len(chunks)  # evaluates the number of chunks

In [None]:
import aiohttp
import asyncio
import json
from typing import List, Dict, Optional, Tuple

class SmartAgentSystem:
    """Answer one question over a list of text chunks using two kinds of LLM calls.

    "Expert" calls (``expert_agent``) analyse a single chunk each. The "central"
    call (``central_agent``) reviews the collected expert findings and either
    produces an answer or asks for more chunks to be analysed. Use the instance
    as an async context manager so the HTTP session is opened and closed.

    Public attributes:
        all_chunks: The chunk list given to the constructor (never modified).
        processed_chunks: Indices of chunks whose expert call returned a valid result.
        partial_answers: Expert results whose reported relevance is above 0.4.
        session: The ``aiohttp.ClientSession`` while inside ``async with``, else None.
    """

    API_URL = "https://api.groq.com/openai/v1/chat/completions"
    MODEL = "qwen-qwq-32b"
    # Number of not-yet-analysed chunks taken when no usable ids were requested.
    BATCH_SIZE = 2
    # Consecutive failed central calls tolerated before giving up with an error result.
    MAX_CENTRAL_FAILURES = 3

    def __init__(self, api_key: str, question: str, all_chunks: List[str]):
        self.api_key = api_key
        self.question = question
        self.all_chunks = all_chunks
        self.processed_chunks = set()
        self.partial_answers = []
        self.session = None
        # Indices already handed to an expert call (successful or not), so the
        # same chunk is never sent twice.
        self._dispatched_chunks = set()
        self._central_failures = 0

    async def __aenter__(self):
        self.session = aiohttp.ClientSession()
        return self

    async def __aexit__(self, *args):
        if self.session is not None:
            await self.session.close()
            self.session = None

    async def _chat_json(self, system_prompt: str, user_prompt: str,
                         temperature: float, timeout_s: float):
        """POST one chat-completion request and return the model's message parsed as JSON.

        Raises whatever the HTTP layer raises (including on non-2xx status),
        ``KeyError`` if the response lacks the expected fields, and
        ``json.JSONDecodeError`` if the message content is not valid JSON.
        """
        async with self.session.post(
            self.API_URL,
            headers={"Authorization": f"Bearer {self.api_key}"},
            json={
                "model": self.MODEL,
                "messages": [{
                    "role": "system",
                    "content": system_prompt
                }, {
                    "role": "user",
                    "content": user_prompt
                }],
                "temperature": temperature,
                "response_format": {"type": "json_object"}
            },
            # aiohttp expects a ClientTimeout object rather than a bare number.
            timeout=aiohttp.ClientTimeout(total=timeout_s)
        ) as response:
            response.raise_for_status()
            data = await response.json()
        return json.loads(data['choices'][0]['message']['content'])

    def _used_chunk_ids(self) -> List[int]:
        """Return the chunk ids of all retained partial answers."""
        return [ans['metadata']['chunk_id'] for ans in self.partial_answers]

    def _register_central_failure(self) -> Optional[Dict]:
        """Count a failed central call; return an error result once the limit is reached."""
        self._central_failures += 1
        if self._central_failures >= self.MAX_CENTRAL_FAILURES:
            return {
                "status": "error",
                "answer": None,
                "confidence": 0.0,
                "used_chunks": self._used_chunk_ids()
            }
        return None

    async def expert_agent(self, chunk: str, chunk_id: int) -> Optional[Dict]:
        """Analyse one chunk with the LLM.

        Args:
            chunk: Text of the chunk.
            chunk_id: Index of the chunk in ``all_chunks``.

        Returns:
            The parsed JSON result, or None if the request or validation failed
            (the error is printed, not raised). On success ``chunk_id`` is added
            to ``processed_chunks`` and, if the reported relevance is above 0.4,
            the result is appended to ``partial_answers``.
        """
        try:
            prompt = f"""Analyze this text regarding: {self.question}
            
            Required JSON format:
            {{
                "analysis": {{
                    "key_findings": ["list", "of", "findings"],
                    "missing_info": ["list", "of", "gaps"]
                }},
                "metadata": {{
                    "chunk_id": {chunk_id},
                    "relevance": 0.0-1.0,
                    "needs_more_context": true/false
                }}
            }}

            Text: {chunk}"""

            result = await self._chat_json(
                "You must respond in valid JSON format exactly as specified.",
                prompt,
                temperature=0.1,
                timeout_s=20
            )

            print(f"Expert Agent Decision: {result}")

            # Validate response structure
            required_keys = {'analysis', 'metadata'}
            if not isinstance(result, dict) or not required_keys.issubset(result):
                raise ValueError("Missing required keys in response")

            self.processed_chunks.add(chunk_id)
            # Keep our own id: the model only echoes it and may get it wrong.
            result['metadata']['chunk_id'] = chunk_id
            if result['metadata']['relevance'] > 0.4:
                self.partial_answers.append(result)
            return result

        except json.JSONDecodeError as e:
            print(f"JSON Error in chunk {chunk_id}: {str(e)}")
        except KeyError as e:
            print(f"Missing key in chunk {chunk_id} response: {str(e)}")
        except Exception as e:
            print(f"Error processing chunk {chunk_id}: {str(e)}")
        return None

    async def central_agent(self) -> Optional[Dict]:
        """Run one evaluation round over the partial answers collected so far.

        Returns:
            * ``{"status": "success", ...}`` when the model reports the answer as
              complete or its confidence is above 0.6;
            * ``None`` when more chunks were sent to the experts, or after a
              failed call that may be retried; the caller should call again;
            * ``{"status": "incomplete", ...}`` when the answer is still
              incomplete but no unanalysed chunk is left;
            * ``{"status": "error", ...}`` after ``MAX_CENTRAL_FAILURES``
              consecutive failed calls.

            Every returned dict has the keys ``status``, ``answer``,
            ``confidence`` and ``used_chunks``.
        """
        print(f"Partial Answers: {self.partial_answers}")

        try:
            context = "\n".join(
                f"Chunk {ans['metadata']['chunk_id']}:\n"
                f"Findings: {ans['analysis'].get('key_findings', [])}\n"
                f"Missing: {ans['analysis'].get('missing_info', [])}"
                for ans in self.partial_answers
            )

            prompt = f"""Evaluate these analyses about: {self.question}
            
            Current Analysis:
            {context}
            
            Required JSON response:
            {{
                "status": "complete" or "incomplete",
                "answer": "summary text" or null,
                "requested_chunks": [list, of, ids] or null,
                "confidence": 0.0-1.0
            }}"""

            decision = await self._chat_json(
                "You must provide valid JSON. Double-check your output before returning.",
                prompt,
                temperature=0.2,
                timeout_s=25
            )

            print(f"Central Agent Decision: {decision}")

            # Validate central agent response
            expected_keys = ["status", "answer", "requested_chunks", "confidence"]
            if not isinstance(decision, dict) or not all(k in decision for k in expected_keys):
                raise ValueError("Invalid central agent response format")

            self._central_failures = 0

            confidence = decision["confidence"]
            # A null or non-numeric confidence counts as "not confident".
            is_confident = (
                isinstance(confidence, (int, float))
                and not isinstance(confidence, bool)
                and confidence > 0.6
            )
            if decision["status"] == "complete" or is_confident:
                return {
                    "status": "success",
                    "answer": decision["answer"],
                    "confidence": confidence,
                    "used_chunks": self._used_chunk_ids()
                }

            dispatched_before = len(self._dispatched_chunks)
            await self.process_additional_chunks(decision["requested_chunks"])
            if len(self._dispatched_chunks) == dispatched_before:
                # Nothing new could be analysed: asking again cannot change the
                # outcome, so hand back the best answer available.
                return {
                    "status": "incomplete",
                    "answer": decision["answer"],
                    "confidence": confidence,
                    "used_chunks": self._used_chunk_ids()
                }
            return None

        except json.JSONDecodeError as e:
            print(f"JSON Error: {str(e)}")
        except Exception as e:
            print(f"Central agent error: {str(e)}")
        return self._register_central_failure()

    async def process_additional_chunks(self, chunk_ids: List[int], first=False):
        """Send more chunks to the expert agent, concurrently.

        Args:
            chunk_ids: Requested indices into ``all_chunks``. Entries that are
                not integers, are out of range, are duplicated, or were already
                dispatched are ignored. If nothing usable remains (including
                ``None``), the next ``BATCH_SIZE`` not-yet-dispatched chunks in
                document order are used instead.
            first: Unused; kept so existing callers keep working.

        Does nothing when every chunk has already been dispatched.
        """
        total = len(self.all_chunks)
        requested = chunk_ids if isinstance(chunk_ids, (list, tuple)) else []

        selected: List[int] = []
        for cid in requested:
            is_index = isinstance(cid, int) and not isinstance(cid, bool)
            if (is_index and 0 <= cid < total
                    and cid not in self._dispatched_chunks
                    and cid not in selected):
                selected.append(cid)

        if not selected:
            pending = [cid for cid in range(total) if cid not in self._dispatched_chunks]
            selected = pending[:self.BATCH_SIZE]

        if not selected:
            return

        self._dispatched_chunks.update(selected)
        chunks_to_process = [self.all_chunks[cid] for cid in selected]

        print(f"\nChunks to Process: {len(chunks_to_process)}")
        print("Parte de um Chunk:")
        previews = [chunk[:30] for chunk in chunks_to_process]
        print(previews[:30])

        remaining = [
            self.all_chunks[cid] for cid in range(total)
            if cid not in self._dispatched_chunks
        ]
        print(f"All Chunks: {remaining[:50]}")

        # Each expert call gets the chunk's real index, so ids stay unique
        # across batches.
        await asyncio.gather(*(
            self.expert_agent(self.all_chunks[cid], cid) for cid in selected
        ))

async def analyze_with_feedback(api_key: str, question: str, chunks: List[str], max_rounds: int = 10):
    """Run the expert/central agent pipeline and return its outcome.

    The first chunks are analysed, then the central agent is asked for a
    decision. While it returns nothing (more chunks requested, or a failed
    call) it is asked again, at most ``max_rounds`` times in total, so a bad
    API key or an outage cannot keep the loop running forever.

    Args:
        api_key: Groq API key.
        question: Question to answer from the chunks.
        chunks: Text chunks to analyse.
        max_rounds: Maximum number of central-agent calls (at least one is made).

    Returns:
        A dict with ``status``, ``answer`` and ``used_chunks`` (the indices in
        ``processed_chunks``). ``status`` is whatever the central agent
        reported, or ``"failed"`` with ``answer`` None if no round produced a
        result.
    """
    async with SmartAgentSystem(api_key, question, chunks) as system:
        # Analyse the initial chunks
        await system.process_additional_chunks([0, 1, 2], first=True)

        # Ask the central agent for a decision, a bounded number of times
        result = None
        for _ in range(max(1, max_rounds)):
            result = await system.central_agent()
            if result:
                break

        print(f"Result: {result}")

        if not result:
            return {
                "status": "failed",
                "answer": None,
                "used_chunks": list(system.processed_chunks)
            }

        # Return the result whatever its status is
        return {
            "status": result["status"],
            "answer": result.get("answer"),
            "used_chunks": list(system.processed_chunks)
        }
        

# Usage example:
async def main():
    """Run the pipeline on the sample moonpetal question and print the result as JSON."""
    question = (
        "\n"
        "    Based on the diary entries, describe the uses of moonpetal by the people of Oakhaven and what the prevalence of this flower suggests about the local culture and economy.\n"
        "    "
    )
    outcome = await analyze_with_feedback(GROQ_API_KEY, question, chunks)

    print("\n=== Resultado Final ===")
    rendered = json.dumps(outcome, indent=2, ensure_ascii=False)
    print(rendered)

# Run in Jupyter (top-level `await` is not valid in a plain Python script):
await main()