In [53]:
import os
import sys
import time

import numpy as np
import json
import regex as re

from typing import Dict

# Working directory of the notebook kernel (a .py script would use
# os.path.dirname(__file__) instead).
notebook_dir = os.getcwd()

# The 'src' package is assumed to live one level above the notebook, so the
# project root is the parent of the working directory. Adjust the number of
# os.pardir components to match the project layout; if 'src' sits next to the
# notebook, use `project_root = notebook_dir` instead.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Put the project root at the front of sys.path exactly once, so re-running
# this cell does not add duplicate entries.
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

In [54]:
from src.classe_gemini import GeminiApiClient

In [55]:
import json
from typing import List, Dict
import statistics

# Suponha que gemini_client esteja definido com generate_multimodal_content e extract_text_from_response

def _parse_evaluation_text(evaluation_text):
    """Pull the similarity score and explanation out of the judge model's reply.

    The prompt asks for bullet lines ("- Similarity Score: ..." and
    "- Explanation: ..."), so labels are matched after stripping leading list
    markers and markdown bold markers, and case-insensitively.

    Args:
        evaluation_text: Raw text returned by the model (may be None or empty).

    Returns:
        tuple: (score, explanation) where score is a float, or None when no
        numeric score could be found, and explanation is a str, or None when
        no explanation was found.
    """
    score = None
    explanation_lines = []
    in_explanation = False

    for raw_line in (evaluation_text or "").splitlines():
        # Drop bullet/quote decoration and bold markers around the label.
        line = raw_line.strip().lstrip("-*•> \t").replace("**", "").strip()
        lowered = line.lower()

        if lowered.startswith("similarity score:"):
            in_explanation = False
            number = re.search(r"[-+]?\d*\.?\d+", line.split(":", 1)[1])
            if number:
                score = float(number.group())
        elif lowered.startswith("explanation:"):
            in_explanation = True
            explanation_lines = [line.split(":", 1)[1].strip()]
        elif in_explanation and line:
            # Explanations may wrap over several lines; keep the continuation.
            explanation_lines.append(line)

    explanation = " ".join(part for part in explanation_lines if part)
    return score, (explanation or None)


def evaluate_answers(
    answers_dict: List[Dict[str, str]],
    model_name: str = "gemini-1.5-pro",
    gemini_client=None,
) -> Dict:
    """
    Score each generated answer against its expected answer using a Gemini model as judge.

    Args:
        answers_dict (List[Dict[str, str]]): List of dicts with the keys 'query',
            'llm_answer' and 'actual_answer'. Missing keys default to ''.
        model_name (str): Gemini model used for the evaluation (default: 'gemini-1.5-pro').
        gemini_client: Optional client object exposing generate_multimodal_content(model_name,
            prompt_parts) and extract_text_from_response(response_data). When omitted, a
            GeminiApiClient is created from the GOOGLE_API_KEY environment variable.

    Returns:
        Dict: {
            "evaluations": one dict per query with 'query', 'llm_answer', 'actual_answer',
                'similarity_score', 'explanation' and 'exact_match',
            "average_similarity": mean of all similarity scores (0.0 when there are none),
            "exact_match_count": number of case-insensitive, whitespace-trimmed exact matches,
            "total_queries": len(answers_dict),
        }

    Raises:
        ValueError: If no gemini_client is given and GOOGLE_API_KEY is not set. This is
            raised instead of calling exit(), which is not appropriate inside a notebook
            function and hides the failure from the caller.
    """
    # --- Initialise the Gemini API client (only when the caller did not supply one) ---
    if gemini_client is None:
        api_key = os.environ.get("GOOGLE_API_KEY")
        if not api_key:
            raise ValueError("A variável de ambiente 'GOOGLE_API_KEY' não está definida.")
        gemini_client = GeminiApiClient(api_key=api_key)

    evaluation_results = {
        "evaluations": [],
        "average_similarity": 0.0,
        "exact_match_count": 0,
        "total_queries": len(answers_dict)
    }

    for entry in answers_dict:
        query = entry.get('query', '')
        llm_answer = entry.get('llm_answer', '')
        actual_answer = entry.get('actual_answer', '')

        # Exact match: case-insensitive, ignoring surrounding whitespace; two
        # empty (or non-string) answers never count as a match.
        is_exact_match = (
            isinstance(llm_answer, str)
            and isinstance(actual_answer, str)
            and bool(llm_answer and actual_answer)
            and llm_answer.strip().lower() == actual_answer.strip().lower()
        )
        if is_exact_match:
            evaluation_results["exact_match_count"] += 1

        # Build the judge prompt for this query.
        system_prompt = f"""
            You are an evaluator in a fantasy world context. Your task is to compare two answers to a question and provide a semantic similarity score between 0 and 1, where:
            - 1 means the answers are semantically identical or convey the same meaning.
            - 0 means the answers are completely different.
            - Partial similarity should be scored between 0 and 1 (e.g., 0.8 for very similar answers with minor differences).

            Input format:
            - Question: {query}
            - Generated Answer: {llm_answer}
            - Expected Answer: {actual_answer}

            Output format:
            - Similarity Score: [number between 0 and 1]
            - Explanation: [brief explanation of the similarity score]
        """

        prompt_parts = [{"text": system_prompt}]

        # Defaults are used when the reply cannot be parsed or the call fails.
        record = {
            "query": query,
            "llm_answer": llm_answer,
            "actual_answer": actual_answer,
            "similarity_score": 0.0,
            "explanation": "No explanation provided.",
            "exact_match": is_exact_match
        }

        # Call the Gemini API. A failure on one query must not abort the whole
        # run, so any exception is recorded on that query's entry instead.
        try:
            response_data = gemini_client.generate_multimodal_content(model_name, prompt_parts)
            evaluation_text = gemini_client.extract_text_from_response(response_data)

            score, explanation = _parse_evaluation_text(evaluation_text)
            if score is None:
                print(f"Erro ao extrair pontuação para query: {query}")
            else:
                record["similarity_score"] = score
            if explanation:
                record["explanation"] = explanation

        except Exception as e:
            print(f"Erro ao avaliar query '{query}': {e}")
            record["similarity_score"] = 0.0
            record["explanation"] = f"Erro na avaliação: {str(e)}"

        evaluation_results["evaluations"].append(record)

    # Average the similarity scores (left at 0.0 when nothing was evaluated).
    scores = [e["similarity_score"] for e in evaluation_results["evaluations"]]
    if scores:
        evaluation_results["average_similarity"] = statistics.mean(scores)

    return evaluation_results

    

In [56]:
# Load answers_dict from a JSON file, falling back to a small built-in sample.
# The location can be overridden with the ANSWERS_DICT_PATH environment
# variable; the default is the original author's machine-specific path, which
# will not exist on other machines.
answers_path = os.environ.get(
    "ANSWERS_DICT_PATH",
    r'C:\Users\fuedj\Documents\Code\RAG_Dr_Voss_v2\drvossv2\data\answers_dict.json',
)

try:
    with open(answers_path, 'r', encoding='utf-8') as f:
        answers_dict = json.load(f)
except FileNotFoundError:
    print("Arquivo answers_dict.json não encontrado. Usando dados de exemplo.")
    # Sample records with the three keys evaluate_answers reads from each entry.
    answers_dict = [
        {
            'query': "Who is the current Grand Chancellor of Veridia?",
            'llm_answer': "Queen Isolde",
            'actual_answer': "Queen Isolde"
        },
        {
            'query': "What is Zelphar stew?",
            'llm_answer': "A traditional Veridian dish that warms the soul",
            'actual_answer': "A traditional Veridian dish"
        }
    ]

In [None]:
# Run the Gemini-based evaluation over every entry in answers_dict.
evaluation_results = evaluate_answers(
    answers_dict,
    model_name="gemini-1.5-pro",
)


In [None]:

# Print the per-query evaluation details.
print("\nResultados da Avaliação:")

for eval_entry in evaluation_results["evaluations"]:
    print(f"Query: {eval_entry['query']}")
    print(f"LLM Answer: {eval_entry['llm_answer']}")
    print(f"Actual Answer: {eval_entry['actual_answer']}")
    print(f"Similarity Score: {eval_entry['similarity_score']}")
    print(f"Explanation: {eval_entry['explanation']}")
    # Read with .get(): an entry may not carry an 'exact_match' key, and a
    # direct lookup would abort the whole listing with a KeyError.
    print(f"Exact Match: {eval_entry.get('exact_match', 'N/A')}")
    print("---------------------------------")

# Optional: persist the evaluation results as JSON in the working directory.
with open('evaluation_results.json', 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, ensure_ascii=False, indent=4)
print("Resultados da avaliação salvos em evaluation_results.json")


Resultados da Avaliação:


TypeError: 'NoneType' object is not subscriptable

In [None]:
# Bare expression: shows the evaluation_results object as this cell's output.
evaluation_results

In [None]:
# Captured raw Gemini response, kept as a fixture for checking the text extraction below.
response_data = {'candidates': [{'content': {'parts': [{'text': '- Similarity Score: 0.2\n- Explanation: The expected answer is extremely concise and only mentions "Hair."  While the generated answer *does* contain information about the doctor\'s hair (auburn/reddish-brown, soft wave), it provides much more detail about the doctor\'s appearance than requested.  Since the question asked what the doctor *looks like* generally, and the expected answer seems to be focusing on a specific feature (perhaps as a shorthand or in a testing scenario where only the hair color or style was relevant), the generated response, while correct in describing the hair, is mostly superfluous information. Therefore, the similarity is low.  Had the question been more specific about hair, or the expected answer included more features, the score would be higher.\n'}], 'role': 'model'}, 'finishReason': 'STOP', 'avgLogprobs': -0.4062601089477539}], 'usageMetadata': {'promptTokenCount': 188, 'candidatesTokenCount': 160, 'totalTokenCount': 348, 'promptTokensDetails': [{'modality': 'TEXT', 'tokenCount': 188}], 'candidatesTokensDetails': [{'modality': 'TEXT', 'tokenCount': 160}]}, 'modelVersion': 'gemini-1.5-pro-002', 'responseId': 'z8FUaNboEqqO7dcP7s2EyAY'}

# Walk candidates -> content -> parts, printing nothing if any level is missing or empty.
candidates = response_data.get("candidates", [])
if candidates:
    first_candidate = candidates[0]
    content = first_candidate.get("content", {})
    parts = content.get("parts", [])
    if parts:
        # The generated text is taken from the 'text' field of the first part only;
        # a response with several parts would need extra handling.
        first_part = parts[0]
        print(first_part.get("text"))

- Similarity Score: 0.2
- Explanation: The expected answer is extremely concise and only mentions "Hair."  While the generated answer *does* contain information about the doctor's hair (auburn/reddish-brown, soft wave), it provides much more detail about the doctor's appearance than requested.  Since the question asked what the doctor *looks like* generally, and the expected answer seems to be focusing on a specific feature (perhaps as a shorthand or in a testing scenario where only the hair color or style was relevant), the generated response, while correct in describing the hair, is mostly superfluous information. Therefore, the similarity is low.  Had the question been more specific about hair, or the expected answer included more features, the score would be higher.

