# Voyage

In [58]:
import os
import sys
import time

import numpy as np
import json
import regex as re

from typing import Dict

# Directory the notebook is executed from (a .py script would use
# os.path.dirname(__file__) instead).
notebook_dir = os.getcwd()

# The project root is taken to be the parent of the notebook directory, so the
# 'src' package located there becomes importable. Adjust the relative part to
# match the project layout; if 'src' sits next to the notebook, use
# `project_root = notebook_dir` instead.
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Put the project root at the front of the import search path, only once.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [None]:
from src.interact_database_sql import get_all_days_content
from src.voyage_emb import get_voyage_embeddings, voyage_rerank

In [None]:
# Step 1: fetch only the 'content_without_image' and 'content_image_described'
# fields for every stored record.
results = get_all_days_content(
    fields=['content_without_image', 'content_image_described'],
)

In [None]:
# One text per record: use 'content_image_described' when it has a value and
# fall back to 'content_without_image' otherwise. Truthiness is used so that a
# missing value (None) is treated like the empty string instead of ending up
# in `content`.
content = [
    result['content_image_described'] or result['content_without_image']
    for result in results
]

In [None]:
# Display how many texts were collected (one per record in `results`).
len(content)

In [None]:
# First import the chunker you want from Chonkie
from chonkie import SemanticChunker, RecursiveChunker

# Initialize the chunker with its default settings.
# Alternative kept for reference:
# chunker = RecursiveChunker()
chunker = SemanticChunker()

In [None]:
# Run the chunker over every text in `content` and pool the text of all
# resulting pieces into a single flat list.
chunks = []
for day in content:
    chunks.extend(piece.text for piece in chunker(day))

In [None]:
# Length, in characters, of the longest chunk; 0 when there are no chunks.
# max() compares against every chunk, so the result is the true maximum and
# not just the length of the last chunk visited.
max_size = max((len(chunk) for chunk in chunks), default=0)

In [None]:
# Display the chunk-length figure computed in the previous cell.
max_size

In [None]:
# Pair every chunk text with a slot for its embedding; the empty string is a
# placeholder that is overwritten once the embedding has been computed.
chunks_emb: list[dict] = [
    {'chunk': text, 'embedding': ''}
    for text in chunks
]

In [None]:
# Display the first three records as a quick sanity check.
chunks_emb[:3]

In [None]:
# Compute an embedding for every chunk, one Voyage request per chunk, and store
# it in place under the record's 'embedding' key.
# (A Model2Vec alternative was `embedding = model.encode(chunk_text)`.)
for entry in chunks_emb:
    vector = get_voyage_embeddings(entry['chunk'])
    # Short pause between consecutive requests (presumably to stay under the
    # API rate limit - confirm).
    time.sleep(0.05)
    entry['embedding'] = vector
    

# # Make sequences of token embeddings
# token_embeddings = model.encode_as_sequence(["It's dangerous to go alone!", "It's a secret to everybody."])

In [62]:
def calculate_cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        vec1: First vector.
        vec2: Second vector, with the same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms, or ``0.0`` when either vector has zero norm.
    """
    numerator = np.dot(vec1, vec2)
    magnitude_a, magnitude_b = np.linalg.norm(vec1), np.linalg.norm(vec2)

    # A zero vector has no direction; report "no similarity" instead of
    # dividing by zero.
    if magnitude_a != 0 and magnitude_b != 0:
        return numerator / (magnitude_a * magnitude_b)
    return 0.0

def semantic_search(query: str, indexed_chunks: list[dict], model = None, k = 10) -> list[dict]:
    """Rank indexed chunks by cosine similarity to a query.

    The query is embedded with ``get_voyage_embeddings`` and compared against
    the embedding stored with each chunk.

    Args:
        query (str): The query string.
        indexed_chunks (list[dict]): Records holding the chunk text under
            ``'chunk'`` and its embedding under ``'embedding'``.
        model: Accepted for compatibility; not used by this function.
        k: Maximum number of results to return.

    Returns:
        list[dict]: At most ``k`` records sorted by decreasing similarity,
        each with the keys ``'chunk'``, ``'similarity'`` and ``'embedding'``.
    """
    query_vector = get_voyage_embeddings(query)

    scored = []
    for entry in indexed_chunks:
        text, vector = entry['chunk'], entry['embedding']

        # Embeddings stored as plain sequences are converted to numpy arrays.
        vector = vector if isinstance(vector, np.ndarray) else np.array(vector)

        scored.append({
            'chunk': text,
            'similarity': calculate_cosine_similarity(query_vector, vector),
            'embedding': vector,  # kept alongside the score for debugging
        })

    # Best matches first, truncated to the k most similar.
    ranked = sorted(scored, key=lambda hit: hit['similarity'], reverse=True)
    return ranked[:k]

In [63]:
# --- Inicializa o cliente Gemini API ---
from src.classe_gemini import GeminiApiClient
# Build the Gemini client from the 'GOOGLE_API_KEY' environment variable, which
# must be set to the API key before this cell runs.
try:
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("A variável de ambiente 'GOOGLE_API_KEY' não está definida.")

    gemini_client = GeminiApiClient(api_key=api_key)
except ValueError as e:
    print(f"Erro de configuração da API: {e}")
    # Stop with a non-zero status: nothing further can run without the client.
    # sys.exit is used instead of the bare exit() helper, which is only
    # installed by the `site` module for interactive use and, called without
    # arguments, reports a successful exit.
    sys.exit(1)

In [92]:
# Question -> expected-answer pairs used to evaluate the pipeline.
# NOTE: this is an absolute, machine-specific path.
json_file = r"C:\Users\fuedj\Documents\Code\RAG_Dr_Voss_v2\drvossv2\data\unit_qa.json"

with open(json_file, mode='r', encoding='utf-8') as qa_handle:
    jsonfile: Dict = json.load(qa_handle)

In [93]:
#jsonfile = {"How does the doctor look like?":"Hair"}

In [94]:
# Run every question of the QA set through retrieval, reranking and generation,
# printing each answer next to the expected one.
answers_dict = []  # one record per question for which Gemini returned text
k = 30             # number of candidates taken from the embedding search

model_name = 'gemini-1.5-pro'

for query, answer in jsonfile.items():
    # Stage 1: candidate chunks ranked by cosine similarity (at most k).
    search_results = semantic_search(query, chunks_emb, k=k)
    txt_results = [hit['chunk'] for hit in search_results]

    # Stage 2: rerank the candidate texts and keep the text of the first 10
    # rows. Rows unpack as (document, score, index) and are assumed to arrive
    # ordered by decreasing relevance score.
    sorted_rows = voyage_rerank(query=query, documents=txt_results)
    reranked = [doc for doc, score, index in sorted_rows[:10]]

    # Stage 3: ask Gemini to answer using the reranked passages as context.
    prompt = f"""
        You are answering a question from a fantasy world in a travel log journey of Doctor Voss, a woman visiting the capital of Veridia.
        Question: {query}
        Here is the context to help you answer: {reranked}.
        Bring the answer **only**. Example: 'The Veridian's Skys were blue most days.'
        If you don't know the answer, respond: 'NTD' - meaning nothing to disclosure.
        If you are not sure, respond: 'NS' - meaning not sure.
        """
    prompt_parts = [{"text": prompt}]

    response_data = gemini_client.generate_multimodal_content(model_name, prompt_parts)
    generated_text = gemini_client.extract_text_from_response(response_data)

    # Questions without an extractable answer are reported and left out.
    if not generated_text:
        print("\nNão foi possível extrair texto da resposta do Gemini.")
        continue

    print(f"********************************\nQuery: {query}")
    print(f"Gemini: {generated_text}")
    print(f"Actual Answer: {answer}")
    print("--------------------------------")

    answers_dict.append({
        'query': query,
        'llm_answer': generated_text,
        'actual_answer': answer
    })
    

********************************
Query: What is the official language of Veridia?
Gemini: Veridian

Actual Answer: The official language of Veridia is Veridian.
--------------------------------
********************************
Query: Which mountain range forms the northern border of Veridia?
Gemini: Aralith Mountains

Actual Answer: The Aralith Mountains form the northern border of Veridia.
--------------------------------
********************************
Query: What is the currency of Veridia called?
Gemini: Veridian Crown

Actual Answer: The currency of Veridia is called the Veridian Crown.
--------------------------------
********************************
Query: Which historical event is commemorated on Veridia's Independence Day?
Gemini: The Unification Accord of 1932.

Actual Answer: Veridia's Independence Day commemorates the signing of the Treaty of Syth in 1854.
--------------------------------
********************************
Query: What is the largest city in Veridia by popula

In [95]:
# Display the context passages built for the last question processed above.
reranked

["6th Day of Blossomtide 1855 - Reflections in Eldoria\n\nAs I strolled through the cobblestone streets of Eldoria today, echoes of last night's festival still lingering in my mind, I couldn't help but reflect on the area's rich history. Eldoria is particularly known for the Festival of Lights, which honors the historic unification of the northern kingdoms. The streets were still adorned with spirit lanterns, a unique aspect of the festival, used to honor ancestors. The gentle flicker of light seemed to whisper stories of the past as they swayed in the gentle breeze.\n\nMy explorations led me to a quaint bookstore where I came upon an old tome detailing the arduous paths through the Aralith Mountains, forming Veridia's northern border. Reading about the mountain's fierce beauty and the arduous adventures of explorers who paved the way through its treacherous paths filled my mind with the urge to witness it myself. Perhaps after my current engagements, a more physical journey than that 

In [96]:
# Persist the collected records as indented UTF-8 JSON, keeping non-ASCII
# characters unescaped.
# NOTE: the destination is an absolute, machine-specific path.
with open(r'C:\Users\fuedj\Documents\Code\RAG_Dr_Voss_v2\drvossv2\data\answers_dict.json', 'w', encoding='utf-8') as out_file:
    json.dump(answers_dict, out_file, indent=4, ensure_ascii=False)