# Chonkie

In [2]:
import os
import sys
import time

import numpy as np
import json
import regex as re

# Pega o diretório atual do notebook
notebook_dir = os.getcwd() # ou os.path.dirname(__file__) se fosse um script .py

# Assume que 'src' está no mesmo nível do notebook ou um nível acima
# Ajuste '..' conforme a estrutura do seu projeto
project_root = os.path.abspath(os.path.join(notebook_dir, '..')) # Volta um diretório

# Se o 'src' estiver diretamente no mesmo nível do notebook:
# project_root = notebook_dir

# Adiciona o diretório raiz do projeto ao sys.path
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [3]:
from src.interact_database_sql import get_all_days_content
from src.voyage_emb import get_voyage_embeddings

In [4]:
# 1. Recuperar apenas content_without_image e content_image_described para todos os registros
results = get_all_days_content(fields=['content_without_image', 'content_image_described'])

In [5]:
content = [result['content_image_described'] if result['content_image_described'] != "" else result['content_without_image'] for result in results]

In [6]:
len(content)

123

In [7]:
# First import the chunker you want from Chonkie
from chonkie import SemanticChunker, RecursiveChunker

# Initialize the chunker
chunker = RecursiveChunker()

In [8]:
chunks = []
for day in content:
    # Chunk some text
    _chunks = chunker(day)

    # Access chunks
    for chunk in _chunks:
        #print(f"Chunk: {chunk.text}")
        chunks.append(chunk.text)

In [9]:
old_size = 0
for chunk in chunks:
    new_size = len(chunk)
    if new_size > old_size:
        max_size = new_size
    else:
        max_size = old_size

In [10]:
max_size

1730

In [11]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from model2vec import StaticModel

model = StaticModel.from_pretrained("minishlab/potion-base-32M")



In [12]:
chunks_emb: list[dict] = [{'chunk': chunk, 'embedding': ''} for chunk in chunks]

In [13]:
chunks_emb[:3]

[{'chunk': "1st Day of Frostfall 1855 - Arrival in the Capital of Veridia\n\nPicture - A bustling street market stretches before the grand Assembly House, its pastel facade and colonnaded balconies overlooking the scene. Colorful awnings shade vendors and shoppers, carts laden with produce lining the sunlit avenue.\n\n\nToday marks my arrival in the capital city of Veridia, a place teeming with vibrant cultural heritage and an unyielding commitment to progress. Under Queen Isolde's famed patronage, the arts flourished here. Walking through the grand avenues, I was captivated by the array of sculptures and paintings, bearing testament to her legacy. My first stop was the regal Assembly House, where the Assembly of Voices, Veridia's main legislative body, convenes. The debate inside, I was told, revolved around initiatives to fulfill Veridia's ambitious goal of using 80% renewable energy by 2050. This pervasive focus on sustainability is palpable, infusing the city's very lifeblood.\n\nA

In [None]:
# # Make embeddings
for chunk in chunks_emb:
    # Get the chunk text to encode
    chunk_text = chunk['chunk']
    # Generate the embedding for this specific chunk
    # Note: model.encode usually takes a list of strings, even for a single string,
    # and returns a list of embeddings. So, we get the first (and only) embedding.
    
    # Model2Vec
    # embedding = model.encode(chunk_text)
    
    # Voyage
    embedding = get_voyage_embeddings(chunk_text)
    # Assign the generated embedding to the 'embedding' key
    chunk['embedding'] = embedding
    

# # Make sequences of token embeddings
# token_embeddings = model.encode_as_sequence(["It's dangerous to go alone!", "It's a secret to everybody."])

In [15]:
def calculate_cosine_similarity(vec1: np.ndarray, vec2: np.ndarray) -> float:
    """Calcula a similaridade de cosseno entre dois vetores."""
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    
    if norm_vec1 == 0 or norm_vec2 == 0:
        return 0.0 # Evita divisão por zero
        
    return dot_product / (norm_vec1 * norm_vec2)

def semantic_search(query: str, indexed_chunks: list[dict], model) -> list[dict]:
    """
    Realiza uma pesquisa de similaridade semântica.

    Args:
        query (str): A string de consulta.
        indexed_chunks (list[dict]): A lista de dicionários de chunks com embeddings.
        model: O modelo usado para gerar embeddings.

    Returns:
        list[dict]: Uma lista de dicionários de chunks, ordenados por similaridade
                    (maior primeiro), incluindo a pontuação de similaridade.
    """
    # 1. Gerar o embedding da consulta
    query_embedding = model.encode([query])[0]
    #print(f"\nEmbedding da Consulta ('{query}'): {query_embedding}")

    results = []
    # 2. Calcular similaridade para cada chunk
    for item in indexed_chunks:
        chunk_text = item['chunk']
        chunk_embedding = item['embedding']
        
        # Certifique-se de que o embedding do chunk também é um array numpy
        # (se o seu modelo já retorna numpy arrays, isso pode ser redundante)
        if not isinstance(chunk_embedding, np.ndarray):
             chunk_embedding = np.array(chunk_embedding)

        similarity = calculate_cosine_similarity(query_embedding, chunk_embedding)
        
        results.append({
            'chunk': chunk_text,
            'similarity': similarity,
            'embedding': chunk_embedding # Opcional, para debug
        })
    
    # 3. Ordenar os resultados pela similaridade (decrescente)
    results.sort(key=lambda x: x['similarity'], reverse=True)
    
    return results[:20]

In [16]:
# --- Inicializa o cliente Gemini API ---
from src.classe_gemini import GeminiApiClient
# Certifique-se de que a variável de ambiente 'GOOGLE_API_KEY' está definida com sua chave de API
try:
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("A variável de ambiente 'GOOGLE_API_KEY' não está definida.")
    
    gemini_client = GeminiApiClient(api_key=api_key)
except ValueError as e:
    print(f"Erro de configuração da API: {e}")
    exit() # Encerra o programa se a chave da API não estiver configurada

In [17]:
qs = [
    'What is the official language of Veridia?',
    'Which mountain range forms the northern border of Veridia?',
    'What is the currency of Veridia called?',
    "Which historical event is commemorated on Veridia's Independence Day?",
    'What is the largest city in Veridia by population?',
    'Which body of water lies to the east of Veridia?'
]

for query in qs:
    #query = "Who is the current Grand Chancellor of Veridia?"
    search_results = semantic_search(query, chunks_emb, model)

    #print(f"\n********** \nResultados da pesquisa para: '{query}'")
    
    for result in search_results[:6]:
        # --- Chamada da API ---
        model_name = 'gemini-1.5-pro' # Ou 'gemini-1.5-pro' se preferir um modelo mais potente

        chunk_description = "1st Day of Frostfall 1855 - Arrival in the Capital of Veridia\n\n!\n\nToday marks my arrival in the capital city of Veridia, a place teeming with vibrant cultural heritage and an unyielding commitment to progress. Under Queen Isolde's famed patronage, the arts flourished here. Walking through the grand avenues, I was captivated by the array of sculptures and paintings, bearing testament to her legacy. My first stop was the regal Assembly House, where the Assembly of Voices, Veridia's main legislative body, convenes. The debate inside, I was told, revolved around initiatives to fulfill Veridia's ambitious goal of using 80% renewable energy by 2050. This pervasive focus on sustainability is palpable, infusing the city's very lifeblood.\n\nAs evening crept in, I joined locals indulging in Zelphar stew, a traditional Veridian dish that warms the soul as much as it pleases the palate. Each spoonful was a blend of flavors, rich and comforting after my long journey.\n\n####"

        prompt = f"""
            
            You are answering a question from a fantasy world in a travel log journey of Doctor Voss, a woman visiting the capital of Veridia.
            Here is the context to help you answer: {chunk_description}.
            
            Describe **only** the image. Bring **only the description**. Be concise - 300 characters max.
            
            """
            
        prompt_parts = [
            {"text": f"{prompt}"}
        ]
        #print(f"  Similaridade: {result['similarity']:.4f} - ########### Chunk \n\n'{result['chunk']}'\n")
        

In [18]:
from rerankers import Reranker
ranker = Reranker(
  model_name='mixedbread-ai/mxbai-rerank-large-v1',
  model_type='cross-encoder'
)

Loading TransformerRanker model mixedbread-ai/mxbai-rerank-large-v1 (this message can be suppressed by setting verbose=0)
No device set
Using device cpu
No dtype set
Using dtype torch.float32
Loaded model mixedbread-ai/mxbai-rerank-large-v1
Using device cpu.
Using dtype torch.float32.


In [56]:
qs = [
    'What is the official language of Veridia?',
    'Which mountain range forms the northern border of Veridia?',
    'What is the currency of Veridia called?',
    "Which historical event is commemorated on Veridia's Independence Day?",
    'What is the largest city in Veridia by population?',
    'Which body of water lies to the east of Veridia?'
]
reranked = []
for query in qs:
    
    #query = "Who is the current Grand Chancellor of Veridia?"
    search_results = semantic_search(query, chunks_emb, model)
    txt_results = []
    
    for chunk in search_results:
        txt_results.append(chunk['chunk'])
        
    sorted_rows = ranker.rank(
        query=query,
        docs=txt_results[:10]
        )
    #print(f"\n********** \nResultados da pesquisa para: '{query}'")
    
    for i, doc in enumerate(sorted_rows.top_k(3)):
        reranked.append(doc.text)
    # --- Chamada da API ---
    
    model_name = 'gemini-1.5-pro' # Ou 'gemini-1.5-pro' se preferir um modelo mais potente

    chunk_description = "1st Day of Frostfall 1855 - Arrival in the Capital of Veridia\n\n!\n\nToday marks my arrival in the capital city of Veridia, a place teeming with vibrant cultural heritage and an unyielding commitment to progress. Under Queen Isolde's famed patronage, the arts flourished here. Walking through the grand avenues, I was captivated by the array of sculptures and paintings, bearing testament to her legacy. My first stop was the regal Assembly House, where the Assembly of Voices, Veridia's main legislative body, convenes. The debate inside, I was told, revolved around initiatives to fulfill Veridia's ambitious goal of using 80% renewable energy by 2050. This pervasive focus on sustainability is palpable, infusing the city's very lifeblood.\n\nAs evening crept in, I joined locals indulging in Zelphar stew, a traditional Veridian dish that warms the soul as much as it pleases the palate. Each spoonful was a blend of flavors, rich and comforting after my long journey.\n\n####"

    prompt = f"""
        
        You are answering a question from a fantasy world in a travel log journey of Doctor Voss, a woman visiting the capital of Veridia.
        Question: {query}
        Here is the context to help you answer: {reranked}.
        
        Bring the answer **only**. Example: 'The Veridian's Skys were blue most days.'
        If you don't know the answer, respond: 'NTD' - meaning nothing to disclosure.
        """
        
    prompt_parts = [
        {"text": f"{prompt}"}
    ]
    # Chama o método da classe GeminiApiClient
    response_data = gemini_client.generate_multimodal_content(model_name, prompt_parts)

    # Extrai o texto da resposta usando o método da classe
    generated_text = gemini_client.extract_text_from_response(response_data)

    if generated_text:
        print(f"Query: {query}")
        print(f"\nGemini: {generated_text}")
        print("---------------------------------")
    else:
        print("\nNão foi possível extrair texto da resposta do Gemini.")
    

Query: What is the official language of Veridia?

Gemini: Veridian

---------------------------------
Query: Which mountain range forms the northern border of Veridia?

Gemini: NTD

---------------------------------
Query: What is the currency of Veridia called?

Gemini: Veridian Crown

---------------------------------
Query: Which historical event is commemorated on Veridia's Independence Day?

Gemini: The establishment of Veridia's constitution in 1783.

---------------------------------
Query: What is the largest city in Veridia by population?

Gemini: Dolverin

---------------------------------
Query: Which body of water lies to the east of Veridia?

Gemini: NTD

---------------------------------
