In [7]:
import os
import warnings

# ╔════════════════════════════════════════════════╗
# ║                 CONFIG SECTION                 ║
# ╚════════════════════════════════════════════════╝
# Name and serving host of the Pinecone index that stores the chunk embeddings.
PINECONE_INDEX_NAME   = "2-leg-embedding"
PINECONE_HOST         = "https://2-leg-embedding-1ov95ew.svc.aped-4627-b74a.pinecone.io"


def _read_secret(name: str) -> str:
    """Return the secret stored in environment variable ``name`` ("" if unset).

    A warning is emitted when the variable is missing so that the problem is
    visible here rather than as an opaque authentication error later on.
    """
    value = os.environ.get(name, "")
    if not value:
        warnings.warn(
            f"Environment variable {name} is not set; calls that need it will fail.",
            RuntimeWarning,
        )
    return value


# Secrets are never stored in the notebook: export PINECONE_API_KEY and
# GEMINI_API_KEY before starting the kernel.
# SECURITY: earlier revisions of this notebook embedded real keys in this cell;
# those keys must be considered leaked and should be revoked/rotated.
PINECONE_API_KEY      = _read_secret("PINECONE_API_KEY")
GEMINI_API_KEY        = _read_secret("GEMINI_API_KEY")

# Default number of chunks fetched from the index per query.
K_RETRIEVE            = 5
# ╔════════════════════════════════════════════════╗
# ║                LIBRARIES                       ║
# ╚════════════════════════════════════════════════╝
# !pip install sentence-transformers pinecone-client google-generativeai tqdm


from tqdm import tqdm
from typing import List

import pinecone
import google.generativeai as genai
from sentence_transformers import SentenceTransformer

# ════════════════ INIT MODELS ════════════════
# Sentence-embedding model shared by the corpus embedding step and the query
# embedding step (both call `model.encode`).
model = SentenceTransformer("dariolopez/bge-m3-es-legal-tmp-6")  # 1024-D
# Register the Gemini API key on the google.generativeai module.
genai.configure(api_key=GEMINI_API_KEY)

# ════════════════ LOAD CORPUS ════════════════
def split_text_into_chunks(text, chunk_size=900, overlap=100):
    """Split ``text`` into overlapping, whitespace-stripped character windows.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last ``overlap`` characters of the previous one.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per window (must be > 0).
        overlap: Number of characters shared by consecutive windows
            (must satisfy ``0 <= overlap < chunk_size``).

    Returns:
        The list of chunks in document order; ``[]`` for empty ``text``.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` are out of range. Without
            this check ``overlap >= chunk_size`` would never advance the
            window and loop forever.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks: List[str] = []
    for start in range(0, len(text), step):
        end = start + chunk_size
        chunks.append(text[start:end].strip())
        # Once a window reaches the end of the text, any further window would
        # only repeat characters already covered by this one; stop here.
        if end >= len(text):
            break
    return chunks

# Read the full text
# The path is relative to the kernel's working directory.
with open("constitucion_nacional.txt", "r", encoding="utf-8") as file:
    text = file.read()

# Split into chunks
# Uses the function's defaults (chunk_size=900, overlap=100).
chunks = split_text_into_chunks(text)

# Write the chunks to a new file
# Chunks are separated by a blank line; the file is overwritten on every run.
with open("constitucion_static_chunks.txt", "w", encoding="utf-8") as out_file:
    out_file.write("\n\n".join(chunks))

print(f"Generated {len(chunks)} chunks.")

# ════════════════ EMBEDDING FUNCTION (bge-m3 model, E5-style "passage:" prefix) ════════════════
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed ``texts`` as passages with the module-level ``model``.

    Each text is prefixed with ``"passage: "`` before encoding; queries are
    expected to use the matching ``"query: "`` prefix.
    NOTE(review): this prefix scheme is the E5 convention, while the loaded
    model name indicates a bge-m3 fine-tune — confirm the prefixes are wanted.

    Args:
        texts: The passages to embed.

    Returns:
        One embedding per input text, in input order, as plain Python lists
        of floats (matching the annotation and directly JSON-serialisable).
    """
    if not texts:
        # Nothing to encode; avoid calling the model with an empty batch.
        return []
    formatted = [f"passage: {text}" for text in texts]
    embeddings = model.encode(formatted, show_progress_bar=True)
    # encode() hands back an array-like; convert it to nested lists.
    return embeddings.tolist()

print("🔧  Generating embeddings …")
# Embeddings for `chunks`; upsert_vectors later pairs EMBEDS[j] with chunks[j].
EMBEDS = embed_texts(chunks)


# ════════════════ PINECONE SETUP ════════════════
# Pinecone client, and a handle to the index identified by the configured
# name and host. `index` is used for both upserts and queries below.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(name=PINECONE_INDEX_NAME, host=PINECONE_HOST)

def upsert_vectors(texts: List[str],
                   vecs: List[List[float]],
                   batch: int = 100):
    """Upload ``texts`` and their embeddings to the module-level ``index``.

    Entry ``j`` is stored under the id ``"id-<j>"`` with ``vecs[j]`` as its
    values and ``{"text": texts[j]}`` as its metadata.

    Args:
        texts: The chunk texts.
        vecs: One embedding per text, in the same order. Rows may be plain
            lists or array-like objects exposing ``tolist()``.
        batch: Number of vectors sent per ``index.upsert`` call (must be > 0).

    Raises:
        ValueError: If ``texts`` and ``vecs`` differ in length, or if
            ``batch`` is not positive (a negative batch would otherwise
            silently upload nothing).
    """
    if len(texts) != len(vecs):
        raise ValueError(
            f"texts and vecs must have the same length "
            f"(got {len(texts)} and {len(vecs)})"
        )
    if batch <= 0:
        raise ValueError("batch must be a positive integer")

    def _as_list(vector):
        # Convert NumPy-style rows to plain lists; leave lists untouched.
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    for i in tqdm(range(0, len(texts), batch), desc="⬆️  Upserting"):
        batch_vecs = [
            {
                "id": f"id-{j}",
                "values": _as_list(vecs[j]),
                "metadata": {"text": texts[j]}
            }
            for j in range(i, min(i + batch, len(texts)))
        ]
        index.upsert(vectors=batch_vecs)

# print("📤  Uploading to Pinecone …")
# Upload every chunk with its embedding. Ids are positional ("id-<j>"), so
# presumably a re-run overwrites the same entries rather than adding new
# ones — confirm against Pinecone's upsert semantics.
upsert_vectors(chunks, EMBEDS)
    

Generated 100 chunks.
🔧  Generating embeddings …


Batches: 100%|██████████| 4/4 [00:03<00:00,  1.09it/s]
⬆️  Upserting: 100%|██████████| 1/1 [00:02<00:00,  2.49s/it]


In [9]:

# ════════════════ RETRIEVE FUNCTION  ════════════════
def retrieve(query: str, k: int = K_RETRIEVE) -> List[str]:
    """Fetch the stored chunk texts that the index returns for ``query``.

    The query is prefixed with ``"query: "``, embedded with the module-level
    ``model`` and sent to the module-level ``index``.

    Args:
        query: The natural-language question.
        k: Number of matches to request (``top_k``).

    Returns:
        The ``"text"`` metadata of each match, in the order returned.
    """
    prefixed_query = f"query: {query}"
    embedding = model.encode(prefixed_query).tolist()
    response = index.query(vector=embedding, top_k=k, include_metadata=True)

    texts = []
    for match in response.matches:
        texts.append(match.metadata["text"])
    return texts


# ════════════════ GEMINI FLASH RAG ════════════════
# Generative model handle used by rag_answer to produce the final answer.
gemini = genai.GenerativeModel(model_name="gemini-2.0-flash") 

def rag_answer(question: str) -> str:
    """Answer ``question`` with Gemini, grounded on retrieved chunks.

    The chunks returned by ``retrieve(question)`` are joined with blank lines
    and placed in a Spanish prompt ahead of the question.

    Args:
        question: The user's question.

    Returns:
        The ``text`` of the Gemini response.
    """
    passages = retrieve(question)
    context = "\n\n".join(passages)
    prompt = f"Contexto:\n{context}\n\nPregunta: {question}\nRespuesta:"
    response = gemini.generate_content(prompt)
    return response.text

In [4]:
# ════════════════ TEST IT ════════════════
# Smoke test: run one question through the full retrieve + generate pipeline
# and print both the question and the generated answer.
q = "¿Cuáles son las atribuciones del presidente de la Argentina?"
print("\n🔎 Pregunta:", q)
print("\n🧠 Respuesta (Gemini):\n", rag_answer(q))


🔎 Pregunta: ¿Cuáles son las atribuciones del presidente de la Argentina?

🧠 Respuesta (Gemini):
 Según el texto proporcionado, el Presidente de la Nación Argentina tiene las siguientes atribuciones (enumeradas en el Artículo 99):

1.  Es el jefe supremo de la Nación, jefe del gobierno y responsable político de la administración general del país.
2.  Expide las instrucciones y reglamentos que sean necesarios para la ejecución de las leyes de la Nación, cuidando de no alterar su espíritu con excepciones reglamentarias.
3.  Participa de la formación de las leyes con arreglo a la Constitución, las promulga y las hace publicar.
4.  Nombra los magistrados de la Corte Suprema con acuerdo del Senado por dos tercios de sus miembros presentes, en sesión pública, convocada al efecto. Nombra los demás jueces de los tribunales federales inferiores conforme a una propuesta vinculante en terna del Consejo de la Magistratura, con acuerdo del Senado, en sesión pública, en la que se apreciará la idonei

In [5]:
import json
from ragas.evaluation import evaluate
from datasets import Dataset
from tqdm import tqdm
import numpy as np



def load_reference_answers(path):
    """Load a JSON list of question/answer items as a question -> answer dict.

    Args:
        path: Path of a UTF-8 JSON file holding a list of objects, each with
            a ``"question"`` and an ``"answer"`` key.

    Returns:
        A dict mapping each question to its answer. If a question appears
        more than once, the last occurrence wins.
    """
    with open(path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    answers_by_question = {}
    for entry in entries:
        answers_by_question[entry["question"]] = entry["answer"]
    return answers_by_question



# ════════════════════════════════════════════════
# Construcción del dataset para RAGAS
# ════════════════════════════════════════════════
def prepare_ragas_dataset(reference_answers: dict, tag: str = "") -> Dataset:
    """Build the evaluation dataset by running the RAG pipeline per question.

    Args:
        reference_answers: Mapping of question -> reference answer.
        tag: Label appended to the progress-bar description.

    Returns:
        A ``Dataset`` with one row per question and the columns
        ``question``, ``contexts``, ``answer`` and ``reference``.
    """
    def build_row(question, reference):
        # NOTE(review): rag_answer() performs its own retrieve() call, so each
        # question is retrieved twice; the contexts stored here come from the
        # first call, the answer from the second.
        contexts = retrieve(question)
        generated = rag_answer(question)
        return {
            "question": question,
            "contexts": contexts,
            "answer": generated,
            "reference": reference
        }

    progress = tqdm(reference_answers.items(), desc=f"🧪 Procesando {tag}")
    rows = [build_row(question, reference) for question, reference in progress]
    return Dataset.from_list(rows)


# Reference question -> answer mappings, read from the parent directory.
reference_easy = load_reference_answers("../easy_questions.json")
reference_hard = load_reference_answers("../hard_questions.json")

# Prepare datasets
# Each call runs retrieval and generation for every question in the mapping.
dataset_easy = prepare_ragas_dataset(reference_easy, tag="fáciles")
dataset_hard = prepare_ragas_dataset(reference_hard, tag="difíciles")

# Save to JSON
# Persisted so the evaluation cell can reload them with load_dataset()
# without regenerating the answers.
dataset_easy.to_json("ragas_easy.json", orient="records", lines=False)
dataset_hard.to_json("ragas_hard.json", orient="records", lines=False)




🧪 Procesando fáciles: 100%|██████████| 50/50 [01:04<00:00,  1.30s/it]
🧪 Procesando difíciles: 100%|██████████| 65/65 [02:18<00:00,  2.13s/it]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 79.20ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 588.18ba/s]


422633

In [10]:
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from sentence_transformers import SentenceTransformer

class CustomE5Embedding(LangchainEmbeddingsWrapper):
    """SentenceTransformer-backed embeddings using ``query:``/``passage:`` prefixes.

    ``embed_query`` follows the LangChain contract (one string in, one vector
    out) and ``embed_documents`` takes a list of strings. Both return plain
    Python lists of floats.

    NOTE(review): the parent ``__init__`` is not called, so any state the
    wrapper base class normally sets up is absent — confirm this is safe for
    the ragas version in use before passing an instance to ``evaluate``.
    NOTE(review): the class name says E5, but the default model name
    indicates a bge-m3 fine-tune — confirm the prefixes are wanted.
    """

    def __init__(self, model_name="dariolopez/bge-m3-es-legal-tmp-6"):
        """Load the SentenceTransformer identified by ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def embed_query(self, texts):
        """Embed a query.

        Args:
            texts: A single query string. For backward compatibility a list
                of query strings is also accepted.

        Returns:
            A list of floats for a single string, or a list of such lists
            when a list of strings was given.
        """
        if isinstance(texts, str):
            # A bare string must be embedded as ONE query; iterating over it
            # would embed every character separately.
            vector = self.model.encode(f"query: {texts}", convert_to_tensor=False)
            return vector.tolist()
        prefixed = [f"query: {text}" for text in texts]
        return self.model.encode(prefixed, convert_to_tensor=False).tolist()

    def embed_documents(self, texts):
        """Embed a list of documents.

        Args:
            texts: The document strings.

        Returns:
            One list of floats per document, in input order.
        """
        prefixed = [f"passage: {text}" for text in texts]
        return self.model.encode(prefixed, convert_to_tensor=False).tolist()


In [11]:
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    LLMContextPrecisionWithReference,
    NonLLMContextPrecisionWithReference,


    ResponseRelevancy,
    LLMContextRecall,
    Faithfulness
)
from ragas.run_config import RunConfig
from ragas.embeddings.base import embedding_factory
from datasets import load_dataset
import os

# The OpenAI key is no longer written into the notebook: export
# OPENAI_API_KEY before starting the kernel (presumably it is consumed by the
# default LLM behind the ragas metrics below — confirm).
# SECURITY: earlier revisions of this cell embedded a real key; that key must
# be considered leaked and should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    import warnings

    warnings.warn(
        "OPENAI_API_KEY is not set; the evaluation below will fail to authenticate.",
        RuntimeWarning,
    )

# Reload the datasets that the previous cell saved as JSON.
ragas_easy:Dataset = load_dataset("json", data_files="ragas_easy.json", split="train")
ragas_hard:Dataset = load_dataset("json", data_files="ragas_hard.json", split="train")

# Shared evaluation settings.
run_config = RunConfig()
# NOTE(review): this instance is created but not passed to evaluate() below.
custom_embeddings = CustomE5Embedding() 

metrics = [
    LLMContextPrecisionWithReference(),
    LLMContextRecall(),
    Faithfulness()
]


def _evaluate_split(banner, label, dataset):
    """Print ``banner``, evaluate ``dataset`` with the shared settings, print and return the result."""
    print(banner)
    outcome = evaluate(
        dataset,
        metrics=metrics,
        run_config=run_config,
        batch_size=1
    )
    print(label, outcome)
    return outcome


result_easy = _evaluate_split("\n📊 Evaluando preguntas FÁCILES:", "✅ Resultados EASY:", ragas_easy)
result_hard = _evaluate_split("\n📊 Evaluando preguntas DIFÍCILES:", "✅ Resultados HARD:", ragas_hard)



Generating train split: 50 examples [00:00, 1441.34 examples/s]
Generating train split: 65 examples [00:00, 12641.06 examples/s]



📊 Evaluando preguntas FÁCILES:


Evaluating: 100%|██████████| 150/150 [15:17<00:00,  6.12s/it]


✅ Resultados EASY: {'llm_context_precision_with_reference': 0.6386, 'context_recall': 0.7000, 'faithfulness': 0.8544}

📊 Evaluando preguntas DIFÍCILES:


Evaluating: 100%|██████████| 195/195 [32:26<00:00,  9.98s/it]


✅ Resultados HARD: {'llm_context_precision_with_reference': 0.7906, 'context_recall': 0.8564, 'faithfulness': 0.8178}


In [12]:
import csv
import numpy as np
from pathlib import Path

# CSV path (shared results file in the parent directory; rows are appended).
csv_path = Path("../results.csv")
# Write the header when the file is missing OR exists but is still empty.
write_header = not csv_path.exists() or csv_path.stat().st_size == 0

# Label written in the "experiment" column for both rows of this run.
EXPERIMENT_NAME = "experiment_2"

# Define metrics you expect
metric_names = [
    "llm_context_precision_with_reference",
    "context_recall",
    "faithfulness"
]

# Compute means manually from EvaluationResult.
# nanmean is used so that a NaN score (presumably what ragas records for a
# sample whose evaluation failed — confirm) does not turn the whole mean
# into NaN.
easy_scores = [float(np.nanmean(result_easy[m])) for m in metric_names]
hard_scores = [float(np.nanmean(result_hard[m])) for m in metric_names]

# Write to CSV
# NOTE: the file is opened in append mode, so re-running this cell adds
# another pair of rows for the same experiment.
with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    if write_header:
        writer.writerow(["experiment", "dataset"] + metric_names)

    writer.writerow([EXPERIMENT_NAME, "easy"] + easy_scores)
    writer.writerow([EXPERIMENT_NAME, "hard"] + hard_scores)
