In [None]:
import os
from dotenv import load_dotenv
from pathlib import Path

# Load .env from the parent directory.
# `__file__` is not defined inside a Jupyter kernel (it raises NameError), so
# fall back to the current working directory, which is the notebook's folder
# when the notebook is launched in place. The other cells already rely on that
# by using "../" relative paths.
try:
    _base_dir = Path(__file__).resolve().parent
except NameError:
    _base_dir = Path.cwd()
env_path = _base_dir.parent / ".env"
load_dotenv(dotenv_path=env_path)

# Access variables (each one is None when it is not defined anywhere)
OPENAI_API_KEY        = os.getenv("OPENAI_API_KEY")
PINECONE_INDEX_NAME   = os.getenv("PINECONE_INDEX_NAME")
PINECONE_HOST         = os.getenv("PINECONE_HOST")
PINECONE_API_KEY      = os.getenv("PINECONE_API_KEY")
GEMINI_API_KEY        = os.getenv("GEMINI_API_KEY")
K_RETRIEVE            = int(os.getenv("K_RETRIEVE", "5"))  # default to 5

# Re-export the key only when it exists: assigning None to os.environ raises
# TypeError, which used to abort this cell whenever the key was missing.
if OPENAI_API_KEY is not None:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
import json
from tqdm import tqdm
from typing import List

import pinecone
import google.generativeai as genai
from sentence_transformers import SentenceTransformer

# Corpus file, resolved relative to the notebook's working directory. It is
# read by load_texts_from_json() further down in this cell.
JSON_PATH = "incisos-chunks.json"

# ════════════════ INIT MODELS ════════════════
# Sentence-embedding model shared by indexing (embed_texts) and querying
# (retrieve). The 1024-D size is checked by an assert after the corpus is embedded.
model = SentenceTransformer("dariolopez/bge-m3-es-legal-tmp-6")  # 1024-D
# Configure the Gemini client with the key read from the environment in the first cell.
genai.configure(api_key=GEMINI_API_KEY)

# ════════════════ LOAD CORPUS ════════════════
def load_texts_from_json(json_path: str) -> List[str]:
    """Read the corpus file and return the stripped ``"text"`` field of each entry.

    Args:
        json_path: Path to a UTF-8 JSON file whose top level is a list of objects.

    Returns:
        One whitespace-stripped string per entry that has a ``"text"`` key, in
        file order. Entries without that key are skipped.
    """
    with open(json_path, "r", encoding="utf-8") as handle:
        entries = json.load(handle)

    texts: List[str] = []
    for item in entries:
        if "text" not in item:
            continue
        texts.append(item["text"].strip())
    return texts

# Load every text chunk from JSON_PATH into memory. The position of a text in
# this list later becomes its Pinecone id ("id-<position>") in upsert_vectors().
ARTICULOS = load_texts_from_json(JSON_PATH)
print(f"📚  Loaded {len(ARTICULOS):,} artículos")


# ════════════════ EMBEDDING FUNCTION ("passage:" prefix) ════════════════
def embed_texts(texts: List[str]) -> List[List[float]]:
    """Embed corpus passages with the shared sentence-embedding ``model``.

    Each text is prefixed with ``"passage: "`` before encoding. ``retrieve()``
    uses the matching ``"query: "`` prefix on the query side.
    NOTE(review): this prefix pair is the E5 convention, while the loaded
    checkpoint is a BGE-M3 fine-tune. Confirm the model was trained with these
    prefixes; changing them would require re-indexing.

    Args:
        texts: Passages to embed.

    Returns:
        One embedding per input text as plain Python lists of floats, so the
        result matches the annotation and is JSON-serializable. An empty input
        returns an empty list without calling the model.
    """
    if len(texts) == 0:
        return []
    formatted = [f"passage: {text}" for text in texts]
    vectors = model.encode(formatted, show_progress_bar=True)
    # encode() returns an array-like by default; normalize it to nested lists.
    if hasattr(vectors, "tolist"):
        return vectors.tolist()
    return [list(vec) for vec in vectors]

print("🔧  Generating embeddings …")
# Embed the whole corpus in one call; EMBEDS[i] corresponds to ARTICULOS[i].
EMBEDS = embed_texts(ARTICULOS)
# Sanity check on the first vector only (an empty corpus fails here with
# IndexError). 1024 is the dimension noted next to the model.
assert len(EMBEDS[0]) == 1024, "❌ Embedding dim mismatch!"


# ════════════════ PINECONE SETUP ════════════════
# Client and index handle built from the credentials read in the first cell.
# The index is not created here. NOTE(review): it presumably must already exist
# with a dimension matching the 1024-D embeddings; confirm.
pc = pinecone.Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(name=PINECONE_INDEX_NAME, host=PINECONE_HOST)

def upsert_vectors(texts: List[str],
                   vecs: List[List[float]],
                   batch: int = 100):
    """Upload texts and their embeddings to the Pinecone ``index`` in batches.

    Each record gets the positional id ``"id-<j>"`` and stores the source text
    under the ``"text"`` metadata key, which ``retrieve()`` reads back.

    Args:
        texts: Source passages.
        vecs: One embedding per passage, aligned by position with ``texts``.
            Rows may be plain lists or array-like objects exposing ``tolist()``.
        batch: Maximum number of records sent per upsert call.

    Raises:
        ValueError: If ``texts`` and ``vecs`` differ in length, or ``batch`` is
            smaller than 1. Both are checked before anything is uploaded, so a
            bad call can no longer leave the index partially written.
    """
    if len(texts) != len(vecs):
        raise ValueError(
            f"texts and vecs must have the same length "
            f"(got {len(texts)} and {len(vecs)})"
        )
    if batch < 1:
        raise ValueError(f"batch must be >= 1 (got {batch})")

    total = len(texts)
    for start in tqdm(range(0, total, batch), desc="⬆️  Upserting"):
        stop = min(start + batch, total)
        batch_vecs = [
            {
                "id": f"id-{j}",
                # Send plain Python floats regardless of the embedding container.
                "values": vecs[j].tolist() if hasattr(vecs[j], "tolist") else list(vecs[j]),
                "metadata": {"text": texts[j]},
            }
            for j in range(start, stop)
        ]
        index.upsert(vectors=batch_vecs)

print("📤  Uploading to Pinecone …")
# Push the whole corpus. Ids are positional ("id-0", "id-1", …), so re-running
# this cell targets the same ids again.
# NOTE(review): a corpus that shrinks would leave stale higher ids behind in the index; confirm.
upsert_vectors(ARTICULOS, EMBEDS)
    

  from .autonotebook import tqdm as notebook_tqdm


📚  Loaded 199 artículos
🔧  Generating embeddings …


Batches: 100%|██████████| 7/7 [00:08<00:00,  1.28s/it]


📤  Uploading to Pinecone …


⬆️  Upserting: 100%|██████████| 2/2 [00:03<00:00,  1.66s/it]


In [None]:

# ════════════════ RETRIEVE FUNCTION  ════════════════
def retrieve(query: str, k: int = K_RETRIEVE) -> List[str]:
    """Return the stored texts of the ``k`` nearest passages for ``query``.

    The query is encoded with the ``"query: "`` prefix (the counterpart of the
    ``"passage: "`` prefix used at indexing time) and sent to the Pinecone
    ``index``. Each match's ``"text"`` metadata is returned, in the order the
    index reports the matches.

    Args:
        query: Natural-language question.
        k: Number of matches to request; defaults to ``K_RETRIEVE``.

    Returns:
        The ``"text"`` metadata of each match.
    """
    prefixed_query = f"query: {query}"
    embedding = model.encode(prefixed_query).tolist()
    response = index.query(vector=embedding, top_k=k, include_metadata=True)

    passages = []
    for match in response.matches:
        passages.append(match.metadata["text"])
    return passages


# ════════════════ GEMINI RAG ════════════════
# Generation model used by rag_answer(). The model configured here is
# "gemini-2.0-flash".
gemini = genai.GenerativeModel(model_name="gemini-2.0-flash") 

# Prompt sent to Gemini by rag_answer(). It has two str.format placeholders:
# {context} (the retrieved passages) and {question}. The instructions ask for a
# one-sentence answer grounded only in the context, followed by the supporting
# article number(s), and for a fixed fallback sentence when the context does
# not contain the answer.
# NOTE(review): rules 3 and 6 both define the fallback answer but with different
# capitalization and punctuation; confirm which exact string downstream checks expect.
PROMPT_TEMPLATE = """
Eres un/a **abogado/a constitucionalista argentino/a**.  
Tu tarea es **contestar en UNA sola frase** y **exclusivamente** con la
información que aparece dentro de las etiquetas <context></context>.

Reglas de oro (cúmplelas al pie de la letra):

1. Si la respuesta está en el contexto, da la solución **exactamente** como
   figura allí, sin agregar ni quitar nada relevante.
2. Al final de la frase, escribe entre paréntesis el/los número(s) de
   artículo(s) que sustenten la respuesta –por ejemplo: **(art. 14)**.
   - Si el fragmento de contexto trae algo como “Artículo 14 bis”, ponlo igual: **(art. 14 bis)**.
3. Si la información **no** aparece en el contexto, contesta **exactamente**:
   > No tengo información sobre esto.
4. No inventes datos, no cites fuentes externas, no expliques tu razonamiento.
5. Responde en español neutro y evita tecnicismos innecesarios.
6. Si no sabes la respuesta, responde 'no tengo información sobre esto'.

<context>
{context}
</context>

Pregunta: {question}
Respuesta:
""".strip()

def rag_answer(question: str) -> str:
    """Answer ``question`` with Gemini, grounded on passages from ``retrieve()``.

    The retrieved passages are joined with blank lines, substituted into
    ``PROMPT_TEMPLATE`` together with the question, and sent to the ``gemini``
    model.

    Args:
        question: Natural-language question.

    Returns:
        The model's text response with surrounding whitespace removed.
    """
    passages = retrieve(question)
    prompt = PROMPT_TEMPLATE.format(
        context="\n\n".join(passages),
        question=question,
    )
    response = gemini.generate_content(prompt)
    return response.text.strip()


In [5]:
# ════════════════ TEST IT ════════════════
# Manual smoke test: one end-to-end question through retrieval + generation.
# This makes live calls to Pinecone and Gemini.
q = "Estoy preparando un negocio de pancho con un puesto en el rio, que me importa de la constitución?"
print("\n🔎 Pregunta:", q)
print("\n🧠 Respuesta (Gemini):\n", rag_answer(q))


🔎 Pregunta: Estoy preparando un negocio de pancho con un puesto en el rio, que me importa de la constitución?

🧠 Respuesta (Gemini):
 Los extranjeros pueden navegar los ríos y costas (art. 20).


In [7]:
import json
import time
from ragas.evaluation import evaluate
from datasets import Dataset
from tqdm import tqdm
import numpy as np



def load_reference_answers(path):
    """Load a question → reference-answer mapping from a JSON file.

    Args:
        path: Path to a UTF-8 JSON file containing a list of objects, each with
            ``"question"`` and ``"answer"`` keys.

    Returns:
        A dict keyed by question text. If a question appears more than once,
        the last answer in the file is kept.
    """
    with open(path, "r", encoding="utf-8") as handle:
        items = json.load(handle)

    answers_by_question = {}
    for item in items:
        answers_by_question[item["question"]] = item["answer"]
    return answers_by_question



# ════════════════════════════════════════════════
# Construcción del dataset para RAGAS
# ════════════════════════════════════════════════
def prepare_ragas_dataset(reference_answers: dict, tag: str = "", delay: float = 4.0) -> Dataset:
    """Run the RAG pipeline over every reference question and collect RAGAS records.

    For each question this stores the retrieved passages, the generated answer
    and the reference answer.

    NOTE(review): ``retrieve()`` is called here and again inside
    ``rag_answer()``, so every question costs two embedding + index queries,
    and the recorded ``contexts`` come from a separate call than the one the
    answer was generated from.

    Args:
        reference_answers: Mapping of question text to reference answer.
        tag: Label appended to the progress-bar description.
        delay: Seconds to pause after each question (default 4, the value that
            was previously hard-coded). Values <= 0 disable the pause.
            NOTE(review): presumably a rate-limit guard for the Gemini API; confirm.

    Returns:
        A ``datasets.Dataset`` with columns ``question``, ``contexts``,
        ``answer`` and ``reference``.
    """
    records = []
    for question, reference in tqdm(reference_answers.items(), desc=f"🧪 Procesando {tag}"):
        retrieved = retrieve(question)
        answer = rag_answer(question)
        if delay > 0:
            time.sleep(delay)
        records.append({
            "question": question,
            "contexts": retrieved,
            "answer": answer,
            "reference": reference
        })
    return Dataset.from_list(records)


# Reference Q&A sets live one directory above the notebook's working directory.
reference_easy = load_reference_answers("../easy_questions.json")
reference_hard = load_reference_answers("../hard_questions.json")

# Prepare datasets
# Each call queries Pinecone and Gemini once per question and pauses between
# questions, so this step takes several minutes.
dataset_easy = prepare_ragas_dataset(reference_easy, tag="fáciles")
dataset_hard = prepare_ragas_dataset(reference_hard, tag="difíciles")

# Save to JSON
# Persisted so the evaluation cell can reload the records without re-running
# retrieval and generation.
dataset_easy.to_json("ragas_easy.json", orient="records", lines=False)
dataset_hard.to_json("ragas_hard.json", orient="records", lines=False)




🧪 Procesando fáciles: 100%|██████████| 50/50 [04:34<00:00,  5.48s/it]
🧪 Procesando difíciles: 100%|██████████| 65/65 [05:50<00:00,  5.39s/it]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 95.41ba/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 624.15ba/s]


410517

In [None]:
from ragas.embeddings.base import LangchainEmbeddingsWrapper
from sentence_transformers import SentenceTransformer

class CustomE5Embedding(LangchainEmbeddingsWrapper):
    """Embedding adapter around a SentenceTransformer with query/passage prefixes.

    Queries are prefixed with ``"query: "`` and documents with ``"passage: "``
    before encoding, mirroring ``retrieve()`` and ``embed_texts()``.

    NOTE(review): ``__init__`` does not call ``super().__init__()``, so whatever
    state the wrapper base class normally sets up is absent. Confirm against the
    installed ragas version before passing an instance to ``evaluate()``.
    """

    def __init__(self, model_name="dariolopez/bge-m3-es-legal-tmp-6"):
        """Load the SentenceTransformer checkpoint named ``model_name``."""
        self.model = SentenceTransformer(model_name)

    def _encode_with_prefix(self, prefix, texts):
        """Encode one string (single vector) or a sequence of strings (one vector each)."""
        if isinstance(texts, str):
            # A bare string must be embedded as a whole; iterating over it would
            # embed every character separately.
            return self.model.encode(f"{prefix}{texts}", convert_to_tensor=False)
        prefixed = [f"{prefix}{text}" for text in texts]
        return self.model.encode(prefixed, convert_to_tensor=False)

    def embed_query(self, texts):
        """Embed a query.

        Args:
            texts: A single query string (the usual ``embed_query`` call shape)
                or a sequence of query strings.

        Returns:
            The ``encode()`` result: a single vector for a string input, one
            vector per item for a sequence input.
        """
        return self._encode_with_prefix("query: ", texts)

    def embed_documents(self, texts):
        """Embed documents.

        Args:
            texts: A sequence of document strings. A bare string is treated as a
                one-document list instead of being split into characters.

        Returns:
            The ``encode()`` result with one vector per document.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self._encode_with_prefix("passage: ", texts)


In [7]:
from ragas.metrics import (
    LLMContextPrecisionWithoutReference,
    LLMContextPrecisionWithReference,
    NonLLMContextPrecisionWithReference,


    ResponseRelevancy,
    LLMContextRecall,
    Faithfulness
)
from ragas.run_config import RunConfig
from ragas.embeddings.base import embedding_factory
from datasets import load_dataset
import os
import json
import time
from ragas.evaluation import evaluate
from datasets import Dataset
from tqdm import tqdm
import numpy as np


# SECURITY: never hard-code API keys in a notebook. The key is expected in the
# environment (the first cell loads it from ../.env). Any key that was ever
# committed in this cell must be treated as compromised and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Define it in the environment or in the "
        ".env file before running the RAGAS evaluation."
    )

# Load datasets
# These are the records written by the dataset-preparation cell.
ragas_easy:Dataset = load_dataset("json", data_files="ragas_easy.json", split="train")
ragas_hard:Dataset = load_dataset("json", data_files="ragas_hard.json", split="train")

# Run config
run_config = RunConfig()
# NOTE(review): custom_embeddings is created but never passed to evaluate(), so
# it only costs a second model load. Wire it in or drop it.
custom_embeddings = CustomE5Embedding()

# The same metric set is used for both question sets so results are comparable.
metrics = [
    LLMContextPrecisionWithReference(),
    LLMContextRecall(),
    Faithfulness()
]

print("\n📊 Evaluando preguntas FÁCILES:")
result_easy = evaluate(
    ragas_easy,
    metrics=metrics,
    run_config=run_config,
    batch_size=1
)
print("✅ Resultados EASY:", result_easy)

print("\n📊 Evaluando preguntas DIFÍCILES:")
result_hard = evaluate(
    ragas_hard,
    metrics=metrics,
    run_config=run_config,
    batch_size=1
)
print("✅ Resultados HARD:", result_hard)



Generating train split: 50 examples [00:00, 2554.95 examples/s]
Generating train split: 65 examples [00:00, 13993.93 examples/s]



📊 Evaluando preguntas FÁCILES:


Evaluating: 100%|██████████| 150/150 [21:26<00:00,  8.58s/it]


✅ Resultados EASY: {'llm_context_precision_with_reference': 0.6519, 'context_recall': 0.7600, 'faithfulness': 0.8210}

📊 Evaluando preguntas DIFÍCILES:


Evaluating: 100%|██████████| 195/195 [31:44<00:00,  9.77s/it]


✅ Resultados HARD: {'llm_context_precision_with_reference': 0.7927, 'context_recall': 0.8759, 'faithfulness': 0.8028}


In [None]:
import csv
import numpy as np
from pathlib import Path

# Append this experiment's aggregate scores to the shared results file.
csv_path = Path("../results.csv")
# Write the header when the file is missing or exists but is still empty.
write_header = not csv_path.exists() or csv_path.stat().st_size == 0

metric_names = [
    "llm_context_precision_with_reference",
    "context_recall",
    "faithfulness"
]
# nanmean ignores NaN per-sample scores, so one bad sample no longer turns the
# whole aggregate into NaN.
# NOTE(review): this assumes RAGAS reports a failed sample as NaN; confirm.
# float() keeps the CSV cells as plain numbers regardless of the numpy version.
easy_scores = [float(np.nanmean(result_easy[m])) for m in metric_names]
hard_scores = [float(np.nanmean(result_hard[m])) for m in metric_names]

# NOTE: the file is opened in append mode, so re-running this cell adds
# duplicate "experiment_6" rows.
with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)

    if write_header:
        writer.writerow(["experiment", "dataset"] + metric_names)

    writer.writerow(["experiment_6", "easy"] + easy_scores)
    writer.writerow(["experiment_6", "hard"] + hard_scores)
