In [None]:
import pandas as pd
import sqlite3

from rich.pretty import pprint

from cparla.retriever import Retriever
from cparla.retriever.retriever import TextChunk

In [None]:
from functools import lru_cache
from langchain_openai import OpenAIEmbeddings


@lru_cache()
def get_openai_embeddings(
    model: str = "text-embedding-3-large",
    dimensions: int = 256,
) -> OpenAIEmbeddings:
    """Return a memoized ``OpenAIEmbeddings`` client.

    The function is wrapped in ``lru_cache``, so calling it again with the
    same arguments returns the previously built instance instead of
    constructing a new one.

    Args:
        model: Forwarded as the ``model`` argument of ``OpenAIEmbeddings``.
        dimensions: Forwarded as the ``dimensions`` argument of
            ``OpenAIEmbeddings``.

    Returns:
        The ``OpenAIEmbeddings`` instance built from the given arguments.
    """
    client_kwargs = {"model": model, "dimensions": dimensions}
    return OpenAIEmbeddings(**client_kwargs)


# Build the embeddings client with the function's default arguments; the bare
# expression on the last line shows its repr as the cell output.
dense_embeddings = get_openai_embeddings()
dense_embeddings

In [None]:
# Single Retriever instance, configured with the dense embeddings client, that
# the collection and search cells of this notebook operate on.
retriever = Retriever(dense_embeddings=dense_embeddings)

In [None]:
# Location of the SQLite database queried by the `pd.read_sql` cells.
# NOTE(review): the path is absolute and hard-coded, and `conn` is never closed
# in the visible cells. `sqlite3.connect` also creates an empty database file
# when the path does not exist, so a wrong path only surfaces later as a
# "no such table" error.
DB_PATH = "/resources/db/data.db"
conn = sqlite3.connect(DB_PATH)

## Legisladores - Diputados


In [None]:
# Load the full `legisladores_diputados` table into a DataFrame and preview
# its first rows.
legisladores_diputados = pd.read_sql("SELECT * FROM legisladores_diputados", conn)
legisladores_diputados.head()

In [None]:
# Print the column dtypes and non-null counts to spot missing values.
legisladores_diputados.info()

In [None]:
# Take the last row of the table as a sample record and pretty-print it.
row = legisladores_diputados.iloc[-1]
pprint(row)

In [None]:
def row_to_text_chunk(row: pd.Series, text_column: str) -> TextChunk:
    """Turn one DataFrame row into a ``TextChunk``.

    The value stored under ``text_column`` becomes the chunk text; all the
    remaining fields of the row are passed along as metadata.

    Args:
        row: The row to convert.
        text_column: Name of the field that holds the chunk text.

    Returns:
        A ``TextChunk`` built from the row.

    Raises:
        KeyError: If ``text_column`` is not a field of ``row``.
    """
    fields = row.to_dict()
    # The pop runs before `fields` is handed over as metadata, so the text
    # value is not duplicated inside the metadata dict.
    return TextChunk(text=fields.pop(text_column), metadata=fields)

In [None]:
# Sanity-check the conversion on the sample row, using "nombre" as the text.
pprint(row_to_text_chunk(row, text_column="nombre"))

In [None]:
# Convert every row of the table into a TextChunk whose text is the "nombre"
# field.
documents = list(
    row_to_text_chunk(record, text_column="nombre")
    for _idx, record in legisladores_diputados.iterrows()
)

# Show only the last ten chunks to keep the cell output short.
pprint(documents[-10:])

In [None]:
# Create the collection and insert the chunks held in `documents`.
# NOTE(review): re-running this cell calls `create_collection` and
# `insert_text_chunks` again; confirm the Retriever tolerates an existing
# collection and does not duplicate entries.
collection_name = "legisladores-diputados"

retriever.create_collection(collection_name=collection_name)
retriever.insert_text_chunks(
    collection_name=collection_name,
    text_chunks=documents,
)

In [None]:
# Query the collection with a natural-language question. `hybrid_search` is
# awaited at top level, which relies on the notebook kernel running cells
# inside an event loop (IPython autoawait).
# `k=3` presumably caps the number of returned hits; confirm against Retriever.
results = await retriever.hybrid_search(
    collection_name=collection_name,
    query="Quiero saber cuáles fueron los votos de Myriam Bregman",
    k=3,
)

pprint(results)

## Asuntos - Diputados


In [None]:
# Load the full `asuntos_diputados` table into a DataFrame and preview its
# first rows.
asuntos_diputados = pd.read_sql("SELECT * FROM asuntos_diputados", conn)
asuntos_diputados.head()

In [None]:
# Print the column dtypes and non-null counts to spot missing values.
asuntos_diputados.info()

In [None]:
def row_to_text_chunk_asuntos(row: pd.Series, text_columns: list[str]) -> TextChunk:
    """Build a ``TextChunk`` whose text joins several fields of a row.

    The values of ``text_columns`` are concatenated in the given order,
    separated by ``" - "``. Missing values contribute an empty string, so the
    separator is still emitted for them. The metadata is the complete row,
    including the fields that were used for the text.

    Args:
        row: The row to convert.
        text_columns: Names of the fields to concatenate into the chunk text.

    Returns:
        A ``TextChunk`` with the joined text and the full row as metadata.

    Raises:
        KeyError: If any name in ``text_columns`` is not a field of ``row``.
    """

    def _as_text(value):
        # `pd.isna` covers None, float NaN and pd.NA. A plain `value or ""`
        # only neutralises None: NaN is truthy and would reach `str.join`
        # (TypeError), and pd.NA raises when evaluated as a boolean.
        if pd.isna(value):
            return ""
        return value or ""

    text = " - ".join(_as_text(row[col]) for col in text_columns)
    metadata = row.to_dict()

    return TextChunk(text=text, metadata=metadata)

In [None]:
# Sanity-check the conversion on the last row of the table and display the
# result of the chunk's `model_dump()`.
row_to_text_chunk_asuntos(
    asuntos_diputados.iloc[-1], text_columns=["asunto", "titulo"]
).model_dump()

In [None]:
# Convert every row of the table into a TextChunk whose text joins the
# "asunto" and "titulo" fields. This rebinds `documents` from the previous
# section.
documents = list(
    row_to_text_chunk_asuntos(record, text_columns=["asunto", "titulo"])
    for _idx, record in asuntos_diputados.iterrows()
)

# Show only the last ten chunks to keep the cell output short.
pprint(documents[-10:])

In [None]:
# Create the collection and insert the chunks held in `documents`.
# This rebinds `collection_name`, so cells run afterwards target this
# collection.
# NOTE(review): re-running this cell calls `create_collection` and
# `insert_text_chunks` again; confirm the Retriever tolerates an existing
# collection and does not duplicate entries.
collection_name = "asuntos-diputados"

retriever.create_collection(collection_name=collection_name)
retriever.insert_text_chunks(
    collection_name=collection_name,
    text_chunks=documents,
)

In [None]:
# Query the collection with a natural-language question. `hybrid_search` is
# awaited at top level, which relies on the notebook kernel running cells
# inside an event loop (IPython autoawait).
# `k=3` presumably caps the number of returned hits; confirm against Retriever.
results = await retriever.hybrid_search(
    collection_name=collection_name,
    query="Traeme información sobre 'Ficha Limpia'",
    k=3,
)

pprint(results)