In [1]:
from fundar_llms.api.tokenizers import get_tokenizer
from itertools import product
from fundar import json
from glob import glob
import polars

In [2]:
# Load the JSON mapping (key -> iterable of sources) and flatten it into one
# row per (source, presidente) pair. The raw mapping keeps its own name so
# that `docs_por_presidencia` is only ever bound to the DataFrame.
docs_por_presidencia_raw = json.load('../data/docs_por_presidencia.json')
docs_por_presidencia = polars.DataFrame([
    dict(source=source, presidente=presidente)
    for presidente, sources in docs_por_presidencia_raw.items()
    for source in sources
])

# Distinct (qid, pregunta) pairs in file order. `select` states explicitly
# that two columns are picked; a 2-tuple inside `[]` can also be read as a
# (row, column) lookup.
preguntas_df = (
    polars.read_csv('../data/preguntas_clean_arg.csv')
    .select('qid', 'pregunta')
    .unique(maintain_order=True)
)

# qid -> question text.
# NOTE(review): uniqueness is over the (qid, pregunta) pair, so a qid that
# appears with two different texts would be listed twice in `preguntas_qid`
# while `preguntas_dict` keeps only the last text -- confirm qids are unique.
preguntas_dict = dict(preguntas_df.iter_rows())
preguntas_qid = preguntas_df['qid'].to_list()
presidentes = docs_por_presidencia['presidente'].unique(maintain_order=True).to_list()

# Joined on 'vector_id' further down to recover the 'text' of each chunk.
arg_embeddings = polars.read_parquet('../data/arg_embeddings.parquet')

In [3]:
# Resolve the 'llama3.2' entry through fundar_llms, then build the tokenizer
# object from it.
# NOTE(review): `tokenizer` is not referenced by any of the cells visible
# here -- confirm it is still needed.
llama_tokenizer_spec = get_tokenizer('llama3.2')
tokenizer = llama_tokenizer_spec.auto_tokenizer_from_pretrained()

In [4]:
# Paths (not dataframes) of every relevant-chunks parquet file under ../data
# whose name fits the pattern below.
relevant_chunks_pattern = '../data/relevant_chunks_q?-*.parquet'
relevant_chunks = glob(relevant_chunks_pattern)

In [5]:
# Build one record per (question, presidency) pair: the question text plus
# the newline-joined text of the chunks stored for that pair.
preguntas = []
for qid, presidente in product(preguntas_qid, presidentes):
    # Files are matched by plain substring containment on the whole path, so
    # exactly one path must contain both the qid and the presidency name.
    matching_paths = [
        path for path in relevant_chunks
        if qid in path and presidente in path
    ]
    assert len(matching_paths) == 1, (
        f'expected exactly one relevant-chunks file for qid={qid!r}, '
        f'presidente={presidente!r}; found {len(matching_paths)}: {matching_paths}'
    )

    # Read the single matching file by its path instead of passing the list.
    relevant_chunks_for_question = polars.read_parquet(matching_paths[0])
    # Default (inner) join: rows whose vector_id has no match in
    # arg_embeddings are dropped.
    relevant_chunks_for_question_with_text = relevant_chunks_for_question.join(arg_embeddings, on='vector_id')

    pregunta_text = preguntas_dict[qid]
    # Series.str.join collapses the whole 'text' column into a one-element
    # Series; [0] extracts that single string.
    text = relevant_chunks_for_question_with_text['text'].str.join('\n')[0]

    preguntas.append(dict(
        pregunta = qid,
        pregunta_text = pregunta_text,
        presidente = presidente,
        contexto = text
    ))

In [6]:
# Turn the collected records into a DataFrame and persist it as parquet.
preguntas_frame = polars.DataFrame(preguntas)
preguntas_frame.write_parquet('../data/preguntas.parquet')