In [1]:
import polars
from fundar import json

# Flatten the presidente -> sources mapping stored in the JSON file into a
# frame with one row per (source, presidente) pair.
docs_por_presidencia = polars.DataFrame([
    {'source': documento, 'presidente': nombre}
    for nombre, documentos in json.load('../data/docs_por_presidencia.json').items()
    for documento in documentos
])

# Distinct president names, in order of first appearance in the frame.
presidentes = (
    docs_por_presidencia
        .get_column('presidente')
        .unique(maintain_order=True)
        .to_list()
)

# Lazy scans restricted to the columns listed; nothing is read until collected.
arg_embeddings = polars.scan_parquet('../data/arg_embeddings.parquet').select(['vector_id', 'source'])
arg_tokencount = polars.scan_parquet('../data/arg_token_counts.parquet').select(['vector_id', 'token_count'])

# Distinct question ids, in order of first appearance in the CSV.
preguntas = (
    polars
        .read_csv('../data/preguntas_clean_arg.csv')
        .get_column('qid')
        .unique(maintain_order=True)
        .to_list()
)

In [2]:
# Token budget for each (pregunta, presidente) output file: chunks are taken
# in descending score order while their running token total stays within it.
MAX_TOKENS = 20_000

for presidente in presidentes:
    # The president restriction does not depend on the question, so build it
    # once here and apply it to the small lookup table before joining.
    docs_presidente = (
        docs_por_presidencia
            .lazy()
            .filter(polars.col('presidente') == presidente)
    )

    for pregunta in preguntas:
        df = (
            polars
                .scan_parquet(f"../data/{pregunta}_normalized_score.parquet")
                .join(arg_embeddings, on='vector_id', how='left')
                .join(arg_tokencount, on='vector_id', how='left')
                # Default (inner) join: only chunks whose source belongs to
                # this president are kept.
                .join(docs_presidente, on='source')
                # vector_id breaks ties between equal scores; without it the
                # relative order of tied rows is not guaranteed, so the set of
                # chunks that fits in the budget could differ between runs.
                .sort(['normalized_score', 'vector_id'], descending=[True, False])
                .with_columns(polars.col('token_count').cum_sum().alias('cumsum'))
                .collect()
        )

        # The budget cut is applied eagerly, after the running total has been
        # materialised.
        # NOTE(review): a chunk with no token_count (left-join miss) presumably
        # gets a null 'cumsum' and is dropped by this filter — confirm that is
        # the intended behaviour.
        filtered_df = df.filter(polars.col('cumsum') <= MAX_TOKENS)

        filtered_df.write_parquet(f'../data/relevant_chunks_{pregunta}-{presidente}.parquet')