In [1]:
import polars as pl
from cvm_rag import preprocessing
import os

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the 2024 FRE CSV: ';'-separated, Latin-1 encoded, with date-like
# columns parsed automatically.
fre_data = pl.read_csv(
    "data/fre_cia_aberta_2024/fre_cia_aberta_2024.csv",
    separator=';',
    encoding='latin1',
    try_parse_dates=True,
)

# One row per (CNPJ_CIA, DENOM_CIA) pair holding the maximum DT_RECEB seen
# for that pair.
max_fre_by_cia = (
    fre_data[["CNPJ_CIA", "DENOM_CIA", "VERSAO", "DT_RECEB"]]
    .group_by(["CNPJ_CIA", "DENOM_CIA"])
    .agg(pl.max("DT_RECEB"))
)

In [6]:
# Keep only the rows of fre_data whose values match max_fre_by_cia on every
# one of its columns (inner join).
last_fre_data = fre_data.join(
    max_fre_by_cia,
    on=max_fre_by_cia.columns,
    how="inner",
)

In [5]:
# Presumably collects the files found under "downloads" — confirm against
# cvm_rag.preprocessing.get_list_files.
# NOTE(review): downloaded_files is not used by any later cell visible here.
downloaded_files = preprocessing.get_list_files("downloads")

In [7]:
# Presumably collects the files found under "pdfs" — confirm against
# cvm_rag.preprocessing.get_list_files. The result is passed to
# preprocessing.process_pdfs_with_db below.
pdf_files = preprocessing.get_list_files("pdfs")

In [None]:
# Connection settings handed to preprocessing.process_pdfs_with_db.
# Host and password are read from the environment so that no secret is stored
# in the notebook itself.
required_env_vars = ("RDS_DB_URL", "RDS_CVM_RAG_DB_PW")
missing_env_vars = [name for name in required_env_vars if not os.getenv(name)]
if missing_env_vars:
    # Fail fast: os.getenv() returns None for an unset variable, and that None
    # would otherwise be passed on silently as the host or password.
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_env_vars)
    )

conn_params = {
    'host': os.environ["RDS_DB_URL"],
    'port': '5432',
    # NOTE(review): 'db_name' looks like a placeholder — confirm the real database name.
    'dbname': 'db_name',
    'user': 'postgres',
    'password': os.environ["RDS_CVM_RAG_DB_PW"],
}

# Process every entry of pdf_files and store the resulting chunks in the
# database (the helper logs one "Stored data for N chunks" line per batch).
preprocessing.process_pdfs_with_db(pdf_files, conn_params)

Successfully inserted 261 records into the database.
Stored data for 261 chunks in the database.
Successfully inserted 725 records into the database.
Stored data for 725 chunks in the database.
Successfully inserted 390 records into the database.
Stored data for 390 chunks in the database.
Successfully inserted 392 records into the database.
Stored data for 392 chunks in the database.
Successfully inserted 230 records into the database.
Stored data for 230 chunks in the database.
Successfully inserted 25 records into the database.
Stored data for 25 chunks in the database.
Successfully inserted 778 records into the database.
Stored data for 778 chunks in the database.
Successfully inserted 1078 records into the database.
Stored data for 1078 chunks in the database.
Successfully inserted 322 records into the database.
Stored data for 322 chunks in the database.
Successfully inserted 266 records into the database.
Stored data for 266 chunks in the database.
Successfully inserted 577 reco

In [9]:
# Natural-language question (in Portuguese) used as the test query.
input_query = """O que você sabe sobre a Aguas do Rio?"""

# Turn the question into an embedding vector via the project helper.
# Presumably the same embedding model used when the PDF chunks were stored —
# TODO confirm in cvm_rag.preprocessing.
input_query_embedding = preprocessing.generate_embeddings(input_query)

In [10]:
# Display the embedding for a visual check (rendered as a NumPy array).
# NOTE(review): this dumps the whole vector; a slice or .shape would keep the
# saved notebook smaller.
input_query_embedding

array([ 0.37611458, -0.17901197,  0.13942601,  0.15601122, -0.05080128,
       -0.26450637,  0.23096907,  0.18026076,  0.1113631 , -0.11023039,
       -0.04208172, -0.34783584, -0.32740638,  0.13918903, -0.32090855,
       -0.2136436 ,  0.13863204,  0.05688524, -0.15466468, -0.1384759 ,
        0.29137197,  0.0699602 , -0.1612442 ,  0.0986677 ,  0.13788322,
        0.16524021, -0.1937548 ,  0.1041615 , -0.01076231, -0.26597527,
       -0.11739872, -0.1621374 ,  0.06775843,  0.13202341, -0.17164962,
        0.12628886,  0.11562718,  0.08518589,  0.1435351 ,  0.07559364,
        0.18047643, -0.2414477 ,  0.3486562 ,  0.46097818,  0.0824427 ,
       -0.06900313, -0.08611255, -0.19389036, -0.06432167,  0.04831737,
        0.28629112, -0.33039874, -0.02447213, -0.35038954, -0.00096513,
       -0.19176862, -0.20110448,  0.01890185,  0.03045044,  0.13142182,
        0.40494463,  0.13777472, -0.40351376,  0.28666458, -0.39631113,
        0.03734676, -0.0277831 ,  0.5938088 ,  0.08839406, -0.26

In [11]:
# Serialise the embedding as one comma-separated string: each component is
# converted with str() and the pieces are joined without spaces.
query_embedding_str = ','.join(
    str(component) for component in input_query_embedding
)
query_embedding_str

'0.37611458,-0.17901197,0.13942601,0.15601122,-0.050801285,-0.26450637,0.23096907,0.18026076,0.1113631,-0.110230386,-0.04208172,-0.34783584,-0.32740638,0.13918903,-0.32090855,-0.2136436,0.13863204,0.056885235,-0.15466468,-0.1384759,0.29137197,0.0699602,-0.1612442,0.098667696,0.13788322,0.16524021,-0.1937548,0.1041615,-0.010762311,-0.26597527,-0.117398724,-0.1621374,0.067758426,0.13202341,-0.17164962,0.12628886,0.115627185,0.085185885,0.1435351,0.07559364,0.18047643,-0.2414477,0.3486562,0.46097818,0.0824427,-0.06900313,-0.08611255,-0.19389036,-0.064321674,0.048317373,0.28629112,-0.33039874,-0.024472134,-0.35038954,-0.000965126,-0.19176862,-0.20110448,0.018901853,0.030450443,0.13142182,0.40494463,0.13777472,-0.40351376,0.28666458,-0.39631113,0.037346758,-0.027783101,0.5938088,0.08839406,-0.2651445,-0.035423927,-0.06179363,0.23697248,-0.00079882704,-0.20650972,-0.16613078,0.04798495,0.3463273,-0.40000305,0.1802988,0.1863486,-0.0016785368,-0.04555308,0.15788114,0.01725291,0.10732681,0.1894