In [1]:
from imports import *
from functions import *
from setup import GB_PATH, PERSIAN_DATA, CHROMA_PATH, CHUNK_SIZE, CHUNK_OVERLAP,\
                     BATCH_SIZE, NUM_CTX, LLM_MODEL, PROMPT_TEMPLATE, LLM_TEMP, \
                     K_RESULTS, PRINT_PROMPT, TOP_K 

# Populate the DB

Extract text from PDF

In [None]:
def load_documents(DATA_PATH):
    """Load documents from ``DATA_PATH`` with ``PyPDFDirectoryLoader``.

    Args:
        DATA_PATH: Location handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever the loader's ``load()`` call produces.
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()


def split_documents(documents: list[Document]):
    """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

    The splitter is configured from the module-level ``CHUNK_SIZE`` and
    ``CHUNK_OVERLAP`` settings, measures length with ``len`` and treats its
    separators as plain strings (not regular expressions).

    Args:
        documents: The documents to split.

    Returns:
        The result of the splitter's ``split_documents`` call.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)

def add_to_chroma(chunks: list[Document]):
    """Insert into the Chroma store every chunk whose id is not stored yet.

    Chunks are first passed through ``calculate_chunks_ids``; the resulting
    ``metadata["id"]`` of each chunk is compared with the ids already present
    in the store at ``CHROMA_PATH``. Only unseen chunks are added. When more
    than ``BATCH_SIZE`` chunks are new, they are added in consecutive slices
    of ``BATCH_SIZE``. Progress is reported with ``print``.

    Args:
        chunks: Document chunks to store.

    Returns:
        None.
    """
    identified = calculate_chunks_ids(chunks)

    store = Chroma(
        persist_directory=CHROMA_PATH,
        embedding_function=get_embedding_function(),
        relevance_score_fn=get_distance_func(),
    )

    # An empty ``include`` still returns the ids, which is all we need here.
    known_ids = set(store.get(include=[])["ids"])
    print(f"ℹ️ Number of existing documents in DB : {len(known_ids)}")

    # Keep only the chunks that were never added before.
    pending = [item for item in identified if item.metadata["id"] not in known_ids]
    if not pending:
        print(f"✅ No new documents to add")
        return

    print(f"👉 Adding new documents: {len(pending)}")
    pending_ids = [item.metadata["id"] for item in pending]

    # Small enough to go in with a single call.
    if len(pending) <= BATCH_SIZE:
        store.add_documents(pending, ids=pending_ids)
        print("✅ Documents added successfully!")
        return

    # Otherwise add consecutive slices of BATCH_SIZE chunks, one call each.
    starts = range(0, len(pending), BATCH_SIZE)
    batch_count = len(starts)
    for batch_number, start in enumerate(starts, start=1):
        stop = start + BATCH_SIZE
        store.add_documents(pending[start:stop], ids=pending_ids[start:stop])
        print(f"✅ Documents added successfully [batch {batch_number} on {batch_count}]")


In [None]:
gb_documents = load_documents(GB_PATH)

# Show every page: a header line, up to the first 1000 characters of the page
# text, then a separator line.
total_pages = len(gb_documents)
separator = "\n" + "+" + "-" * 50 + "\n"
for i, doc in enumerate(gb_documents):
    print(f"Page {i+1} of {total_pages}:", doc.page_content[:1000], separator, sep="\n")

Page 1 of 173:
Gabriel García Márquez 
Cien años de soledad

+--------------------------------------------------

Page 2 of 173:
Para Jomi García Ascot 
y María Luisa Elio

+--------------------------------------------------

Page 3 of 173:
Cien años de soledad 
Gabriel  García Márquez 
 3 
 
I 
 
Muchos años después, frente al pelotón de fusilamiento, el coronel Aureliano Buendía había de 
recordar aquella tarde remota en que su padre lo llevó a conocer el hielo. Macondo era entonces 
una aldea de veinte casas de barro y cañabrava construidas a la orilla de un río de aguas diáfanas 
que se precipitaban por un lecho de piedras pulidas, blancas y enormes como huevos 
prehistóricos. El mundo era tan reciente, que muchas cosas carecían de nombre, y para 
mencionarlas había que señalarías con el dedo. Todos los años, por el mes de marzo, una familia 
de gitanos desarrapados plantaba su carpa cerca de la aldea, y con un grande alboroto de pitos y 
timbales daban a conocer los nuevos invento

Load data into the DB

In [None]:
def parse_doc_and_add_to_db():
    """Build or refresh the Chroma store from the documents under ``GB_PATH``.

    An optional ``--reset`` command-line flag is parsed; when it is given the
    database is cleared with ``clear_database()`` first. The documents are then
    loaded, split into chunks, and the whole chunk list is handed to
    ``add_to_chroma`` in one call.

    Side effects:
        When run from a Jupyter kernel, ``sys.argv`` is truncated to its first
        element so that ``argparse`` does not see the kernel's own arguments.
    """
    # Avoid Jupyter-specific arguments that aren't relevant to the script
    if 'ipykernel_launcher' in sys.argv[0]:
        sys.argv = sys.argv[:1]  # Remove unwanted arguments for Jupyter execution
    # Check if the database should be cleared (using the --reset flag).
    parser = argparse.ArgumentParser()
    parser.add_argument("--reset", action="store_true", help="Reset the database.")
    args = parser.parse_args()
    if args.reset:
        print("✨ Clearing Database")
        clear_database()

    # Create (or update) the data store.
    print("ℹ️ Loading documents from 'data' folder.")
    documents = load_documents(GB_PATH)
    print("ℹ️ Splitting document(s).")
    chunks = split_documents(documents)
    print("ℹ️ Adding document(s) to chroma DB.")

    # add_to_chroma takes the full list of chunks: it assigns ids, filters out
    # the ones already stored and batches the inserts itself. Calling it once
    # per chunk would pass a single Document where a list is expected and
    # would reopen the store for every chunk.
    add_to_chroma(chunks)

In [None]:
# Run the ingestion pipeline: load the documents, split them and store the chunks in Chroma.
parse_doc_and_add_to_db()

ℹ️ Loading documents from 'data' folder.
ℹ️ Splitting document(s).
ℹ️ Adding document(s) to chroma DB.
ℹ️ Number of existing documents in DB : 0
👉 Adding new documents: 2316
✅ Documents added successfully!


# Query Data

Query Text

In [None]:
# Sample question text; presumably consumed by the query cells further down — verify.
QUERY = "What are the best prompts i could make to ask about the book ?"

In [None]:
# from langchain_text_splitters import splitter