In [2]:
!uv add langchain-experimental

[2K[2mResolved [1m230 packages[0m [2min 294ms[0m[0m                                       [0m
[2K[2mPrepared [1m1 package[0m [2min 55ms[0m[0m                                               
         If the cache and target directories are on different filesystems, hardlinking may not be supported.
[2K[2mInstalled [1m1 package[0m [2min 4ms[0m[0mtal==0.3.4                        [0m
 [32m+[39m [1mlangchain-experimental[0m[2m==0.3.4[0m


In [None]:
import os
from glob import glob
from tqdm.notebook import tqdm
from langchain_community.document_loaders import TextLoader
from langchain_experimental.text_splitter import SemanticChunker  # corrected import
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

# Directories
DATA_DIR = "./../../final_train/"
VECTOR_DB = "/app/vector_stores/bhavana_db/semantic_txt_db"
os.makedirs(VECTOR_DB, exist_ok=True)

# Embeddings (small or larger as desired)
embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Vector Store
vector_store = Chroma(
    collection_name="bhavana_semantic_txt",
    embedding_function=embeddings,
    persist_directory=VECTOR_DB
)

# Semantic splitter setup
splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type="percentile",       # detects semantic boundaries
    breakpoint_threshold_amount=95,               # top 5% divergence triggers chunk breaks
    min_chunk_size=100                            # avoid very small chunks
)

# Loader for TXT
def load_txt(path):
    return TextLoader(path, encoding='utf-8').load()

# Process limited files
def process_txt_files(file_paths):
    total = len(file_paths)
    for i, path in enumerate(file_paths[:10000], start=1):
        docs = load_txt(path)
        split_docs = splitter.split_documents(docs)
        print(f"[{i}/{total}] {os.path.basename(path)} → {len(split_docs)} semantic chunks")
        vector_store.add_documents(split_docs)

# Run
txt_files = glob(os.path.join(DATA_DIR, "**", "*.txt"), recursive=True)
print(f"Found {len(txt_files)} TXT files (processing up to 5000)")
process_txt_files(txt_files)

print("▶ Done! Vector DB stored at:", VECTOR_DB)


Found 97958 TXT files (processing up to 5000)
[1/97958] 19393327__WQ__E-2008-4260__EN.txt → 1 semantic chunks
[2/97958] 22860129__WQA__E-2009-3395__EN.txt → 2 semantic chunks
[3/97958] 14701632__QT__H-2007-0537__EN.txt → 2 semantic chunks
[4/97958] 23299635__WQ__E-2009-4626__EN.txt → 1 semantic chunks
[5/97958] 15616943__WQ__E-2007-2963__EN.txt → 2 semantic chunks
[6/97958] 18305684__IM-PRESS__20080423-IPR-27459__EN_8ae3dc.txt → 2 semantic chunks
[7/97958] 30733054__WQA__E-2011-002789__EN.txt → 2 semantic chunks
[8/97958] 32901198__WQ__E-2011-011977__EN.txt → 2 semantic chunks
[9/97958] 29875515__WQ__E-2011-003389__EN.txt → 2 semantic chunks
[10/97958] 23026940__WQ__E-2009-3858__EN.txt → 2 semantic chunks
[11/97958] 342687__PRESS__BI-20021127-1__EN.txt → 7 semantic chunks
[12/97958] 12553909__WQ__E-2006-4819__EN_0c9b50.txt → 2 semantic chunks
[13/97958] 2316098__WQ__E-2003-3266__EN.txt → 2 semantic chunks
[14/97958] 14463272__QT__H-2007-0465__EN.txt → 2 semantic chunks
[15/97958] 36107

In [9]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

VECTOR_DB = "/app/vector_stores/bhavana_db/semantic_txt_db"

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

retriever = Chroma(
    collection_name="bhavana_semantic_txt",
    embedding_function=embeddings,
    persist_directory=VECTOR_DB
).as_retriever(search_kwargs={"k": 5})  # top-5 chunks


In [10]:
query = "What does the training dataset say about financial transactions?"

# New method
results = retriever.invoke(query)

for i, doc in enumerate(results, 1):
    print(f"\n--- Chunk {i} ---\n{doc.page_content[:300]}...")



--- Chunk 1 ---
whether this funding has been used? ...

--- Chunk 2 ---
whether this funding has been used?...

--- Chunk 3 ---
the financial activity report – 11327/10, FIN 278 - point 2.2). ...

--- Chunk 4 ---
Can the Commission confirm this, and, should it be the case, explain how it intends to utilise the unspent amounts over the financial programming period 2014-2020?...

--- Chunk 5 ---
20110830CAN25553 draft report...

--- Chunk 6 ---
Can the Commission give specific information on the feedback received from EFSA on this specific issue?...

--- Chunk 7 ---
How much has it allocated? How much remains unallocated? Are there any outstanding requests for payment? ...

--- Chunk 8 ---
;
7. Calls on the Agency to establish a monitoring system at the level of certification projects to make sure that, over the entire project duration, the fees levied do not deviate significantly from the actual cost;
8. Regrets that, in 2008, the system of annual flat fees generated income which was.