In [13]:
!uv add langchain langchain-community langchain-text-splitters langchain-huggingface langchain-chroma chromadb tiktoken ijson pandas torch transformers


[2K[2mResolved [1m229 packages[0m [2min 36ms[0m[0m                                        [0m
[2K[2mAudited [1m222 packages[0m [2min 0.02ms[0m[0m                                       [0m


In [31]:
import os, json
from glob import glob
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import TextLoader, JSONLoader, CSVLoader
from langchain_core.documents import Document

In [22]:
# Root folder of the training data; a relative path, so it resolves against the
# kernel's current working directory.
DATA_DIR = "./../final_train/"
# Folder passed to Chroma as its persist_directory.
VECTOR_DB = "./chroma_db"   # local folder, not /app (unless inside Docker)

In [23]:
# Embedding model handed to the vector store; named separately so it is easy to swap.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [24]:
# Chroma store rooted at VECTOR_DB; `embeddings` is its embedding function.
vector_store = Chroma(embedding_function=embeddings, persist_directory=VECTOR_DB)

In [25]:
# Chunking policy applied to every loaded document before it is indexed.
splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,  # ask the splitter to record each chunk's start offset
    chunk_overlap=50,      # overlap carried over between neighbouring chunks
    chunk_size=512,        # upper bound on the size of a single chunk
)

In [36]:
def process_and_add_files(file_paths):
    """Load, chunk, and index every supported file in ``file_paths``.

    Each path is dispatched on its lower-cased extension: ``.txt`` goes to
    ``load_txt``, ``.tsv`` to ``load_tsv`` and ``.json`` to
    ``load_custom_json``. The loaded documents are split with the module-level
    ``splitter`` and the resulting chunks are added to the module-level
    ``vector_store``. Files with any other extension are reported and skipped.

    Args:
        file_paths: Iterable of file-system paths to index.

    Returns:
        None. Progress is reported with ``print``.
    """
    for path in file_paths:
        ext = os.path.splitext(path)[1].lower()
        if ext == ".txt":
            docs = load_txt(path)
        elif ext == ".tsv":
            docs = load_tsv(path)
        elif ext == ".json":
            # NOTE(review): load_custom_json is not defined in the cells visible
            # here -- confirm the cell that defines it runs before this one.
            docs = load_custom_json(path)
        else:
            print(f"⚠️ Skipping unsupported file: {path}")
            continue

        split_docs = splitter.split_documents(docs)
        if not split_docs:
            # A file with no usable text can yield zero chunks. Skip it rather
            # than hand the store an empty batch, so one empty file cannot
            # abort a long indexing run.
            print(f"⚠️ No content to index in: {path}")
            continue

        vector_store.add_documents(split_docs)
        print(f"📚 Indexed {len(split_docs)} chunks from {os.path.basename(path)}")


In [35]:
# NOTE(review): leftover scratch cell. `docs` and `path` are only assigned inside
# process_and_add_files(), so at cell level this fails with
# "NameError: name 'docs' is not defined". Call process_and_add_files(...) instead,
# or delete this cell before sharing the notebook.
split_docs = splitter.split_documents(docs)
vector_store.add_documents(split_docs)
print(f"📚 Indexed {len(split_docs)} chunks from {os.path.basename(path)}")


NameError: name 'docs' is not defined

In [28]:
# Recursively collect the dataset files by type. glob() returns matches in
# arbitrary order, so each list is sorted to keep index-based lookups such as
# json_files[0] stable from one run to the next.
txt_files = sorted(glob(os.path.join(DATA_DIR, "**", "*.txt"), recursive=True))
json_files = sorted(glob(os.path.join(DATA_DIR, "**", "context_*.json"), recursive=True))
tsv_files = sorted(glob(os.path.join(DATA_DIR, "**", "*.tsv"), recursive=True))

print("TXT:", len(txt_files), "JSON:", len(json_files), "TSV:", len(tsv_files))


TXT: 97958 JSON: 27366 TSV: 4


In [26]:
def load_txt(path):
    """Read the text file at ``path`` as UTF-8 and return the loader's documents."""
    loader = TextLoader(path, encoding="utf-8")
    return loader.load()

def load_json(path):
    """Load a JSON file whose root is an array of objects with a ``"text"`` key.

    Every element selected by the ``.[]`` jq expression becomes one document
    whose content is taken from that element's ``"text"`` field.

    Args:
        path: Path of the JSON file to load.

    Returns:
        The documents produced by ``JSONLoader.load()``.
    """
    # JSONLoader names this parameter `content_key`; the previous
    # `text_content_key` keyword is not accepted and raises TypeError.
    loader = JSONLoader(path, jq_schema=".[]", content_key="text")
    return loader.load()

def load_tsv(path):
    """Load the tab-separated file at ``path`` (read as UTF-8) through ``CSVLoader``."""
    tab_delimited = {"delimiter": "\t"}
    loader = CSVLoader(path, csv_args=tab_delimited, encoding="utf-8")
    return loader.load()


In [30]:
# Peek at the first discovered JSON file to learn the shape of its root value.
with open(json_files[0], mode="r", encoding="utf-8") as f:
    data = json.load(f)

print(f"🔎 Type of JSON root: {type(data)}")
if isinstance(data, dict):
    # Only the first three keys are shown.
    keys = list(data)[:3]
    preview = dict((k, data[k]) for k in keys)
    print(f"First few keys:\n {json.dumps(preview, indent=2)}")
elif isinstance(data, list):
    print(f"First 2 entries:\n {json.dumps(data[:2], indent=2)}")


🔎 Type of JSON root: <class 'dict'>
First few keys:
 {
  "context": [
    [
      "Frailea",
      [
        "Frailea is a genus of globular to short cylindrical cacti native to Brazil.",
        " These species are cleistogamous.",
        " They were first classified in the genus \"Echinocactus\"."
      ]
    ],
    [
      "Hoodia alstonii",
      [
        "Hoodia alstonii is a succulent plant native to Namibia and the Cape Province of South Africa.",
        " \"H. alstonii\" is also known commonly as Ghaap, an Afrikaans name.",
        " It tends to grow in rocky, desert areas."
      ]
    ],
    [
      "Hawaii Route 92",
      [
        "Route 92 is a major east\u2013west highway on the island of Oahu which begins at exit 15 off Interstate H-1 in Honolulu and ends 0.6 mi east of the Ala Wai Canal crossing in Waikiki.",
        " The western portion, west of Richards Street, is locally known as the Nimitz Highway (named after Pacific Fleet Admiral during World War II, Chester 

In [20]:
def _index_in_batches(docs, batch_size, source, indexed_so_far):
    """Add ``docs`` to ``vector_store`` in batches and return the updated running total."""
    buffer = []
    total = indexed_so_far
    for doc in docs:
        buffer.append(doc)
        if len(buffer) >= batch_size:
            vector_store.add_documents(buffer)
            total += len(buffer)
            print(f"Indexed {total} docs so far from {source}...")
            buffer = []
    if buffer:
        # Flush the final, partially filled batch.
        vector_store.add_documents(buffer)
        total += len(buffer)
    return total


def ingest_dataset(batch_size: int = 500):
    """Index every ``.json``, ``.tsv`` and ``.txt`` file directly under ``DATA_DIR``.

    JSON and TSV documents are added to the module-level ``vector_store`` in
    batches of ``batch_size``; each text file is added in a single call.
    Files with any other suffix are ignored.

    Args:
        batch_size: Number of documents buffered before each ``add_documents`` call.

    Returns:
        None. Progress and the final summary are reported with ``print``.
    """
    # NOTE(review): this pattern only matches direct children of DATA_DIR, while
    # the discovery cell elsewhere in this notebook searches recursively --
    # confirm which of the two is intended.
    files = glob(os.path.join(DATA_DIR, "*"))
    total_docs = 0

    for file in files:
        if file.endswith(".json"):
            # NOTE(review): stream_json is not defined in the cells visible
            # here -- confirm the cell that defines it runs before this one.
            total_docs = _index_in_batches(stream_json(file), batch_size, file, total_docs)
        elif file.endswith(".tsv"):
            total_docs = _index_in_batches(load_tsv(file), batch_size, file, total_docs)
        elif file.endswith(".txt"):
            docs = list(load_txt(file))
            vector_store.add_documents(docs)
            total_docs += len(docs)
            print(f"Indexed {len(docs)} docs from {file}")

    if total_docs > 0:
        # No explicit persist() call: the langchain_chroma store created with a
        # persist_directory writes to disk on its own and has no persist() method.
        print(f"✅ Finished ingestion: {total_docs} total documents stored in Chroma")
    else:
        print("⚠️ No documents ingested. Check dataset format.")



In [None]:
# Smoke-test retrieval: fetch the three chunks most similar to a sample query.
# Uses `vector_store`, the name under which the Chroma store is created in this notebook.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

query = "What does the dataset say about object detection?"
# invoke() is the supported retriever entry point; get_relevant_documents() is deprecated.
results = retriever.invoke(query)

for i, doc in enumerate(results, 1):
    print(f"\n--- Result {i} ---\n{doc.page_content[:500]}")


In [None]:
import os
from glob import glob

# Debug helper: list whatever sits directly inside the data folder.
DATA_DIR = "../final_train"  # adjust if wrong
top_level_pattern = os.path.join(DATA_DIR, "*")
files = glob(top_level_pattern)
print("Files found:", files)


In [None]:
# NOTE(review): looks like a kernel smoke test -- presumably safe to delete before sharing.
print("hi")