# RAG PIPELINE

## IMPORTS

In [None]:
# One-time setup: run this cell once to add the notebook's dependencies to the project.
!uv add langchain-text-splitters langchain-community langgraph
!uv add "langchain-perplexity"
!uv add langchain-ollama
!uv add chromadb

In [7]:
import requests
import json
import getpass
import os
from glob import glob


from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter


from langchain_community.document_loaders import TextLoader, JSONLoader, CSVLoader


### generation model

### embedding model

In [2]:
# Pull the Ollama model used for embeddings; uncomment the lines below to run the pull.

# OLLAMA_API = "http://ollama:11434/api/pull"
# model_name = "llama3"

# with requests.post(OLLAMA_API, json={"name": model_name}, stream=True) as r:
#     for line in r.iter_lines():
#         if line:
#             data = json.loads(line.decode("utf-8"))
#             print(data)


## Embeddings and vector store

In [4]:
# Duplicate of the import in the notebook's imports cell; harmless but redundant.
from langchain_ollama import OllamaEmbeddings

# Embedding client: requests embeddings from the "llama3" model at the Ollama
# endpoint given by base_url.
embeddings = OllamaEmbeddings(
    model="llama3",
    base_url="http://ollama:11434"  # service name from docker-compose
)

# Chroma will store data locally in /app/chroma_db
# NOTE(review): the same location is declared again as VECTOR_DB in the
# document-loading cell; consider sharing a single constant.
vector_store = Chroma(
    collection_name="my_collection",
    embedding_function=embeddings,
    persist_directory="/app/chroma_db"  # absolute path inside the container
)


## Document loading and splitting

In [13]:
# Settings
DATA_DIR = "./../final_train/"  # root folder that is searched recursively for input files
# NOTE(review): VECTOR_DB is not referenced by the cells visible in this notebook;
# the Chroma store is created with its own hard-coded copy of this path.
VECTOR_DB = "/app/chroma_db/"

# Chunking parameters, named so they can be tuned in one place instead of being
# buried in the splitter call.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 50

# Splits loaded documents into overlapping chunks before they are embedded.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
    add_start_index=True,  # keep each chunk's start offset in its metadata
)

# Loaders
def load_txt(path):
    """Read the UTF-8 text file at ``path`` and return what ``TextLoader.load()`` yields."""
    text_loader = TextLoader(path, encoding='utf-8')
    return text_loader.load()
def load_json(path, jq_schema=None, content_key=None):
    """Load a JSON file into LangChain documents via ``JSONLoader``.

    Args:
        path: Path of the JSON file to load.
        jq_schema: jq expression selecting the records to turn into documents.
            When omitted, a schema matching the ``context_*.json`` layout printed
            in this notebook is used: ``{"context": [[title, [sentence, ...]], ...]}``.
            Each ``[title, sentences]`` pair becomes one document whose text is
            ``"<title>: <sentences joined>"``.
        content_key: Key to read the text from when ``jq_schema`` yields objects.
            Leave as ``None`` when the schema already yields plain strings.

    Returns:
        The documents produced by ``JSONLoader.load()``.
    """
    if jq_schema is None:
        # Sentences in the sample file already carry their leading space, so an
        # empty-string join reconstructs the paragraph.
        jq_schema = '.context[] | .[0] + ": " + (.[1] | join(""))'
    # JSONLoader's keyword is `content_key`; the previous `text_content_key`
    # is not an accepted argument and made every call fail with TypeError.
    return JSONLoader(path, jq_schema=jq_schema, content_key=content_key).load()
def load_tsv(path):
    """Read the tab-separated file at ``path`` (UTF-8) and return what ``CSVLoader.load()`` yields."""
    tab_separated = {'delimiter': '\t'}
    tsv_loader = CSVLoader(path, encoding='utf-8', csv_args=tab_separated)
    return tsv_loader.load()

# Automated dispatcher
def process_and_add_files(file_paths, loader_fn, store=None, text_splitter=None):
    """Load, chunk and index every file in ``file_paths``.

    Args:
        file_paths: Iterable of file paths to index.
        loader_fn: Callable taking a path and returning the loaded documents
            (e.g. ``load_txt``, ``load_json``, ``load_tsv``).
        store: Object exposing ``add_documents(docs)``. Defaults to the
            notebook-level ``vector_store``.
        text_splitter: Object exposing ``split_documents(docs)``. Defaults to
            the notebook-level ``splitter``.

    Prints one status line per file. Files that yield no chunks are reported
    and skipped rather than sent to the store as an empty batch.
    """
    target_store = vector_store if store is None else store
    chunker = splitter if text_splitter is None else text_splitter

    for path in file_paths:
        docs = loader_fn(path)
        # The loaded documents are intentionally not printed: dumping every
        # document floods the cell output.
        split_docs = chunker.split_documents(docs)
        if not split_docs:
            print(f"Skipped {path}: no chunks produced")
            continue
        target_store.add_documents(split_docs)
        print(f"Indexed {len(split_docs)} chunks from {path}")

# Recursively collect files. Results are sorted so the processing order (and
# therefore which file is previewed below) is the same on every run; glob
# itself returns matches in arbitrary order.
txt_files = sorted(glob(os.path.join(DATA_DIR, "**", "*.txt"), recursive=True))
# json_files = glob(os.path.join(DATA_DIR, "**", "*.json"), recursive=True)
json_files = sorted(glob(os.path.join(DATA_DIR, "**", "context_*.json"), recursive=True))
tsv_files = sorted(glob(os.path.join(DATA_DIR, "**", "*.tsv"), recursive=True))

# print(txt_files[:5])
print(json_files[:1])
# print(tsv_files[:5])

# Peek at the first JSON file to inspect its structure. Guarded so an empty
# match (e.g. wrong DATA_DIR) gives a clear message instead of an IndexError.
if json_files:
    with open(json_files[0], "r", encoding="utf-8") as f:
        data = json.load(f)
    # Pretty print only the beginning; a full dump bloats the notebook output.
    print(json.dumps(data, indent=2)[:2000])
else:
    print(f"No context_*.json files found under {DATA_DIR}")

# Run bulk processing
# process_and_add_files(txt_files, load_txt)
# process_and_add_files(json_files, load_json)
# process_and_add_files(tsv_files, load_tsv)


['./../final_train/context_73727.json']
{
  "context": [
    [
      "Frailea",
      [
        "Frailea is a genus of globular to short cylindrical cacti native to Brazil.",
        " These species are cleistogamous.",
        " They were first classified in the genus \"Echinocactus\"."
      ]
    ],
    [
      "Hoodia alstonii",
      [
        "Hoodia alstonii is a succulent plant native to Namibia and the Cape Province of South Africa.",
        " \"H. alstonii\" is also known commonly as Ghaap, an Afrikaans name.",
        " It tends to grow in rocky, desert areas."
      ]
    ],
    [
      "Hawaii Route 92",
      [
        "Route 92 is a major east\u2013west highway on the island of Oahu which begins at exit 15 off Interstate H-1 in Honolulu and ends 0.6 mi east of the Ala Wai Canal crossing in Waikiki.",
        " The western portion, west of Richards Street, is locally known as the Nimitz Highway (named after Pacific Fleet Admiral during World War II, Chester Nimitz).",
  

## Retrieval

In [None]:
def retrieve_documents(query, chromadb_vectorstore, embeddings=None, k=4):
    """Return the ``k`` stored documents most similar to ``query``.

    Args:
        query: Query text to search for.
        chromadb_vectorstore: Vector store exposing ``similarity_search(query, k=...)``.
        embeddings: Unused; kept so existing calls that pass it keep working.
            It is not forwarded because ``similarity_search`` does not take an
            ``embeddings`` keyword — forwarding it made the call fail.
        k: Number of results to return.

    Returns:
        Whatever ``similarity_search`` returns (a list of documents for Chroma).
    """
    return chromadb_vectorstore.similarity_search(query, k=k)

# Usage: query the Chroma store created earlier in the notebook. It is bound to
# `vector_store`; `chromadb_vectorstore` is only the function's parameter name
# and does not exist at notebook level.
query = "Explain Task Decomposition"
retrieved_docs = retrieve_documents(query, vector_store, embeddings, k=4)


In [None]:
def print_documents(docs, max_chars=500):
    """Print a numbered preview of each document's text.

    Args:
        docs: Iterable of objects with a ``page_content`` string attribute.
        max_chars: Maximum number of characters of ``page_content`` to print
            per document. Pass ``None`` to print the full text.
    """
    for idx, doc in enumerate(docs, 1):
        print(f"--- Document {idx} ---")
        print(doc.page_content[:max_chars])  # Print only the start for brevity
        print()

# Usage: print a preview of the documents stored in `retrieved_docs` by the retrieval cell.
print_documents(retrieved_docs)
