In [None]:
from dotenv import load_dotenv

# Load variables from a local .env file into the environment before the
# remaining imports run.
# NOTE(review): presumably this supplies the OpenAI credentials that
# ChatOpenAI needs further down — confirm.
load_dotenv()

import os
import tqdm
from langchain.docstore.document import Document as LangchainDocument
from langchain_openai import ChatOpenAI

# Import from helpers
from rag_helpers import (
    read_jsonl,
    split_documents,
    load_embeddings,
    answer_with_rag,
    RAG_PROMPT_TEMPLATE,
)

# Folder written by the earlier save step; the raw corpus is a JSONL file inside it.
load_dir = "datasets_local/20250914_145157"
raw_path = os.path.join(load_dir, "initial_corpus.jsonl")

# Records read here are indexed below by their "text" and "source" keys.
ds = read_jsonl(raw_path)

# Wrap each record in a LangChain document, carrying its source along as metadata.
RAW_KNOWLEDGE_BASE = list(
    LangchainDocument(
        page_content=record["text"],
        metadata={"source": record["source"]},
    )
    for record in tqdm.tqdm(ds)
)

# Reader LLM: the chat model used to answer questions from the retrieved context.
# The model name is kept in its own constant so it can be referenced separately.
READER_MODEL_NAME = "gpt-4o-mini"
READER_LLM = ChatOpenAI(model=READER_MODEL_NAME, temperature=0.1, max_tokens=512)

100%|██████████| 2647/2647 [00:00<00:00, 632358.76it/s]


In [None]:
# # Load saved raw knowledge base into LangChain documents

# import os, json, tqdm
# from langchain.docstore.document import Document as LangchainDocument


# def read_jsonl(path):
#     if not os.path.exists(path):
#         raise FileNotFoundError(f"Missing file: {path}")
#     with open(path, "r", encoding="utf-8") as f:
#         return [json.loads(line) for line in f]


# # Set folder produced by the save step
# load_dir = "datasets_local/20250914_145157"
# raw_path = os.path.join(load_dir, "initial_corpus.jsonl")

# ds = read_jsonl(raw_path)



# 2. Build our RAG System

### 2.1. Preprocessing documents to build our vector database

- In this part, __we split the documents from our knowledge base into smaller chunks__: these will be the snippets that are picked by the Retriever, to then be ingested by the Reader LLM as supporting elements for its answer.
- The goal is to build semantically relevant snippets: large enough to support an answer, yet small enough to avoid diluting individual ideas.

Many options exist for text splitting:
- split every `n` words / characters, but this has the risk of cutting in half paragraphs or even sentences
- split after `n` words / characters, but only on sentence boundaries
- **recursive split** tries to preserve even more of the document structure by processing it in a tree-like way: splitting first on the largest units (chapters), then recursively splitting on smaller units (paragraphs, sentences).

To learn more about chunking, I recommend you read [this great notebook](https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb) by Greg Kamradt.

[This space](https://huggingface.co/spaces/m-ric/chunk_visualizer) lets you visualize how different splitting options affect the chunks you get.

> In the following, we use Langchain's `RecursiveCharacterTextSplitter`.

💡 _To measure chunk length in our Text Splitter, our length function will not be the count of characters but the count of tokens in the tokenized text: since the downstream embedder processes tokens, measuring length in tokens is more relevant and empirically performs better._

In [46]:
# RAW_KNOWLEDGE_BASE = [
#     LangchainDocument(page_content=doc["text"], metadata={"source": doc["source"]})
#     for doc in tqdm.tqdm(ds)
# ]

In [47]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
import tiktoken


# def split_documents(
#     chunk_size: int,
#     knowledge_base: List[LangchainDocument],
#     tokenizer_name: str,
# ) -> List[LangchainDocument]:
#     """
#     Split documents into chunks of size `chunk_size` tokens and return the deduplicated list of chunks.
#     """
#     text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
#         encoding_name=tokenizer_name,
#         chunk_size=chunk_size,
#         chunk_overlap=int(chunk_size / 10),
#         add_start_index=True,
#         strip_whitespace=True,
#         separators=["\n\n", "\n", ".", " ", ""],
#     )

#     docs_processed = []
#     for doc in knowledge_base:
#         docs_processed += text_splitter.split_documents([doc])

#     # Remove duplicates
#     unique_texts = {}
#     docs_processed_unique = []
#     for doc in docs_processed:
#         if doc.page_content not in unique_texts:
#             unique_texts[doc.page_content] = True
#             docs_processed_unique.append(doc)

#     return docs_processed_unique

### 2.2. Retriever - embeddings 🗂️
The __retriever acts like an internal search engine__: given the user query, it returns the most relevant documents from your knowledge base.

> For the knowledge base, we use LangChain vector stores, since __they offer a convenient [FAISS](https://github.com/facebookresearch/faiss) index and allow us to keep document metadata throughout the processing__.

🛠️ __Options included:__

- Tune the chunking method:
    - Size of the chunks
    - Method: split on different separators, use [semantic chunking](https://python.langchain.com/docs/modules/data_connection/document_transformers/semantic-chunker)...
- Change the embedding model

In [48]:
from langchain.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from typing import Optional, List
import os


# def load_embeddings(
#     langchain_docs: List[LangchainDocument],
#     chunk_size: int,
#     embedding_model_name: Optional[str] = "text-embedding-3-small",
# ) -> FAISS:
#     """
#     Creates a FAISS index from the given embedding model and documents. Loads the index directly if it already exists.

#     Args:
#         langchain_docs: list of documents
#         chunk_size: size of the chunks to split the documents into
#         embedding_model_name: name of the embedding model to use

#     Returns:
#         FAISS index
#     """
#     # load embedding_model
#     embedding_model = OpenAIEmbeddings(
#         model=embedding_model_name,
#     )

#     # Check if embeddings already exist on disk
#     index_name = (
#         f"index_chunk:{chunk_size}_embeddings:{embedding_model_name.replace('/', '~')}"
#     )
#     index_folder_path = f"data/indexes/{index_name}/"
#     if os.path.isdir(index_folder_path):
#         return FAISS.load_local(
#             index_folder_path,
#             embedding_model,
#             allow_dangerous_deserialization=True,
#         )

#     else:
#         print("Index not found, generating it...")
#         docs_processed = split_documents(
#             chunk_size,
#             langchain_docs,
#             "cl100k_base",
#         )
#         knowledge_index = FAISS.from_documents(
#             docs_processed, embedding_model, distance_strategy=DistanceStrategy.COSINE
#         )
#         knowledge_index.save_local(index_folder_path)
#         return knowledge_index

### 2.3. Reader - LLM 💬

In this part, the __LLM Reader reads the retrieved documents to formulate its answer.__

🛠️ Here we tried the following options to improve results:
- Switch reranking on/off
- Change the reader model

In [49]:
# RAG_PROMPT_TEMPLATE = """
# <|system|>
# Using the information contained in the context,
# give a comprehensive answer to the question.
# Respond only to the question asked, response should be concise and relevant to the question.
# Provide the number of the source document when relevant.
# If the answer cannot be deduced from the context, do not give an answer.</s>
# <|user|>
# Context:
# {context}
# ---
# Now here is the question you need to answer.

# Question: {question}
# </s>
# <|assistant|>
# """

In [50]:
# from langchain_openai import ChatOpenAI

# READER_MODEL_NAME = "gpt-4o-mini"

# READER_LLM = ChatOpenAI(
#     model=READER_MODEL_NAME,
#     temperature=0.1,
#     max_tokens=512,
# )

In [51]:
from langchain_core.vectorstores import VectorStore
from langchain_core.language_models.base import BaseLanguageModel
from typing import Optional, List, Tuple


# def answer_with_rag(
#     question: str,
#     llm: BaseLanguageModel,
#     knowledge_index: VectorStore,
#     reranker: Optional[any] = None,
#     num_retrieved_docs: int = 30,
#     num_docs_final: int = 7,
# ) -> Tuple[str, List[LangchainDocument]]:
#     """Answer a question using RAG with the given knowledge index."""
#     # Gather documents with retriever
#     relevant_docs = knowledge_index.similarity_search(
#         query=question, k=num_retrieved_docs
#     )
#     relevant_docs_text = [doc.page_content for doc in relevant_docs]

#     # Optionally rerank results
#     if reranker:
#         reranked = reranker.rerank(question, relevant_docs_text, k=num_docs_final)
#         relevant_docs_text = [doc["content"] for doc in reranked]

#     relevant_docs_text = relevant_docs_text[:num_docs_final]

#     # Build the final prompt
#     context = "\nExtracted documents:\n"
#     context += "".join(
#         [f"Document {str(i)}:::\n" + doc for i, doc in enumerate(relevant_docs_text)]
#     )

#     final_prompt = RAG_PROMPT_TEMPLATE.format(question=question, context=context)

#     # Generate an answer
#     if hasattr(llm, "invoke"):
#         response = llm.invoke(final_prompt)
#         answer = response.content if hasattr(response, "content") else str(response)
#     else:
#         answer = llm(final_prompt)

#     return answer, relevant_docs_text