# RAG using Langchain

## Packages loading & import

In [2]:
# No need to install this here due to running locally with uv venv
# !pip install langchain
# !pip install langchain_community
# !pip install langchain_huggingface
# !pip install langchain_text_splitters
# !pip install langchain_chroma
# !pip install rank-bm25
# !pip install huggingface_hub

In [1]:
import os
import json
import bs4
import nltk
import torch
import pickle
import numpy as np

# from pyserini.index import IndexWriter
# from pyserini.search import SimpleSearcher
from numpy.linalg import norm
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

from langchain_community.llms import Ollama
#imports changed due to environment errors
# from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_classic.chains.combine_documents import create_stuff_documents_chain
# from langchain.chains import create_retrieval_chain
from langchain_classic.chains import create_retrieval_chain
# from langchain.vectorstores import Chroma
from langchain_classic.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import JinaEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
# from langchain.docstore.document import Document
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter, TokenTextSplitter
from langchain_classic.docstore.document import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.document_loaders import WebBaseLoader
from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer

from tqdm import tqdm
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/didiersalest/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/didiersalest/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Hugging face login
- Please apply the model first: https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
- If you haven't been granted access to this model, you can use other LLM model that doesn't have to apply.
- You must save the hf token otherwise you need to regenrate the token everytime.
- When using Ollama, no login is required to access and utilize the llama model.

In [4]:
from huggingface_hub import login
env_path = "./config/.env"
load_dotenv(dotenv_path=env_path)
hf_token = os.getenv("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [5]:
!huggingface-cli whoami

didiersalest


## TODO1: Set up the environment of Ollama

### Introduction to Ollama
- Ollama is a platform designed for running and managing large language models (LLMs) directly **on local devices**, providing a balance between performance, privacy, and control.
- There are also other tools support users to manage LLM on local devices and accelerate it like *vllm*, *Llamafile*, *GPT4ALL*...etc.

### Launch colabxterm

In [6]:
# I ran the code locally in our server, so I installed it outside this notebook, as a proof ollama is running you can see:
! ollama list

NAME                       ID              SIZE      MODIFIED       
llama3.2:1b                baf6a787fdff    1.3 GB    26 minutes ago    
gemma3:12b                 f4031aab637d    8.1 GB    4 months ago      
deepseek-r1:14b            c333b7232bdb    9.0 GB    4 months ago      
nexusraven:13b             483a8282af74    7.4 GB    4 months ago      
llama2:13b-chat-q8_0       303a83449a06    13 GB     4 months ago      
mixtral:8x7b               a3b6bef0f836    26 GB     4 months ago      
nomic-embed-text:v1.5      0a109f422b47    274 MB    4 months ago      
nomic-embed-text:latest    0a109f422b47    274 MB    4 months ago      
gemma3:12b-it-qat          5d4fa005e7bb    8.9 GB    4 months ago      
gemma3:4b                  a2af6cc3eb7f    3.3 GB    4 months ago      
gemma3:1b-it-qat           b491bd3989c6    1.0 GB    4 months ago      
gemma3:1b                  8648f39daa8f    815 MB    4 months ago      
gemma3:4b-it-qat           d01ad0579247    4.0 GB    4 months ago  

In [2]:
# TODO1-1: You should install colab-xterm and launch it.
# Write your commands here.

In [None]:
# TODO1-2: You should install Ollama.
# You may need root privileges if you use a local machine instead of Colab.

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [None]:
%xterm

In [None]:
# TODO1-3: Pull Llama3.2:1b via Ollama and start the Ollama service in the xterm
# Write your commands in the xterm

## Ollama testing
You can test your Ollama status with the following cells.

In [7]:
# Setting up the model that this tutorial will use
MODEL = "llama3.2:1b" # https://ollama.com/library/llama3.2:3b
EMBED_MODEL = "jinaai/jina-embeddings-v2-base-en"

In [8]:
# Initialize an instance of the Ollama model
llm = Ollama(model=MODEL)
# Invoke the model to generate responses
response = llm.invoke("What is the capital of Taiwan?")
print(response)

  llm = Ollama(model=MODEL)


The capital of Taiwan is Taipei.


## Build a simple RAG system by using LangChain

### TODO2: Load the cat-facts dataset and prepare the retrieval database

In [9]:
!wget https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt

--2025-12-01 16:39:40--  https://huggingface.co/ngxson/demo_simple_rag_py/resolve/main/cat-facts.txt
Resolving huggingface.co (huggingface.co)... 3.169.137.119, 3.169.137.19, 3.169.137.111, ...
Connecting to huggingface.co (huggingface.co)|3.169.137.119|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: /api/resolve-cache/models/ngxson/demo_simple_rag_py/ccd6b7b72b52c7ca4e8f2a0a00b15c368d6ae294/cat-facts.txt?%2Fngxson%2Fdemo_simple_rag_py%2Fresolve%2Fmain%2Fcat-facts.txt=&etag=%22bc94ddd9483183e01bcf61e8bf9450fe3e09edb3%22 [following]
--2025-12-01 16:39:40--  https://huggingface.co/api/resolve-cache/models/ngxson/demo_simple_rag_py/ccd6b7b72b52c7ca4e8f2a0a00b15c368d6ae294/cat-facts.txt?%2Fngxson%2Fdemo_simple_rag_py%2Fresolve%2Fmain%2Fcat-facts.txt=&etag=%22bc94ddd9483183e01bcf61e8bf9450fe3e09edb3%22
Reusing existing connection to huggingface.co:443.
HTTP request sent, awaiting response... 200 OK
Length: 22657 (22K) [text/plain]
Saving to: ‘cat-

In [51]:
# TODO2-1: Load the cat-facts dataset (as `refs`, which is a list of strings for all the cat facts)
# Write your code here

refs = []

with open("cat-facts.txt", "r") as f:
    for line in f.readlines():
        refs.append(line.strip())

refs

['On average, cats spend 2/3 of every day sleeping. That means a nine-year-old cat has been awake for only three years of its life.',
 'Unlike dogs, cats do not have a sweet tooth. Scientists believe this is due to a mutation in a key taste receptor.',
 'When a cat chases its prey, it keeps its head level. Dogs and humans bob their heads up and down.',
 'The technical term for a cat’s hairball is a “bezoar.”',
 'A group of cats is called a “clowder.”',
 'Female cats tend to be right pawed, while male cats are more often left pawed. Interestingly, while 90% of humans are right handed, the remaining 10% of lefties also tend to be male.',
 'A cat can’t climb head first down a tree because every claw on a cat’s paw points the same way. To get down from a tree, a cat must back down.',
 'Cats make about 100 different sounds. Dogs make only about 10.',
 'A cat’s brain is biologically more similar to a human brain than it is to a dog’s. Both humans and cats have identical regions in their brai

In [11]:
from langchain_core.documents import Document
docs = [Document(page_content=doc, metadata={"id": i}) for i, doc in enumerate(refs)]

In [52]:
# Create an embedding model
model_kwargs = {'trust_remote_code': True}
encode_kwargs = {'normalize_embeddings': False}
embeddings_model = HuggingFaceEmbeddings(
    model_name=EMBED_MODEL,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [53]:
# TODO2-2: Prepare the retrieval database
# You should create a Chroma vector store.
# search_type can be “similarity” (default), “mmr”, or “similarity_score_threshold”
vector_store = Chroma.from_documents(
    # Write your code here
    documents=docs,
    embedding=embeddings_model,
)
# retriever = vector_store.as_retriever(
#     # Write your code here
#     search_type="similarity_score_threshold",
#     search_kwargs={"score_threshold": 0.8},
# )
retriever = vector_store.as_retriever(
    # Write your code here
    search_kwargs={"k": 5}
)

### Prompt setting

In [54]:
# TODO3: Set up the `system_prompt` and configure the prompt.
system_prompt = """You are a retrieval augmented generation system (RAG) agent that answers
questions given the retrieved context for grounding. Answer the following question accurately using the context:"""# Write your code here
prompt = ChatPromptTemplate.from_messages(
    messages = [
        ("system", system_prompt),
        ("human", "# **Question:**\n `{input}`\n# **Context Retrieved:** \n`{context}`"),
    ],
    template_format="f-string"
)

- For the vectorspace, the common algorithm would be used like Faiss, Chroma...(https://python.langchain.com/docs/integrations/vectorstores/) to deal with the extreme huge database.

In [55]:
# TODO4: Build and run the RAG system
# TODO4-1: Load the QA chain
# You should create a chain for passing a list of Documents to a model.
question_answer_chain = create_stuff_documents_chain(llm, prompt, document_variable_name="context") # Write your code here

# TODO4-2: Create retrieval chain
# You should create retrieval chain that retrieves documents and then passes them on.
chain = create_retrieval_chain(retriever, question_answer_chain) # Write your code here


In [56]:
# Question (queries) and answer pairs
# Write your code here
# Please load the questions_answers.txt file and prepare the `queries` and `answers` lists.
# Questions queries
queries = []
# Corresponding answers
answers = []
count = 1
with open("questions_answers.txt", "r") as f:
    for line in f.readlines():
        line = line.strip()
        if (count != 3):
            if count == 1:
                queries.append(line)
            elif count == 2:
                answers.append(line)
            count+=1
        else:
            count=1

In [None]:
# output_dir = "./results"
# os.makedirs(output_dir, exist_ok=True)
# results = {}
# responses = []
# results = {"questions": queries, "ground-truths": answers, "answers": []}

# with open(f"{output_dir}/results.json", "w") as f:
#     json.dump(results, f, indent=4)

In [59]:
correct_answers = 0
recall_at_1_num = 0
recall_at_5_num = 0
output_dir = "./results"
os.makedirs(output_dir, exist_ok=True)
results = {}
responses = []
for i, query in tqdm(enumerate(queries), total=len(queries)):
    # TODO4-3: Run the RAG system
    response = chain.invoke({"input": query}) # Write your code here
    responses.append(response["answer"])
    retrieved_docs = response['context']
    # The following lines perform evaluations.
    # if the answer shows up in your response, the response is considered correct.
    answer = answers[i]
    if answer.lower() in response["answer"].lower():
        correct_answers+=1
        # tqdm.write(f"Correct answers: {correct_answers}")
        # print(retrieved_docs)
    
    if retrieved_docs:
        if answer.lower() in retrieved_docs[0].page_content.lower():
            recall_at_1_num+=1
            # tqdm.write(f"Answers found in top 1 context: {recall_at_1_num}")
        
        for retrieved_doc in retrieved_docs[:5]:
            if answer.lower() in retrieved_doc.page_content.lower():
                recall_at_5_num+=1
                # tqdm.write(f"Answers found in top 5 context: {recall_at_5_num}")
                break
    
    # Compute recall@1, recall@5 and Accuracy.
    # Store the questions, ground-truths and answers in a json file.

accuracy = correct_answers/len(queries)
recall_at_1 = recall_at_1_num / len(queries)
recall_at_5 = recall_at_5_num / len(queries)
print(f"Accuracy: {accuracy:.2f}")
print(f"Recall@1: {recall_at_1:.2f}")
print(f"Recall@5: {recall_at_5:.2f}")
results = {"questions": queries, "ground-truths": answers, "answers": responses, 
            "metrics": {
            "accuracy": accuracy,
            "recall@1": recall_at_1,
            "recall@5": recall_at_5
        }
    }

with open(f"{output_dir}/results.json", "w") as f:
    json.dump(results, f, indent=4)
# TODO5: Improve to let the LLM correctly answer the ten questions.

100%|██████████| 150/150 [00:25<00:00,  5.93it/s]

Accuracy: 0.57
Recall@1: 0.67
Recall@5: 0.70



