In [1]:

!pip install llama-cpp-python
!pip install langchain langchain-community sentence-transformers chromadb
!pip install pypdf requests pydantic tqdm

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m52.3 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Using cached typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.3.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Using cached diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting jinja2>=2.11.3 (from llama-cpp-python)
  Using cached jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting MarkupSafe>=2.0 (from jinja2>=2.11.3->llama-cpp-python)
  Using 

In [3]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import LlamaCpp
import requests
from tqdm import tqdm
import time

In [None]:
model_path = "llama-2-7b-chat.Q4_K_M.gguf"
 
if not os.path.exists(model_path):
    print(f"Downloading {model_path}...")
    # You may want to replace the model URL by another of your choice
    model_url = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf"
    response = requests.get(model_url, stream=True)
    total_size = int(response.headers.get('content-length', 0))
    
    with open(model_path, 'wb') as f:
        for data in tqdm(response.iter_content(chunk_size=1024), total=total_size//1024):
            f.write(data)
    print("Download complete!")

Downloading llama-2-7b-chat.Q4_K_M.gguf...


3985356it [00:49, 80597.71it/s]                             

Download complete!





In [None]:
os.makedirs("docs", exist_ok=True)

# Sample text for demonstration purposes
with open("docs/sample.txt", "w") as f:
    f.write("""
    Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches
    for natural language processing tasks. It involves retrieving relevant information from a knowledge base and then 
    using that information to generate more accurate and informed responses.
    
    RAG models first retrieve documents that are relevant to a given query, then use these documents as additional context
    for language generation. This approach helps to ground the model's responses in factual information and reduces hallucinations.
    
    The llama.cpp library is a C/C++ implementation of Meta's LLaMA model, optimized for CPU usage. It allows running LLaMA models
    on consumer hardware without requiring high-end GPUs.
    
    LocalAI is a framework that enables running AI models locally without relying on cloud services. It provides APIs compatible
    with OpenAI's interfaces, allowing developers to use their own models with the same code they would use for OpenAI services.
    """)

documents = []
for file in os.listdir("docs"):
    if file.endswith(".pdf"):
        loader = PyPDFLoader(os.path.join("docs", file))
        documents.extend(loader.load())
    elif file.endswith(".txt"):
        loader = TextLoader(os.path.join("docs", file))
        documents.extend(loader.load())

# Split documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

chunks = text_splitter.split_documents(documents)

In [6]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

vectorstore = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
    persist_directory="./chroma_db"
)

  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
llm = LlamaCpp(
    model_path='/models/DeepSeek-R1-0528-Qwen3-8B-UD-Q4_K_XL.gguf',
    temperature=0.7,
    max_tokens=2000,
    n_ctx=4096,
    verbose=False
)

llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64


In [8]:
template = """
Answer the question based on the following context:

{context}

Question: {question}
Answer:
"""
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [9]:
rag_pipeline = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True,
    chain_type_kwargs={"prompt": prompt}
)

In [10]:
def ask_question(question):
    start_time = time.time()
    result = rag_pipeline({"query": question})
    end_time = time.time()
    
    print(f"Question: {question}")
    print(f"Answer: {result['result']}")
    print(f"Time taken: {end_time - start_time:.2f} seconds")
    print("\nSource documents:")
    for i, doc in enumerate(result["source_documents"]):
        print(f"Document {i+1}:")
        print(f"Source: {doc.metadata.get('source', 'Unknown')}")
        print(f"Content: {doc.page_content[:150]}...\n")

In [11]:
ask_question("What is RAG and how does it work?")
ask_question("What is llama.cpp?")
ask_question("How does LocalAI relate to cloud AI services?")

  result = rag_pipeline({"query": question})
Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2
Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


Question: What is RAG and how does it work?
Answer: RAG is a technique that combines retrieval-based and generation-based approaches for natural language processing tasks. It involves retrieving relevant information from a knowledge base and then using that information to generate more accurate and informed responses. The model first retrieves documents that are relevant to a given query, then uses these documents as additional context for language generation. This approach helps to ground the model's responses in factual information and reduces hallucinations.
Time taken: 36.98 seconds

Source documents:
Document 1:
Source: docs/sample.txt
Content: Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches
    for natural language processing ...

Document 2:
Source: docs/sample.txt
Content: The llama.cpp library is a C/C++ implementation of Meta's LLaMA model, optimized for CPU usage. It allows running LLaMA models
    on consumer

Number of requested results 3 is greater than number of elements in index 2, updating n_results = 2


Question: What is llama.cpp?
Answer: llama.cpp is a C/C++ implementation of Meta's LLaMA model, optimized for CPU usage. It allows running LLaMA models on consumer hardware without requiring high-end GPUs.
Time taken: 23.34 seconds

Source documents:
Document 1:
Source: docs/sample.txt
Content: The llama.cpp library is a C/C++ implementation of Meta's LLaMA model, optimized for CPU usage. It allows running LLaMA models
    on consumer hardwar...

Document 2:
Source: docs/sample.txt
Content: Retrieval-Augmented Generation (RAG) is a technique that combines retrieval-based and generation-based approaches
    for natural language processing ...

Question: How does LocalAI relate to cloud AI services?
Answer: LocalAI is a framework that enables running AI models locally without relying on cloud services. It provides APIs compatible with OpenAI's interfaces, allowing developers to use their own models with the same code they would use for OpenAI services. In other words, LocalAI allows deve