# RAG

<img src="imgs/RAG.png" width="800">

In [1]:
# LiteLLM (https://github.com/BerriAI/litellm)

# Uncomment the line below to install litellm
# !pip install litellm

This is an example cURL call to the LM Studio embeddings endpoint, using an embedding model:

curl http://127.0.0.1:1234/v1/embeddings \
  -H "Content-Type: application/json" \
  -d '{
    "model": "text-embedding-granite-embedding-278m-multilingual",
    "input": "Some text to embed"
  }'

In [2]:
import requests
import json, os

# ---------------------------------------------------------------
# LM Studio endpoints (local server on port 1234)
# ---------------------------------------------------------------

# Embeddings: REST endpoint and the name of the embedding model to request
LMSTUDIO_EMBEDDINGS_URL = "http://127.0.0.1:1234/v1/embeddings"
EMBEDDING_MODEL = "text-embedding-granite-embedding-278m-multilingual"

# LLM: connection settings exposed as environment variables
# (presumably picked up by LiteLLM for `lm_studio/...` models — confirm).
os.environ.update({
    "LM_STUDIO_API_BASE": "http://localhost:1234/v1",
    "LM_STUDIO_API_KEY": "42",  # Not really used. Set it to a non empty value
})

In [3]:
# Smoke test: request a single embedding and peek at the raw JSON response.
data = { "model": EMBEDDING_MODEL, "input": "Some text to embed" }

# The timeout keeps the cell from hanging indefinitely if LM Studio is not running.
response = requests.post(LMSTUDIO_EMBEDDINGS_URL, json=data, timeout=60)
# Surface HTTP errors (e.g. model not loaded) as an exception instead of
# printing an error body as if it were a valid result.
response.raise_for_status()
print(response.text[:200])  # first 200 characters only; the full vector is long

{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "embedding": [
        -0.014949378557503223,
        -0.03729313239455223,
        -0.06780612468719482,
        0.017350835


# Install ChromaDB vector database

In [4]:
# Install ChromaDB Python library
# ! pip install chromadb

In [5]:
import chromadb

collection_name = "collection00"
client = chromadb.Client()

# Start from a clean slate: drop any leftover collection with this name...
if collection_name in [coll.name for coll in client.list_collections()]:
    client.delete_collection(name=collection_name)
    print(f"Collection '{collection_name}' deleted.")

# ...and ALWAYS (re)create it. Creating it only in an `else` branch would leave
# `collection` undefined (or pointing at the deleted collection) whenever the
# collection already existed, breaking every later cell.
collection = client.create_collection(collection_name)

In [6]:
# Document 1: text plus its embedding, obtained from the LM Studio embeddings endpoint.
input1 = "The capital of Italy is Milan."
data1 = { "model": EMBEDDING_MODEL, "input": input1 }
# Timeout so the cell cannot hang forever if the server is unreachable.
response1 = requests.post(LMSTUDIO_EMBEDDINGS_URL, json=data1, timeout=60)
# Raise on HTTP errors; otherwise an error payload fails below with a bare KeyError on "data".
response1.raise_for_status()
embedding1 = response1.json()["data"][0]["embedding"]

In [7]:
# Document 2: text plus its embedding, obtained from the LM Studio embeddings endpoint.
input2 = "The total population of Milan is 10 people."
data2 = { "model": EMBEDDING_MODEL, "input": input2 }
# Timeout so the cell cannot hang forever if the server is unreachable.
response2 = requests.post(LMSTUDIO_EMBEDDINGS_URL, json=data2, timeout=60)
# Raise on HTTP errors; otherwise an error payload fails below with a bare KeyError on "data".
response2.raise_for_status()
embedding2 = response2.json()["data"][0]["embedding"]

In [8]:
# Document 3: text plus its embedding, obtained from the LM Studio embeddings endpoint.
input3 = "An horse has 4 legs and a tail"
data3 = { "model": EMBEDDING_MODEL, "input": input3 }
# Timeout so the cell cannot hang forever if the server is unreachable.
response3 = requests.post(LMSTUDIO_EMBEDDINGS_URL, json=data3, timeout=60)
# Raise on HTTP errors; otherwise an error payload fails below with a bare KeyError on "data".
response3.raise_for_status()
embedding3 = response3.json()["data"][0]["embedding"]

In [9]:
# Store each text together with its precomputed embedding under a fixed id
# (one `add` call per document, in id order).
for doc_id, text, vector in (
    ("id1", input1, embedding1),
    ("id2", input2, embedding2),
    ("id3", input3, embedding3),
):
    collection.add(documents=[text], embeddings=[vector], ids=[doc_id])

In [10]:
# Inspect what is currently stored in the collection (displayed as the cell output).
collection.get()

{'ids': ['id1', 'id2', 'id3'],
 'embeddings': None,
 'documents': ['The capital of Italy is Milan.',
  'The total population of Milan is 10 people.',
  'An horse has 4 legs and a tail'],
 'uris': None,
 'included': ['metadatas', 'documents'],
 'data': None,
 'metadatas': [None, None, None]}

In [11]:
from litellm import completion 
from typing import List, Dict

def generate_response(
    messages: List[Dict],
    model: str = "lm_studio/lmstudio",
    max_tokens: int = 1024,
) -> str:
    """Call the LLM through LiteLLM and return the text of its first choice.

    Args:
        messages: Chat messages as dicts with "role" and "content" keys.
        model: LiteLLM model identifier; defaults to the LM Studio model.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The assistant message content with surrounding whitespace removed,
        or an empty string when the model returned no text content.
    """
    response = completion(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
    )
    # `content` can be None (no text in the reply); calling .strip() on it
    # would raise AttributeError, so fall back to an empty string.
    content = response.choices[0].message.content
    return (content or "").strip()

In [12]:
# Read the user's question interactively; the cell waits until something is typed.
user_input = input("How can I help you today?")

How can I help you today? what's the capital of Italy?


In [13]:
# Embed the user's question with the same model used for the stored documents,
# so the query vector and the document vectors are comparable.
user_data = { "model": EMBEDDING_MODEL, "input": user_input }
# Timeout so the cell cannot hang forever if the server is unreachable.
response = requests.post(LMSTUDIO_EMBEDDINGS_URL, json=user_data, timeout=60)
# Raise on HTTP errors; otherwise an error payload fails below with a bare KeyError on "data".
response.raise_for_status()
embedding = response.json()["data"][0]["embedding"]
embedding[:10]  # preview the first 10 components only

[-0.04225713759660721,
 0.052123963832855225,
 -0.01851440779864788,
 0.0545511320233345,
 -0.004544607363641262,
 -0.03987252712249756,
 0.07662363350391388,
 0.05532529950141907,
 0.011184877716004848,
 0.007382847368717194]

In [14]:
# Search in ChromaDB
# Ask for up to 10 nearest neighbours, but never more than the collection
# actually holds (requesting more than is stored triggers a warning, and an
# error on older Chroma releases). Keep at least 1 so n_results stays positive.
results = collection.query(
    query_embeddings=[embedding],
    n_results=max(1, min(10, collection.count()))
)

In [15]:
# Keep only the documents that are close enough to the query.
# NOTE(review): the collection is created without an explicit distance metric,
# so these are distances in Chroma's default space (squared L2 according to the
# Chroma docs), not cosine distances — confirm, or create the collection with a
# cosine space if cosine distance is what is intended.
MAX_DISTANCE = 0.3  # relevance cut-off: a smaller distance means a closer match

# Index [0] selects the hits for the first (and only) query embedding.
filtered_docs = [
    doc
    for doc, dist in zip(results["documents"][0], results["distances"][0])
    if dist < MAX_DISTANCE
]
filtered_docs

['The capital of Italy is Milan.']

In [16]:
# Common RAG practice: hand the retrieved context to the model as a system
# message (some setups put it inside the user message instead, depending on
# the model).

# Build the prompt: instructions, then the retrieved context, then the question.
retrieved_text = " ".join(filtered_docs)

role_content_pairs = [
    ("system", "You are a helpful assistant that uses the following context to answer the user's question."),
    ("system", "Context:\n" + retrieved_text),
    ("user", user_input),
]
memory = [{"role": role, "content": content} for role, content in role_content_pairs]
memory

[{'role': 'system',
  'content': "You are a helpful assistant that uses the following context to answer the user's question."},
 {'role': 'system', 'content': 'Context:\nThe capital of Italy is Milan.'},
 {'role': 'user', 'content': "what's the capital of Italy?"}]

In [17]:
# Send the assembled messages (instructions + retrieved context + question) to the LLM.
# NOTE: `response` is rebound here to the answer text returned by generate_response.
response = generate_response(memory)
print(response)

According to my information, the capital of Italy is Milan.
