In [1]:
#Read from documents
import requests

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# Fail fast on network problems: the timeout keeps the kernel from hanging on
# a stalled connection, and raise_for_status() reports an HTTP error (404/5xx)
# here instead of as a confusing JSON decode error on the next line.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# documents_raw is a list of per-course records shaped like:
#
#   {
#     "course": "data-engineering-zoomcamp",
#     "documents": [
#       {"text": "The purpose of this document is to capture frequently asked technical...."},
#       ...
#     ]
#   }
#
# Flatten it into one `documents` list in which every entry carries the name
# of the course it came from.
# Each entry is copied ({**entry, ...}) instead of being modified in place, so
# documents_raw stays exactly as downloaded.
documents = [
    {**entry, 'course': record['course']}
    for record in documents_raw
    for entry in record['documents']
]


In [2]:
# Vector search is based on cosine similarity, so we first need to convert the text to vectors (numeric representations). We need an embedding model for that.
# Other models can be found here https://sbert.net/docs/sentence_transformer/pretrained_models.html#multi-qa-models
# https://huggingface.co/sentence-transformers/multi-qa-distilbert-dot-v1
# https://youtu.be/wjZofJX0v4M?si=n63ejz0XTVwufdwP&t=1005
# Note: in the model names, "qa" stands for question-answer,
# "dot" stands for dot product, and
# "cos" stands for cosine similarity.

from sentence_transformers import SentenceTransformer
# Load the pretrained 'multi-qa-distilbert-cos-v1' sentence-embedding model.
# Every later cell turns text into vectors through embedding_model.encode(...).
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [19]:
#Understand Vectors
# Three short texts. Each one is wrapped in a one-element list, so encode()
# gives back a 2-D array (one row per text) rather than a flat vector.
docs = [
    ["I just discovered the course, can I still join?"],          # Q1 (question 1)
    ["I just found out about this program, Can I still enroll"],  # Q2 (question 2)
    ["You can join the course at any point of time"],             # D  (description / answer)
]

# Embed the texts in order: Q1, Q2, D.
vectors_example = [embedding_model.encode(text) for text in docs]

# How to read a dot product between two embeddings:
#   * positive          -> the vectors point in a similar direction (similar meaning)
#   * close to zero     -> the vectors are perpendicular (no similarity)
#   * negative          -> the angle is more than 90 degrees (opposite meaning)
# https://youtu.be/wjZofJX0v4M?si=n63ejz0XTVwufdwP&t=1005
# NOTE(review): with a "-cos-" model the vectors are presumably unit length,
# which would make the dot product equal to cosine similarity - confirm.

# From here on `d` is the embedding of the description text D.
q1, q2, d = vectors_example

print(f"Q1 shape : {q1.shape}")  # (1, 768)
print(f"Q2 shape : {q2.shape}")  # (1, 768)

# A (1, 768) array cannot be dot-multiplied with another (1, 768) array; the
# inner dimensions must match, so the second operand is transposed to (768, 1).
print(f"Q1 shape : {q1.shape}")              # (1, 768)
print(f"Q2 Transpose shape : {q2.T.shape}")  # (768, 1)

# Q1 vs D: 0.7205941 - positive and fairly close to 1, so the first question
# is a good match for the description.
print(f"Q1 dot d : {q1.dot(d.T)}")

# Q2 vs D: 0.48303062 - weaker, but still clearly positive even though Q2
# shares few literal words with D, which is where plain text matching struggles.
print(f"Q2 dot d : {q2.dot(d.T)}")

# Q1 vs Q2: 0.6067974 - the two differently worded questions are similar, so
# a vector search for Q2 would be able to surface Q1.
print(f"Q1 dot Q2 : {q1.dot(q2.T)}")


Q1 shape : (1, 768)
Q2 shape : (1, 768)
Q1 shape : (1, 768)
Q2 Transpose shape : (768, 1)
Q1 dot d : [[0.7205941]]
Q2 dot d : [[0.48303062]]
Q1 dot Q2 : [[0.6067974]]


In [3]:
# Use the embedding model to create an array of vectors, one per document.
# For each document we embed the "question" and "text" fields joined into a single string.
# This combination is what worked for this dataset; it will not necessarily work for every dataset.
# The idea: because the stored question and its answer live in the same string, a differently worded user question can still find the matching entry, which we then send to the LLM.
"""
[
      {
        "text": "The purpose of this document is to capture frequently asked technical ...",
        "section": "General course-related questions",
        "question": "Course - When will the course start?"
      },

"""

import numpy as np
from tqdm.auto import tqdm

# Embed "question + text" for every FAQ entry, one entry at a time (tqdm
# shows progress), and stack the resulting vectors into a single array.
# Row i of `embeddings` corresponds to documents[i].
embeddings = np.array([
    embedding_model.encode(doc['question'] + ' ' + doc['text'])
    for doc in tqdm(documents)
])

  0%|          | 0/948 [00:00<?, ?it/s]

In [20]:
from minsearch import VectorSearch

# Create the vector index with 'course' registered as a keyword field -
# presumably what allows searches to be narrowed with
# filter_dict={'course': ...}; confirm against the minsearch docs.
vindex = VectorSearch(keyword_fields=['course'])
# embeddings was built by iterating over documents, so both are in the same order.
# This is the cell's last expression, so the notebook echoes whatever fit() returns.
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x16b59f920>

In [8]:
import json

# System-level guidance sent with every request: answer strictly from the
# retrieved FAQ context.
instructions = "\n".join([
    "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.",
    "Use only the facts from the CONTEXT when answering the QUESTION.",
])

# User-message skeleton with two placeholders: {question} is the user's
# question and {context} is the serialized search results.
prompt_template = "\n".join([
    "<QUESTION>",
    "{question}",
    "</QUESTION>",
    "",
    "<CONTEXT>",
    "{context}",
    "</CONTEXT>",
])

def build_prompt(question, search_results):
    """Fill prompt_template with the question and the JSON-encoded search results.

    Args:
        question: The user's question, inserted as-is into the QUESTION section.
        search_results: JSON-serializable search hits; dumped with json.dumps
            and inserted into the CONTEXT section.

    Returns:
        The formatted user prompt string.
    """
    return prompt_template.format(
        question=question,
        context=json.dumps(search_results),
    )

In [9]:
from openai import OpenAI

# No API key is passed explicitly.
# NOTE(review): presumably the client picks it up from the environment - confirm.
openai_client = OpenAI()

def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send one user prompt to the model and return the text of its reply.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional system message; omitted from the request when
            falsy (None or empty string).
        model: Name of the model to call.

    Returns:
        The `output_text` attribute of the response.
    """
    # The system message, when present, goes ahead of the user message.
    system_part = [{"role": "system", "content": instructions}] if instructions else []
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [10]:
def vector_search(question, course='data-engineering-zoomcamp', num_results=5):
    """Embed the question and return the closest FAQ entries for one course.

    Args:
        question: Text of the user's question.
        course: Value the 'course' keyword field must match. Defaults to the
            course that was previously hard-coded, so existing one-argument
            calls behave exactly as before.
        num_results: Maximum number of hits to request from the index
            (default 5, the previously hard-coded value).

    Returns:
        Whatever vindex.search returns for the embedded question.
    """
    query_vector = embedding_model.encode(question)

    return vindex.search(
        query_vector,
        filter_dict={'course': course},
        num_results=num_results
    )

def compare_search(question):
    """Search the index for the question without passing a filter or result count.

    Only the embedded question is handed to vindex.search, so the index's own
    defaults apply for everything else.
    """
    return vindex.search(embedding_model.encode(question))

def rag(question):
    """Answer a question with retrieval-augmented generation.

    Retrieves FAQ entries with vector_search, builds the user prompt from them
    with build_prompt, and returns the reply from llm (called with the
    module-level `instructions` as the system message).
    """
    return llm(
        build_prompt(question, vector_search(question)),
        instructions=instructions,
    )

In [12]:
# Ask the question itself. The earlier version wrapped a second rag(...) call
# inside the string, so the literal text "rag('...')" was sent as the question.
rag('what is the name of your program')

'The name of the program is "Data Engineering Zoom Camp 2024."'