In [1]:
from openai import OpenAI

openai_client = OpenAI()

ModuleNotFoundError: No module named 'openai'

In [2]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send one user message to the OpenAI Responses API and return the reply text.

    Args:
        user_prompt: Content of the user message.
        instructions: Optional system message. When falsy, no system message
            is included in the request.
        model: Model name forwarded to the API.

    Returns:
        The ``output_text`` attribute of the API response.
    """
    user_message = {"role": "user", "content": user_prompt}
    system_messages = (
        [{"role": "system", "content": instructions}] if instructions else []
    )

    # The system message, when present, is placed ahead of the user message.
    reply = openai_client.responses.create(
        model=model,
        input=[*system_messages, user_message],
    )
    return reply.output_text

In [3]:
llm('When does the course start?')

"Could you provide more details about the course you're asking about? That way, I can assist you better!"

## Read in FAQ documents

In [4]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as a confusing
# JSON decoding failure on an error page.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Build a new dict rather than mutating the downloaded payload.
        documents.append({**doc, 'course': course_name})

In [5]:
documents[11]

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
len(documents)

948

## Text search

In [7]:
from minsearch import Index

In [8]:
# Text index over the FAQ entries. `course` is registered as a keyword field
# (it is what search() filters on); the remaining fields are text fields.
index = Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

# Left as the last expression so the notebook displays fit()'s return value.
index.fit(documents)

<minsearch.minsearch.Index at 0x777b6e274e30>

In [9]:
def search(question, course='data-engineering-zoomcamp', num_results=5):
    """Run a text search over the FAQ index.

    Args:
        question: Query string passed to ``index.search``.
        course: Value the ``course`` keyword field must match. Pass ``None``
            to search without a course filter.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    # An empty filter dict means "no keyword restriction".
    filter_dict = {} if course is None else {'course': course}

    return index.search(
        question,
        # Matches in the question field are weighted up, section matches down.
        boost_dict={'question': 3.0, 'section': 0.3},
        filter_dict=filter_dict,
        num_results=num_results
    )

In [10]:
question = 'I just discovered the course, can I join now?'

In [11]:
search_results = search(question)

In [12]:
# System prompt: restricts the assistant to facts found in the retrieved context.
instructions = (
    "You're a course teaching assistant. "
    "Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
    "Use only the facts from the CONTEXT when answering the QUESTION."
)

# User prompt layout; {question} and {context} are filled in via str.format.
prompt_template = "\n".join([
    "<QUESTION>",
    "{question}",
    "</QUESTION>",
    "",
    "<CONTEXT>",
    "{context}",
    "</CONTEXT>",
])

In [13]:
import json

In [14]:
def build_prompt(question, search_results):
    """Fill the prompt template with the question and the retrieved documents.

    Args:
        question: The user's question, inserted into the ``{question}`` slot.
        search_results: JSON-serializable search results; they are serialized
            and inserted into the ``{context}`` slot.

    Returns:
        The formatted prompt string.
    """
    # ensure_ascii=False keeps non-ASCII characters (such as the curly quotes
    # in the FAQ text) readable instead of turning them into \uXXXX escapes.
    search_json = json.dumps(search_results, ensure_ascii=False)
    return prompt_template.format(
        question=question,
        context=search_json
    )

In [15]:
def rag(question, search_fn=None):
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user's question.
        search_fn: Optional callable taking the question and returning the
            search results to use as context. Defaults to the text ``search``.

    Returns:
        The text returned by ``llm`` for the built prompt.
    """
    if search_fn is None:
        # Looked up at call time so a later redefinition of `search` is used.
        search_fn = search

    search_results = search_fn(question)
    user_prompt = build_prompt(question, search_results)
    return llm(user_prompt, instructions=instructions)

In [16]:
question = 'I just discovered the course, can I join now?'
rag(question)

"Yes, you can join the course even after it has started. You are eligible to submit the homeworks; however, be mindful that there will be deadlines for the final projects. It's advisable not to leave everything until the last minute."

## Vector search

In [None]:
!uv add sentence-transformers

[2K[2mResolved [1m151 packages[0m [2min 2.46s[0m[0m                                       [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/30)                                                  [37m⠋[0m [2mPreparing packages...[0m (0/0)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/30)-------------[0m[0m     0 B/194.62 KiB          [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/30)-------------[0m[0m 14.88 KiB/194.62 KiB        [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/30)-------------[0m[0m 14.88 KiB/194.62 KiB        [1A
[2mfsspec              [0m [32m---[2m---------------------------[0m[0m 14.88 KiB/194.62 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/30)-------------[0m[0m     0 B/11.44 MiB           [2A
[2mfsspec              [0m [32m---[2m---------------------------[0m[0m 14.88 KiB/194.62 KiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/30)--------

In [18]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [19]:
# Two paraphrased questions and one candidate answer. Each sentence is wrapped
# in a one-element list, so encode() receives a list for every entry and the
# single resulting vector is picked out with [0] in a later cell.
docs = [
    ["I just discovered the course, can I still join?"],
    ['I just found out about this program. Can I still enrol?'],
    ["you can join the course at any point of time"],
]

vectors = [embedding_model.encode(sentence_batch) for sentence_batch in docs]

In [20]:
q1, q2, d = vectors

In [21]:
# Every entry of `vectors` is the encoding of a one-element list; take its
# first (only) item. Reading from `vectors` instead of re-indexing q1/q2/d in
# place keeps this cell idempotent: running it twice gives the same values.
q1, q2, d = (batch[0] for batch in vectors)

In [22]:
q1.dot(q2)

np.float32(0.61739707)

In [23]:
q1.dot(d)

np.float32(0.72059387)

In [24]:
q2.dot(d)

np.float32(0.48841882)

In [25]:
!uv add tqdm

[2mResolved [1m152 packages[0m [2min 0.79ms[0m[0m
[2mAudited [1m147 packages[0m [2min 1ms[0m[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [26]:
from tqdm.auto import tqdm


In [27]:
d  = documents[11]

In [28]:
d

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [29]:
text = d['question'] + ' ' + d['text']
text

"Certificate - Can I follow the course in a self-paced mode and get a certificate? No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running."

In [30]:
from tqdm.auto import tqdm

In [31]:
import numpy as np

In [32]:
# Embed "question + answer" for every FAQ entry. Handing the whole list to
# encode() lets the model process the texts in batches instead of being
# invoked once per document; show_progress_bar replaces the manual tqdm loop.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

embeddings = np.asarray(
    embedding_model.encode(texts, show_progress_bar=True)
)

  0%|          | 0/948 [00:00<?, ?it/s]

In [33]:
embeddings.shape

(948, 768)

In [34]:
from minsearch import VectorSearch

In [35]:
# Vector index over the precomputed embeddings. `course` is kept as a keyword
# field, which is what the vector searches below filter on.
vindex = VectorSearch(
    keyword_fields=['course'],
)
# Embeddings and documents are passed together, in the same order.
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x7b8dd30eb050>

In [36]:
vindex.search(q2, filter_dict={'course': 'data-engineering-zoomcamp'}, num_results=5)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [37]:
def vector_search(question, course='data-engineering-zoomcamp', num_results=5):
    """Run a vector search over the FAQ embeddings.

    Args:
        question: Query string; it is encoded with ``embedding_model`` first.
        course: Value the ``course`` keyword field must match. Pass ``None``
            to search without a course filter.
        num_results: Maximum number of results requested from the index.

    Returns:
        Whatever ``vindex.search`` returns for the encoded query.
    """
    q = embedding_model.encode(question)

    # An empty filter dict means "no keyword restriction".
    filter_dict = {} if course is None else {'course': course}

    return vindex.search(
        q,
        filter_dict=filter_dict,
        num_results=num_results
    )

In [38]:
question = 'I just found out about this program. Can I still enrol?'
vector_search(question)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [39]:
def rag(question, search_fn=None):
    """Answer a question with retrieval-augmented generation.

    This definition replaces the earlier text-search based ``rag`` in the
    notebook namespace.

    Args:
        question: The user's question.
        search_fn: Optional callable taking the question and returning the
            search results to use as context. Defaults to ``vector_search``.

    Returns:
        The text returned by ``llm`` for the built prompt.
    """
    if search_fn is None:
        # Looked up at call time so a later redefinition is picked up.
        search_fn = vector_search

    search_results = search_fn(question)
    user_prompt = build_prompt(question, search_results)
    return llm(user_prompt, instructions=instructions)

In [40]:
rag(question)

"Yes, you can still enroll and submit homework even if you didn't register before the course started. However, be mindful of deadlines for turning in final projects, so it’s advisable not to leave everything until the last minute."

In [41]:
def hybrid_search(question):
    """Combine text-search and vector-search results for a question.

    Text-search results come first, followed by vector-search results. An
    entry returned by both searches is kept only once, at its first position,
    so the same FAQ entry is not repeated in the result list.

    Args:
        question: Query string passed to both ``search`` and ``vector_search``.

    Returns:
        A list of unique result dicts in first-seen order.
    """
    combined = search(question) + vector_search(question)

    seen = set()
    unique = []
    for doc in combined:
        # Dicts are not hashable, so compare entries by their sorted content.
        key = tuple(sorted((k, str(v)) for k, v in doc.items()))
        if key not in seen:
            seen.add(key)
            unique.append(doc)

    return unique