In [1]:
from openai import OpenAI

openai_client = OpenAI()

In [2]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send a single-turn prompt through the OpenAI Responses API.

    Args:
        user_prompt: Text sent as the "user" message.
        instructions: Optional text sent as a leading "system" message.
            When falsy (None or an empty string) no system message is sent.
        model: Model name forwarded to the API call.

    Returns:
        The ``output_text`` attribute of the API response.
    """
    system_part = (
        [{"role": "system", "content": instructions}] if instructions else []
    )
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [3]:
llm('When does the course start?')

"Could you please provide me with more details about the course you're referring to? That way, I can help you find the start date!"

## Read in FAQ documents

In [4]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the notebook from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with a clear HTTP error instead of a confusing JSON decode error
# on the next line when the download did not succeed.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into one flat list of FAQ entries, tagging each
# entry with the name of the course it belongs to. A new dict is built for
# every entry so the parsed JSON in documents_raw is left unmodified and this
# cell can be re-run safely.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [5]:
documents[11]

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
len(documents)

948

## Text search

In [7]:
from minsearch import Index

In [8]:
# Build the text index over the FAQ entries: "question", "text" and "section"
# are registered as text fields, "course" as a keyword field (later cells
# filter on it through filter_dict).
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Last expression of the cell: the notebook displays the value fit() returns.
index.fit(documents)

<minsearch.minsearch.Index at 0x7f7a95900c50>

In [9]:
def search(question, course='data-engineering-zoomcamp', num_results=5):
    """Run a text search over the FAQ index.

    Args:
        question: Query string passed to ``index.search``.
        course: Value the ``course`` keyword field is filtered on.
            Defaults to the data engineering course.
        num_results: Maximum number of hits requested from the index.

    Returns:
        Whatever ``index.search`` returns for the query.
    """
    return index.search(
        question,
        # Weight matches in "question" above the default and in "section" below it.
        boost_dict={'question': 3.0, 'section': 0.3},
        filter_dict={'course': course},
        num_results=num_results
    )

In [10]:
question = 'I just discovered the course, can I join now?'

In [11]:
search_results = search(question)

In [12]:
# System-level instructions for the model: answer only from the supplied
# CONTEXT. .strip() drops the newlines added by the triple-quote layout.
instructions = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
""".strip()

# User-message template with two str.format slots: {question} and {context}.
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

In [13]:
import json

In [14]:
def build_prompt(question, search_results):
    """Fill the prompt template with the question and the retrieved documents.

    Args:
        question: The user's question, inserted into the ``{question}`` slot.
        search_results: JSON-serializable search hits, inserted into the
            ``{context}`` slot as a JSON string.

    Returns:
        The formatted prompt string.
    """
    # ensure_ascii=False keeps non-ASCII characters (such as the curly quotes
    # in the FAQ text) as-is instead of emitting backslash-u escape sequences.
    search_json = json.dumps(search_results, ensure_ascii=False)
    return prompt_template.format(
        question=question,
        context=search_json
    )

In [15]:
def rag(question, search_fn=None):
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user's question.
        search_fn: Optional callable that takes the question and returns the
            documents to use as context. Defaults to the text ``search``.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    # Look the default up at call time so re-running the `search` cell takes
    # effect without having to redefine rag.
    if search_fn is None:
        search_fn = search
    search_results = search_fn(question)
    user_prompt = build_prompt(question, search_results)
    return llm(user_prompt, instructions=instructions)

In [16]:
question = 'I just discovered the course, can I join now?'
rag(question)

"Yes, you can still join the course even if you don't register. You're eligible to submit the homework, but be aware of deadlines for turning in the final projects. Make sure not to leave everything until the last minute."

## Vector search

In [17]:
!uv add sentence-transformers

[2K[2mResolved [1m151 packages[0m [2min 1.17s[0m[0m                                       [0m
[2K[37m⠙[0m [2mPreparing packages...[0m (0/8)                                                   [37m⠋[0m [2mPreparing packages...[0m (0/0)                                                   
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/8)--------------[0m[0m     0 B/566.81 MiB          [1A
[2K[1A[37m⠙[0m [2mPreparing packages...[0m (0/8)--------------[0m[0m     0 B/566.81 MiB          [1A
[2mnvidia-cublas-cu12  [0m [32m[2m------------------------------[0m[0m     0 B/566.81 MiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/8)--------------[0m[0m     0 B/858.06 MiB          [2A
[2mnvidia-cublas-cu12  [0m [32m[2m------------------------------[0m[0m     0 B/566.81 MiB
[2K[2A[37m⠙[0m [2mPreparing packages...[0m (0/8)----------------[0m[0m     0 B/858.06 MiB        [2A
[2mnvidia-cublas-cu12    [0m [32m[2m---------------------------

In [18]:
from sentence_transformers import SentenceTransformer

embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/523 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [21]:
# Three sample sentences used to compare embeddings: two differently worded
# questions and one candidate answer. Each sentence is wrapped in its own
# one-element list.
docs = [
    ["I just discovered the course, can I still join?"],
    ['I just found out about this program. Can I still enrol?'],
    ["you can join the course at any point of time"]
]


# Encode each one-element list separately and collect the results in order.
vectors = []
for d in docs:
    # d is a one-element list, so the result presumably has a leading
    # dimension of 1 — a later cell indexes [0] to unwrap it.
    v = embedding_model.encode(d)
    vectors.append(v)

In [22]:
q1, q2, d = vectors

In [23]:
# Each sentence was encoded as a one-element list, so take the first
# (presumably only) entry of each result to get the embedding itself.
# NOTE: this cell reassigns its own inputs — re-running it without first
# re-running the unpacking cell would index into the vectors a second time.
q1 = q1[0]
q2 = q2[0]
d = d[0]

In [24]:
q1.dot(q2)

np.float32(0.6173972)

In [25]:
q1.dot(d)

np.float32(0.7205938)

In [26]:
q2.dot(d)

np.float32(0.48841894)

In [27]:
!uv add tqdm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2K[37m⠧[0m [2mjupyter-events==0.12.0                                                        [0m[2mResolved [1m151 packages[0m [2min 1.46s[0m[0m
[2K[37m⠙[0m [2mscikit-learn==1.7.2                                                           [0m[2mAudited [1m146 packages[0m [2min 964ms[0m[0m


In [28]:
from tqdm.auto import tqdm


In [29]:
d  = documents[11]

In [30]:
d

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [34]:
text = d['question'] + ' ' + d['text']
text

"Certificate - Can I follow the course in a self-paced mode and get a certificate? No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running."

In [35]:
from tqdm.auto import tqdm

In [36]:
import numpy as np

In [37]:
# Embed every FAQ entry as "question + text". The whole list is handed to
# encode() in one call so the model can batch the inputs instead of being
# invoked once per document; show_progress_bar keeps the progress display.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# One row per document, in the same order as `documents`.
embeddings = np.asarray(
    embedding_model.encode(texts, show_progress_bar=True)
)

  0%|          | 0/948 [00:00<?, ?it/s]

In [38]:
embeddings.shape

(948, 768)

In [39]:
from minsearch import VectorSearch

In [40]:
# Vector index with "course" as a keyword field (later cells filter on it
# through filter_dict).
vindex = VectorSearch(keyword_fields=['course'])
# embeddings was built by iterating documents in order, so row i of the
# matrix corresponds to documents[i].
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x7f797d68d160>

In [41]:
vindex.search(q2, filter_dict={'course': 'data-engineering-zoomcamp'}, num_results=5)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [42]:
def vector_search(question, course='data-engineering-zoomcamp', num_results=5):
    """Run a vector search over the FAQ embeddings.

    Args:
        question: Query text; it is embedded with ``embedding_model`` first.
        course: Value the ``course`` keyword field is filtered on.
            Defaults to the data engineering course.
        num_results: Maximum number of hits requested from the index.

    Returns:
        Whatever ``vindex.search`` returns for the embedded query.
    """
    q = embedding_model.encode(question)

    return vindex.search(
        q,
        filter_dict={'course': course},
        num_results=num_results
    )

In [43]:
question = 'I just found out about this program. Can I still enrol?'
vector_search(question)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [44]:
def rag(question, search_fn=None):
    """Answer a question with retrieval-augmented generation.

    Args:
        question: The user's question.
        search_fn: Optional callable that takes the question and returns the
            documents to use as context. Defaults to ``vector_search``.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    # Look the default up at call time so re-running the `vector_search` cell
    # takes effect without having to redefine rag.
    if search_fn is None:
        search_fn = vector_search
    search_results = search_fn(question)
    user_prompt = build_prompt(question, search_results)
    return llm(user_prompt, instructions=instructions)

In [45]:
rag(question)

'Yes, you can still enroll in the program and submit the homework, but be aware that there will be deadlines for turning in the final projects, so it’s best not to leave everything to the last minute.'

In [46]:
def hybrid_search(question):
    """Combine text-search and vector-search hits for a question.

    Text-search hits come first, followed by vector-search hits. A document
    returned by both searches is kept only once (at its first position), so
    the same FAQ entry is not repeated in the context.

    Args:
        question: The user's question, passed to both searches.

    Returns:
        A list of unique result documents in first-seen order.
    """
    r1 = search(question)
    r2 = vector_search(question)

    merged = []
    for doc in r1 + r2:
        # Results are dicts (unhashable), so compare by equality; the combined
        # list is only a handful of items, so the linear scan is cheap.
        if doc not in merged:
            merged.append(doc)
    return merged

In [47]:
hybrid_search(question)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Check the .gitignore file and make sure you don’t have *.csv in it\n\nDbt error 404 was not found in location\nMy specific error:\nRuntime Error in rpc request (from remote system.sql) 404 Not found: Table dtc-de-0315:trips_data_all.green_tripdata_partitioned was not found in location europe-west6 Location: europe-west6 Job ID: 168ee9bd-07cd-4ca4-9ee0-4f6b0f33897c\nMake sure all of your datasets have the correct region and not a generalised region:\nEurope-west6 as opposed to EU\n\nMatch this in dbt settings:\ndbt -> projects -> optional settings -> manually set location to match',
  'sectio