# Creating the Information Retrieval Part of RAG

In [3]:
# Download the python file that contained the minisearch class
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

--2024-06-25 06:42:48--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.1’


2024-06-25 06:42:48 (28.1 MB/s) - ‘minsearch.py.1’ saved [3832/3832]



In [1]:
# Import the custom class
import minsearch

In [2]:
import requests
# The FAQ documents are downloaded from the course's GitHub repo
url_path = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(url_path, timeout=30)
# Fail loudly on a 4xx/5xx status instead of trying to parse an error page as JSON
response.raise_for_status()
# Parse the JSON payload (one entry per course, each holding its own 'documents' list)
docs_raw = response.json()
# Peek at the first course name and its first document
print(docs_raw[0]['course'])
print(docs_raw[0]['documents'][0])

data-engineering-zoomcamp
{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'question': 'Course - When will the course start?'}


In [3]:
# Flatten the JSON into one list of FAQ entries, each tagged with its course.
# Every entry is copied into a new dict, so docs_raw is left untouched and
# re-running this cell always starts from the same raw data.
documents = [
    {**doc, 'course': course['course']}
    for course in docs_raw
    for doc in course['documents']
]
# See the first question of the document
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Set up the index:
#   text_fields    -> fields containing the text to search (these get vectorized)
#   keyword_fields -> fields used to filter the dataset
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course'],
)

In [5]:
# Feed the flattened FAQ documents into the index.
# fit() is the last expression of the cell, so its return value (the Index
# object) is what gets displayed below.
index.fit(documents)

<minsearch.Index at 0x77c22c3dfe20>

In [6]:
# Example user question
q = 'the course has already started, can I still enroll?'
# Per-field weights: 'question' matters most, 'section' least;
# a field that is not listed keeps the default boost of 1
boost = {'question': 3.0, 'section': 0.5}

# Query the index (database) for the top 5 entries, restricted to one course
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5,
)
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# Using the OpenAI API as the LLM for answer generation

In [7]:
from openai import OpenAI

In [8]:
# Initialize the client
# NOTE(review): no api_key is passed here, so the key is presumably read from
# the environment (OPENAI_API_KEY) — confirm it is set before running.
client = OpenAI()

In [9]:
# Baseline: send the bare question to the OpenAI model, with no FAQ context attached
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{'role': 'user', 'content': q}],
)

In [10]:
# Display the text of the first choice in the response
print(response.choices[0].message.content)

It depends on the policies of the institution offering the course. Some institutions may allow late enrollment with permission from the instructor, while others may have a strict deadline for enrollment. It is best to contact the institution or the instructor directly to inquire about enrolling in the course after it has already started.


In [11]:
# Prompt template; the {question} and {context} placeholders are filled in
# later with str.format
prompt_template = (
    "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
    "Use only the facts from the CONTEXT when answering the QUESTION.\n"
    "If the CONTEXT doesn't contain the answer, output NONE.\n"
    "\n"
    "QUESTION: {question}\n"
    "\n"
    "CONTEXT:\n"
    "{context}"
)

In [12]:
# Turn the search results into one text block: each FAQ entry becomes a
# section/question/answer triple followed by a blank line
context = ''

for doc in results:
    context += (
        f"section: {doc['section']}\n"
        f"question: {doc['question']}\n"
        f"answer: {doc['text']}\n\n"
    )

print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start wit

In [13]:
# Fill the template with the user question and the retrieved context;
# the final strip() drops the blank lines left after the last context entry
prompt = prompt_template.format(question = q, context = context).strip()
print(prompt)

You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: the course has already started, can I still enroll?

CONTEXT:
section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your f

In [15]:
# Same question as before, but this time the prompt carries the context
# retrieved from the FAQ index
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{'role': 'user', 'content': prompt}],
)

# Show the generated answer
print(response.choices[0].message.content)

Yes, even if the course has already started, you can still enroll. You are still eligible to submit the homeworks, but make sure to be aware of the deadlines for turning in the final projects.


# Making the elements of the RAG process Modular

Create a function for each step:
- To search the provided knowledge base and retrieve the necessary documents
- To create the prompt using the user query and the context retrieved from the search
- To use an LLM to generate a response based on the user query
- To run all the steps together sequentially as a complete RAG pipeline

In [20]:
# Create the search function
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve the FAQ entries that best match ``query`` from the global ``index``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field is filtered on. Defaults to
            ``'data-engineering-zoomcamp'``, the course this notebook uses.
        num_results: Forwarded to ``index.search`` as ``num_results``
            (default 5).

    Returns:
        The result of ``index.search`` -- in this notebook a list of FAQ dicts
        with ``text``, ``section``, ``question`` and ``course`` keys.
    """
    # Provide more search importance in the question field, less in the section field
    boost = {'question': 3.0, 'section': 0.5}

    # Search for the relevant documents in the index (database)
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [21]:
# Create the function to build the prompt
def build_prompt(query, search_results):
    """Build the LLM prompt from the user query and the retrieved FAQ entries.

    Args:
        query: The user question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys (the entries returned by the search step).

    Returns:
        The filled-in prompt with surrounding whitespace stripped. With no
        search results the prompt simply ends with ``CONTEXT:``.
    """
    # The template is built from adjacent string literals rather than an
    # indented triple-quoted string: inside a function the latter carries the
    # 4-space code indentation into every line of the prompt sent to the model.
    prompt_template = (
        "You are a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One section/question/answer triple per retrieved entry, blank-line separated
    context = ''.join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # Putting the context and query all together; strip() removes the trailing
    # blank lines left by the last context entry
    return prompt_template.format(question=query, context=context).strip()

In [22]:
# Create the function to call the LLM model to generate the response
def llm(prompt, model='gpt-3.5-turbo'):
    """Send ``prompt`` to the chat model and return the generated text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Name of the chat model to call. Defaults to
            ``'gpt-3.5-turbo'``, the model used throughout this notebook.

    Returns:
        The message content of the first choice in the response.
    """
    # Uses the module-level `client` created earlier in the notebook
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [23]:
# Create the rag system function
def rag(query):
    """Answer ``query`` end to end: retrieve documents, build the prompt, ask the LLM.

    Args:
        query: The user question.

    Returns:
        The value returned by ``llm`` for the prompt built from the query
        and the retrieved documents.
    """
    retrieved_docs = search(query)                      # 1. search the knowledge base
    llm_prompt = build_prompt(query, retrieved_docs)    # 2. prepare the prompt for the model
    return llm(llm_prompt)                              # 3. generate the response

In [24]:
# User query
query = 'the course has already started, can I still enroll?'
# Run the whole RAG pipeline on the query via rag()
answer = rag(query)
# See the answer
print(answer)

Yes, even if the course has already started, you can still enroll and participate. Make sure to not leave everything for the last minute as there will be deadlines for turning in final projects.
