In [7]:
import os
import requests 
import minsearch
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [8]:
# Sets the HF_HOME environment variable to /run/cache/ — presumably so that
# Hugging Face downloads are cached there; confirm against the HF docs.
# NOTE(review): `transformers` is already imported in the cell above. If the
# Hugging Face libraries read HF_HOME at import time, this assignment comes too
# late to take effect — consider moving it before the import. TODO confirm.
os.environ['HF_HOME'] = '/run/cache/'

In [9]:
def search(query):
    """Look up the FAQ entries that best match ``query``.

    Delegates to the module-level ``index`` object, restricting hits to the
    'data-engineering-zoomcamp' course and asking for 5 results. The
    'question' field is boosted by 3.0 and the 'section' field by 0.5.

    Args:
        query: The search text.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    return index.search(
        query = query,
        filter_dict = {'course': 'data-engineering-zoomcamp'},
        boost_dict = {'question': 3.0, 'section': 0.5},
        num_results = 5,
    )

def rag(query):
    """Answer ``query`` by chaining the three pipeline stages.

    The query is passed to ``search``; the hits and the query go to
    ``build_prompt``; the resulting prompt goes to ``llm``.

    Args:
        query: The user's question.

    Returns:
        The value returned by ``llm`` for the assembled prompt.
    """
    return llm(build_prompt(query, search(query)))

def build_prompt(query, search_results):
    """Assemble the LLM prompt from the question and the retrieved documents.

    Each document contributes a ``section`` / ``question`` / ``answer`` stanza
    (read from its 'section', 'question' and 'text' keys) followed by a blank
    line. The stanzas are concatenated into the CONTEXT slot of the template
    and surrounding whitespace is stripped from the final prompt.

    Args:
        query: The user's question, placed in the QUESTION slot.
        search_results: Iterable of dicts with 'section', 'question' and
            'text' keys.

    Returns:
        The formatted prompt string.
    """
    # The eight-space indent on the continuation lines is part of the prompt
    # text itself, so it is spelled out explicitly here.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "        Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "        QUESTION: {question}\n"
        "\n"
        "        CONTEXT: {context}"
    )

    stanzas = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question = query, context = stanzas).strip()

def llm(prompt, generate_params = None):
    """Generate a text completion for ``prompt`` with the module-level model.

    This is the single definition of ``llm``; an earlier one-argument variant
    was shadowed by this one and has been removed. Calls of the form
    ``llm(prompt)`` keep working because ``generate_params`` is optional.

    Args:
        prompt: The text to feed to the model.
        generate_params: Optional dict overriding the generation settings.
            Recognised keys and their defaults: ``max_length`` (100),
            ``num_beams`` (5), ``do_sample`` (False), ``temperature`` (1.0),
            ``top_k`` (50), ``top_p`` (0.95). Other keys are ignored.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    if generate_params is None:
        generate_params = {}

    # Send the inputs to whatever device the model reports instead of
    # hard-coding "cuda", so the same code runs on CPU-only and GPU hosts.
    input_ids = tokenizer(prompt, return_tensors = "pt").input_ids.to(model.device)

    outputs = model.generate(
        input_ids,
        max_length = generate_params.get("max_length", 100),
        num_beams = generate_params.get("num_beams", 5),
        do_sample = generate_params.get("do_sample", False),
        temperature = generate_params.get("temperature", 1.0),
        top_k = generate_params.get("top_k", 50),
        top_p = generate_params.get("top_p", 0.95),
    )

    return tokenizer.decode(outputs[0], skip_special_tokens = True)

In [10]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the request with a timeout so a stalled connection cannot hang the
# notebook, and fail loudly on an HTTP error instead of letting an error page
# reach .json() and surface as a confusing decode failure.
docs_response = requests.get(docs_url, timeout = 30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of documents, tagging each
# document with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# "question", "text" and "section" are registered as text fields; "course" is
# registered as a keyword field (search() filters on it).
index = minsearch.Index(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

index.fit(documents)

<minsearch.Index at 0x1bf17608880>

In [11]:
# Tokenizer and model are loaded from the same checkpoint, so the name is
# kept in one place.
MODEL_NAME = "google/flan-t5-xl"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
# device_map = "auto" leaves weight placement to the library.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME, device_map = "auto")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some parameters are on the meta device device because they were offloaded to the cpu and disk.


In [12]:
# Run the full pipeline (search -> build_prompt -> llm) on a sample question;
# the returned answer is displayed as the cell output.
rag("I just discovered the course. Can I still join it?")



"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."