In [None]:
# Delete any local copy of minsearch.py, then download the module again
# (presumably so the fresh download is saved under the same file name).
# NOTE(review): the URL tracks the `main` branch, so the fetched code is not
# pinned to a specific revision.
!rm -rf minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import requests 
import minsearch

# Download the FAQ documents as JSON. The response is a list of course
# entries, each holding a 'course' name and a list of 'documents'.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail fast with a clear HTTP error instead of a confusing JSON decode error
# when the download did not succeed.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of documents, tagging each
# document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

# "course" is indexed as a keyword field; search() filters on it.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

In [None]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the module-level ``index`` for documents matching ``query``.

    Args:
        query: Free-text query string.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``, the previously hard-coded filter.
        num_results: Maximum number of results to request. Defaults to 5.

    Returns:
        Whatever ``index.search`` returns; ``build_prompt`` iterates over it
        and reads the ``'question'`` and ``'text'`` keys of each item.
    """
    # Matches in "question" are weighted up and matches in "section" down;
    # "text" is not listed and keeps the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [None]:
import os

In [None]:
# Read the Hugging Face access token from the environment (None when unset).
token = os.getenv('HF_TOKEN')

In [None]:
from huggingface_hub import login

In [None]:
# Log in to the Hugging Face Hub with the token read from the HF_TOKEN
# environment variable above.
# NOTE(review): token is None when HF_TOKEN is unset; confirm how login()
# behaves in that case.
login(token=token)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
from transformers import BitsAndBytesConfig

# Single source of truth for the checkpoint used by both model and tokenizer.
MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# Load the model with 4-bit quantization. The quantization settings are passed
# through `quantization_config`; passing `load_in_4bit=True` directly to
# `from_pretrained` is deprecated.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side="left")

# Save local copies of the model and tokenizer under ./models.
model.save_pretrained("./models/mistral-7B-model")
tokenizer.save_pretrained("./models/mistral-7B-tokenizer")


In [None]:
from transformers import pipeline

# Wrap the loaded model and tokenizer in a text-generation pipeline; llm()
# calls this object to produce completions.
generator = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
)

In [None]:
def build_prompt(query, search_results):
    """Build the LLM prompt from a question and the retrieved documents.

    The prompt follows the same layout as the hand-written example prompt in
    this notebook: a ``QUESTION:`` line, a ``CONTEXT:`` section with one
    "question + text" entry per document, and a trailing ``ANSWER:`` cue for
    the model to continue from.

    Args:
        query: The user's question.
        search_results: Iterable of dicts with ``'question'`` and ``'text'``
            keys (as consumed below).

    Returns:
        The formatted prompt string, with no stray indentation.
    """
    # Built from explicit lines rather than an indented triple-quoted string,
    # which leaked the source indentation into the prompt.
    prompt_template = (
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}\n"
        "\n"
        "ANSWER:"
    )

    # One entry per document, separated by a blank line.
    context_str = "\n\n".join(
        f"{doc['question']}\n{doc['text']}" for doc in search_results
    )

    return prompt_template.format(question=query, context=context_str)

def llm(prompt, max_new_tokens=500):
    """Generate a completion for ``prompt`` with the module-level ``generator``.

    Args:
        prompt: Full prompt text to send to the text-generation pipeline.
        max_new_tokens: Upper bound on the number of *generated* tokens. The
            prompt length does not count against it, so long RAG prompts
            still leave room for an answer.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        leading and trailing whitespace.
    """
    response = generator(
        prompt,
        max_new_tokens=max_new_tokens,
        # temperature/top_p only take effect when sampling is enabled.
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        num_return_sequences=1,
        # Let the pipeline drop the prompt instead of slicing by len(prompt),
        # which breaks if the decoded text does not reproduce the prompt
        # character-for-character.
        return_full_text=False,
    )
    return response[0]['generated_text'].strip()

In [None]:
# Hand-written example prompt: a question, two FAQ entries as context, and a
# trailing "ANSWER:" cue for the model to continue from.

prompt = """
QUESTION: I just discovered the course. Can I still join?

CONTEXT:
Course - Can I still join the course after the start date?
Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave every assignment until the last minute.

Course - Can I follow the course after it finishes?
Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace.
You can also continue looking at the homeworks and continue preparing for the next cohort.

ANSWER:
"""


In [None]:
def rag(query):
    """Answer ``query`` by chaining retrieval, prompt building and generation.

    Calls ``search`` for the documents, ``build_prompt`` to format them with
    the question, and ``llm`` to produce the answer, which is returned.
    """
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))