In [28]:
import os
from pathlib import Path
import sys


# Directory two levels above the notebook's current working directory.
ROOT_DIR = Path(".").absolute().parent.parent
# PROJECT_DIR = Path(ROOT_DIR, "project")
# Make ROOT_DIR importable; a later cell imports from the `project.src...` package.
sys.path.append(str(ROOT_DIR))

In [19]:
import torch
import transformers

In [20]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device `{device}`")

Using device `cpu`


## Starter RAG Flow

### load documents

In [21]:
import json

In [22]:
DATA_DIR = Path(ROOT_DIR, "data")
assert DATA_DIR.exists(), f"Data directory not found: {DATA_DIR}"

# Read all documents. The encoding is pinned explicitly so that decoding the
# JSON file does not depend on the platform's default locale encoding.
with open(Path(DATA_DIR, "documents.json"), "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

# Collect all documents from all the courses into one flat list.
# Every document is copied and tagged with the name of its course, so `docs_raw`
# is left untouched and re-running this cell always yields the same result.
documents = [
    {**doc, "course": course_dict["course"]}
    for course_dict in docs_raw
    for doc in course_dict["documents"]
]

print(f"# of Documents processed: {len(documents)}")

# of Documents processed: 948


### Index data with Elastic

In [42]:
from elasticsearch import Elasticsearch
from tqdm import tqdm

In [44]:
!curl http://elasticsearch:9200

{
  "name" : "b9dcb48aa018",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "fUjwSc3BT9KhhbpJky3U9w",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [45]:
# Client for the Elasticsearch node reachable at http://elasticsearch:9200
# (the same address the `curl` check in the previous cell queries).
es_client = Elasticsearch("http://elasticsearch:9200")

In [46]:
# Elastic config definition: a single-shard index without replicas.
# `text`, `section` and `question` are full-text fields; `course` is a keyword
# field, which is what the exact-match `term` filter in `elastic_search` relies on.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    },
}

index_name = "course-descripiton"

# Drop the index left over from a previous run so this cell can be re-executed.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index. Any existing index was deleted just above, so a failure here
# is a genuine error (connection problem, invalid settings, ...): report it and
# re-raise instead of hiding it behind an "already exists" message.
try:
    es_client.indices.create(
        index=index_name,
        body=index_settings
    )
except Exception as exc:
    print(f"Index creation failed: {exc!r}")
    raise
else:
    print("Index successfully created")

Index successfully created


In [47]:
# Add document to index one by one: every iteration issues a separate
# `es_client.index` request against `index_name`; tqdm shows the progress.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 480.76it/s]


In [48]:
def elastic_search(
    query: str,
    size: int = 5,
    question_weight: int = 3,
    course: str = "machine-learning-zoomcamp"
) -> list[dict]:
    """Search the Elasticsearch index for FAQ entries of one course.

    Args:
        query: Free-text user question.
        size: Maximum number of hits requested.
        question_weight: Boost applied to the ``question`` field relative to ``text``.
        course: Value the ``course`` field must equal (exact ``term`` filter).

    Returns:
        The ``_source`` document of every hit, in the order of the response.
    """
    # Full-text part of the request: match against the boosted question and the text.
    text_match = {
        "multi_match": {
            "query": query,
            "fields": [f"question^{question_weight}", "text"],
            "type": "best_fields",
        }
    }
    # Restrict the hits to a single course.
    course_filter = {"term": {"course": f"{course}"}}

    search_query = {
        "size": size,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    response = es_client.search(index=index_name, body=search_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]

### Index with minsearch (without Elastic)

In [29]:
from project.src.external.minsearch import minsearch  # minsearch.Index

In [30]:
# Fit our search: build a minsearch index over the FAQ documents.
# "question", "text" and "section" are registered as text fields and "course"
# as a keyword field (it is the field `index_search` filters on).
model_index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# The trailing `;` keeps the notebook from echoing the call's return value.
model_index.fit(documents);

In [31]:
def index_search(query: str, num_results: int = 5, course: str = "data-engineering-zoomcamp"):
    """Search the minsearch index (``model_index``) for FAQ entries of one course.

    Args:
        query: Free-text user question.
        num_results: Number of results requested from the index.
        course: Value used to filter on the ``course`` field.

    Returns:
        Whatever ``model_index.search`` returns for the request.
    """
    course_filter = {"course": course}
    # Matches in the question count more, matches in the section name count less.
    field_boosts = {"question": 3.0, "section": 0.3}
    return model_index.search(
        query=query,
        filter_dict=course_filter,
        boost_dict=field_boosts,
        num_results=num_results,
    )

### build a Prompt

In [32]:
# How a single retrieved FAQ entry is rendered inside the prompt's CONTEXT part.
context_template = """
Q: {question}
A: {text}
""".strip()

# The complete prompt: instructions, the user's question and the retrieved context.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.


QUESTION: {question}


CONTEXT:
{context}
""".strip()


def build_prompt(query: str, search_results: list[dict]) -> str:
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}`` in ``prompt_template``.
        search_results: Retrieved documents. Each one must provide the keys
            ``"question"`` and ``"text"``.

    Returns:
        The filled-in ``prompt_template`` with surrounding whitespace stripped.
        With no search results the prompt simply ends with ``CONTEXT:``.

    Raises:
        KeyError: If a document lacks the ``"question"`` or ``"text"`` key.
    """
    # One "Q: ... / A: ..." block per document, separated by a blank line.
    context = "\n\n".join(
        context_template.format(
            question=source_doc["question"],
            text=source_doc["text"],
        )
        for source_doc in search_results
    )
    return prompt_template.format(question=query, context=context).strip()

In [33]:
def llm(prompt: str) -> str:
    """Placeholder for the LLM call.

    This stub always raises; the model-specific sections further down
    redefine ``llm`` with a working implementation.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError()

### RAG

In [34]:
def rag(query: str, course: str = "machine-learning-zoomcamp") -> str:
    """Answer ``query`` with retrieval-augmented generation.

    Args:
        query: The user's question.
        course: Course whose FAQ entries are searched.

    Returns:
        The text returned by ``llm`` for the assembled prompt.
    """
    # Retrieval: look up matching FAQ entries through `index_search`.
    retrieved_docs = index_search(query, course=course)
    # Augmentation: put the question and the retrieved entries into one prompt.
    llm_prompt = build_prompt(query, search_results=retrieved_docs)
    # Generation: let the currently defined `llm` answer the prompt.
    return llm(llm_prompt)

## HuggingFace Flan-T5

In [65]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [66]:
# init HF model: load the tokenizer and the model of the "google/flan-t5-xl"
# checkpoint. `device_map=device` passes the device selected at the top of the
# notebook to `from_pretrained`.
tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-xl", device_map=device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  5.16it/s]


In [56]:
# Smoke-test the model on a short input: tokenize -> generate -> decode.
input_text = "how are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)

# `max_length=1024` is forwarded to `generate` as its length limit.
outputs = model.generate(input_ids, max_length=1024)
# Decode the first generated sequence, dropping special tokens.
result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Output: `{result}`")

Output: I'm fine.


In [99]:
def llm(prompt: str) -> str:
    """Answer ``prompt`` with the Flan-T5 ``model`` / ``tokenizer`` loaded above.

    The whole prompt is passed to the model in a single ``generate`` call.

    A previous version of this function tried to split the input into
    512-token chunks, but it used ``len(input_ids)`` and ``input_ids[i:i + 512]``
    on the ``(1, seq_len)`` tensor returned by the tokenizer, i.e. it sliced the
    batch axis. That always produced exactly one chunk holding the full prompt,
    so the loop and the final join were no-ops. They are removed here; results
    are unchanged. Real chunking along the token axis is deliberately not
    introduced: it would separate the question from its context and concatenate
    unrelated partial answers.

    Args:
        prompt: Full prompt text (question plus retrieved context).

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    # tokenize prompt
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # generate output tokens
    output = model.generate(input_ids, max_length=1024)

    # decode tokens back to words
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [111]:
def rag(query: str, course: str = "machine-learning-zoomcamp") -> str:
    """Run the retrieve -> prompt -> generate pipeline for ``query``.

    Args:
        query: The user's question.
        course: Course whose FAQ entries are searched.

    Returns:
        The answer produced by ``llm`` for the assembled prompt.
    """
    # 1. retrieve FAQ entries through `index_search`
    matching_entries = index_search(query, course=course)
    # 2. build the prompt and 3. hand it to the currently defined `llm`
    return llm(build_prompt(query, search_results=matching_entries))

In [102]:
# Try the pipeline end to end with a sample question (the course default of `rag` applies).
rag("Is is possible to join the course after it's start?")

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.'

## Phi 3 Mini

In [106]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)

torch.random.manual_seed(0)

# Single place for the checkpoint id so the model and tokenizer cannot drift apart.
PHI3_MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"

# NOTE(review): `trust_remote_code=True` runs modeling code downloaded from the
# Hub; consider pinning `revision=` to a reviewed commit.
# NOTE: `model` and `tokenizer` rebind the names used by the Flan-T5 section above.
model = AutoModelForCausalLM.from_pretrained(
    PHI3_MODEL_ID,
    device_map=device,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(PHI3_MODEL_ID)

# Sample chat: system prompt, one example exchange, then the question to answer.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant."},
    {"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"},
    {"role": "assistant", "content": "Sure! Here are some ways to eat bananas and dragonfruits together: 1. Banana and dragonfruit smoothie: Blend bananas and dragonfruits together with some milk and honey. 2. Banana and dragonfruit salad: Mix sliced bananas and dragonfruits together with some lemon juice and honey."},
    {"role": "user", "content": "What about solving an 2x + 3 = 7 equation?"},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Greedy decoding. `temperature` is intentionally not set: it is only used when
# sampling, and combining it with `do_sample=False` just triggers a warning.
generation_args = {
    "max_new_tokens": 1024,
    "return_full_text": False,  # return only the newly generated text
    "do_sample": False,
}

output = pipe(messages, **generation_args)
answer = output[0]['generated_text']
print(answer)

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3-mini-128k-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Downloading shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [07:13<00:00, 216.84s/it]
Loading checkpoint shards: 100%|███████████████████

 To solve the equation 2x + 3 = 7, follow these steps:

1. Subtract 3 from both sides of the equation:
   2x + 3 - 3 = 7 - 3
   2x = 4

2. Divide both sides of the equation by 2:
   2x/2 = 4/2
   x = 2

So, the solution to the equation 2x + 3 = 7 is x = 2.


In [109]:
def llm(prompt: str) -> str:
    """Answer ``prompt`` with the Phi-3 text-generation pipeline (``pipe``).

    Args:
        prompt: Text sent to the model as a single ``user`` chat message.

    Returns:
        The ``generated_text`` of the pipeline's first result.
    """
    # Wrap the prompt in a one-message chat and generate with the shared settings.
    chat = [{"role": "user", "content": prompt}]
    first_result = pipe(chat, **generation_args)[0]
    return first_result['generated_text']

In [114]:
def rag(query: str, course: str = "data-engineering-zoomcamp") -> str:
    """Answer ``query`` from the course FAQ: retrieve, build the prompt, generate.

    Args:
        query: The user's question.
        course: Course whose FAQ entries are searched.

    Returns:
        The answer produced by ``llm`` for the assembled prompt.
    """
    # Retrieval through `index_search`, restricted to the requested course.
    context_docs = index_search(query, course=course)
    # Prompt assembly from the question and the retrieved entries.
    full_prompt = build_prompt(query, search_results=context_docs)
    # Generation with the currently defined `llm`.
    return llm(full_prompt)

In [115]:
# Try the pipeline end to end with a sample question (the course default of `rag` applies).
rag(query="I've just discovered the course. Can I still join?")

' Yes, you can still join the course even if you discover it after the start date. You are eligible to submit homeworks, but remember to meet the deadlines for final projects.'

## Mistral-7B

In [132]:
# TODO: ...

## Other open-source LLM

In [35]:
from openai import OpenAI

In [36]:
# OpenAI client pointed at the `/v1/` endpoint of the server at http://ollama:11434
# instead of the default OpenAI API.
# NOTE(review): "ollama" looks like a placeholder key rather than a real secret — confirm.
openai_client = OpenAI(
    base_url="http://ollama:11434/v1/",
    api_key="ollama",
)

In [None]:
# Scratch call removed: `openai_client.chat.completions.create()` without the
# required `model` and `messages` arguments raises a TypeError and would stop
# "Restart & Run All". The real request is made inside `llm` below.

In [57]:
def llm(prompt: str) -> str:
    """Answer ``prompt`` with the ``llama3`` model through ``openai_client``.

    The request is a chat completion with a fixed system message followed by
    the prompt as the user message, sent with ``temperature=0.0``.

    Args:
        prompt: Text sent as the ``user`` message.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {
        "role": "system",
        "content": (
            "You are a helpful AI assistant, providing information about the course. "
            "Do not provide meta, just answer user questions."
        ),
    }
    user_message = {"role": "user", "content": prompt}

    response = openai_client.chat.completions.create(
        model="llama3",
        messages=[system_message, user_message],
        temperature=0.0,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [55]:
def rag(query: str, course: str = "data-engineering-zoomcamp") -> str:
    """Answer ``query`` from the course FAQ using Elasticsearch for retrieval.

    Args:
        query: The user's question.
        course: Course whose FAQ entries are searched.

    Returns:
        The answer produced by ``llm`` for the assembled prompt.
    """
    # Retrieval: query Elasticsearch through `elastic_search`.
    es_hits = elastic_search(query, course=course)
    # Augmentation: combine the question with the retrieved entries.
    rag_prompt = build_prompt(query, search_results=es_hits)
    # Generation: ask the currently defined `llm`.
    return llm(rag_prompt)

In [56]:
# Try the Elasticsearch + Ollama pipeline end to end with a sample question.
ans = rag("I've just found this course. Is it possible to join it now?")
print(ans)

Based on the context from the FAQ database, the answer to your question is:

Yes, it is possible to join the course now. The FAQ states that even if you don't register ahead of time, you're still eligible to submit the homeworks and can continue with the course at your own pace after it finishes.
