In [None]:
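# One-time setup: uncomment the lines below to download the minsearch module
# and the FAQ documents next to this notebook.
#import wget

#wget.download("https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py")
#wget.download("https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json")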

In [2]:
from openai import OpenAI
import json
from dotenv import load_dotenv

import minsearch

from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [3]:
# Load settings from the .env file one directory above this notebook.
# NOTE(review): presumably this supplies the OpenAI API key used by the client
# created further down — confirm.
load_dotenv('../.env')

True

In [4]:
# Read the raw FAQ dump. The encoding is given explicitly because JSON files
# are UTF-8, while open() otherwise falls back to the platform's locale
# encoding (not UTF-8 on a default Windows setup).
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [5]:
# Flatten the per-course FAQ lists into a single list, tagging every entry
# with the name of the course it belongs to.
documents = []

for course_entry in docs_raw:
    course_faqs = course_entry['documents']

    # Entries are tagged in place, so the dicts inside docs_raw gain the
    # 'course' key as well.
    for faq in course_faqs:
        faq['course'] = course_entry['course']

    documents.extend(course_faqs)
        

In [6]:
# In-memory index over the flattened FAQ entries: three free-text fields plus
# `course` as a keyword field (the field `search` filters on).
index = minsearch.Index(text_fields=["question", "text", "section"], keyword_fields=["course"])

# Kept as the last expression so the cell still displays the fitted index.
index.fit(documents)

<minsearch.Index at 0x2c0f12ae010>

In [7]:
# No credentials are passed explicitly.
# NOTE(review): presumably the client picks them up from the environment
# populated by load_dotenv above — confirm.
client = OpenAI()

In [34]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve FAQ entries for ``query`` from the in-memory minsearch index.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` keyword field must equal. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of entries requested from the index.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Per-field boost factors handed to the index; `text` is not listed.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [35]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after ``QUESTION:``.
        search_results: Iterable of dicts with the keys ``section``,
            ``question`` and ``text``; each becomes one block under
            ``CONTEXT:``.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    # The final strip() drops the blank line left after the last entry.
    return template.format(question=query, context="".join(entries)).strip()

In [10]:
def llm(prompt, model='gpt-3.5-turbo'):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Full prompt text; sent as the only message, with role "user".
        model: Name of the chat model to request. Defaults to
            'gpt-3.5-turbo', the value that used to be hard-coded.

    Returns:
        The ``content`` of the message in the first returned choice.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response.choices[0].message.content

In [11]:
def RAG(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    The query is passed to a retrieval function, the returned documents are
    turned into a prompt by ``build_prompt``, and the prompt is sent to
    ``llm``.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            documents handed to ``build_prompt``. When omitted, the
            minsearch-backed ``search`` is used; it is looked up at call
            time so a re-run of the ``search`` cell is picked up.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    if search_fn is None:
        search_fn = search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)

    return llm(prompt)


In [12]:
# Example question for the pipeline as defined so far (minsearch retrieval).
query = 'how do I run Docker?'


# print() shows the answer as plain text instead of a quoted string repr.
print(RAG(query))

To run Docker on Windows, you may encounter the error "the input device is not a TTY." The solution is to use `winpty` before the `docker` command. You can do this by running `winpty docker run -it ubuntu bash` or creating an alias by adding `alias docker='winpty docker'` to your `.bashrc` or `.bash_profile` file.


In [None]:
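# Run this in a terminal (not in the notebook) to start a local single-node
# Elasticsearch on port 9200 with security disabled, before running the cells below:
#docker run -it --rm --name elasticsearch -p 9200:9200 -p 9300:9300 -e "discovery.type=single-node" -e "xpack.security.enabled=false" docker.elastic.co/elasticsearch/elasticsearch:8.4.3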

In [17]:
# Client for the Elasticsearch server expected on localhost:9200
# (plain HTTP, no credentials passed).
es_client = Elasticsearch("http://localhost:9200")

In [18]:
# Ask the server for its basic cluster/version details; a response here also
# shows that the server behind es_client is reachable.
es_client.info()

ObjectApiResponse({'name': '35373feb37fe', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'T3O9qZ3xTeCZMuDewssaAQ', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [19]:
# Index definition: one shard and no replicas, with the three free-text
# fields mapped as `text` and `course` mapped as `keyword`.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# The index lives in the Elasticsearch server, not in the kernel, so it
# survives a kernel restart. Drop any leftover copy first: otherwise create()
# fails on a re-run, and re-indexing would add the documents a second time.
# ignore_unavailable=True makes the delete a no-op on the very first run.
es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Kept as the last expression so the cell still displays the server's response.
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
# Send every FAQ entry to the index, one request per document, with a
# progress bar.
# NOTE(review): no explicit id is passed, so re-running this cell presumably
# indexes the same documents again as new entries — confirm.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [25]:
# Question used further down to try the Elasticsearch-backed pipeline.
query = 'I just discovered the course. Can I still join it?'

In [40]:
def elastic_search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve FAQ entries for ``query`` from the Elasticsearch index.

    Args:
        query: Free-text question to look up.
        course: Value the ``course`` field must equal (term filter).
            Defaults to 'data-engineering-zoomcamp', the value that used to
            be hard-coded.
        num_results: Value sent as the request's ``size``. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` of every hit in the response, in the
        order the hits were returned.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts matches in the question field.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; hit metadata is dropped.
    return [hit['_source'] for hit in response['hits']['hits']]




In [41]:
def RAG(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation via Elasticsearch.

    Note: this rebinds the name ``RAG``, replacing the minsearch-backed
    definition from earlier in the notebook.

    Args:
        query: The user's question.
        search_fn: Optional callable that takes the query and returns the
            documents handed to ``build_prompt``. When omitted,
            ``elastic_search`` is used; it is looked up at call time so a
            re-run of the ``elastic_search`` cell is picked up.

    Returns:
        Whatever ``llm`` returns for the built prompt.
    """
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)

    return llm(prompt)


In [42]:
# Calls the RAG defined just above (Elasticsearch retrieval); as a bare last
# expression, the returned string is displayed as its repr.
RAG(query)

"Based on the provided context, yes, you can still join the course even if you didn't register. You are still eligible to submit homeworks. Just be aware of the deadlines for turning in the final projects and don't leave everything for the last minute."