In [1]:
import os

In [2]:
# os.environ

In [3]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'

# Bound the wait and surface HTTP errors explicitly, so a failed download
# stops here instead of an error page being handed to the JSON parser.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course payload into one list of FAQ records, tagging each
# record with the name of the course it came from. Each record is copied so
# `documents_raw` is left untouched and the cell can be re-run safely.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

In [4]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200')

# Index definition: one shard, no replicas. `course` is the only keyword
# field; the remaining fields are mapped as text.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "text"},
        }
    },
}

index_name = "course-questions"

# Create the index only when it is missing, so re-running this cell does not
# attempt to create it a second time.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(
        index=index_name,
        settings=index_settings["settings"],
        mappings=index_settings["mappings"],
    )
    print(f"Index '{index_name}' created successfully!")
else:
    print(f"Index '{index_name}' already exists")

# Final existence check, whichever branch ran above.
if es_client.indices.exists(index=index_name):
    print(f"✓ Index '{index_name}' is ready to use!")

Index 'course-questions' already exists
✓ Index 'course-questions' is ready to use!


In [5]:
# Peek at the first flattened record to confirm the fields each document carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
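# One-time indexing step. It is left commented out here, presumably because
# the index was already populated when this notebook was last run. Uncomment
# and run it once against a freshly created (empty) index.
# from tqdm.auto import tqdm

# for doc in tqdm(documents):
#     es_client.index(index=index_name, document=doc)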

In [7]:
def elastic_search(query, course="machine-learning-zoomcamp", size=3):
    """Search the FAQ index and return the best-matching documents.

    The query text is matched against the ``question`` and ``text`` fields
    (``question`` boosted 4x, ``best_fields`` scoring).

    Args:
        query: Free-text user question.
        course: Exact value of the ``course`` field to restrict results to.
            Pass ``None`` to search across every course.
        size: Maximum number of hits to return.

    Returns:
        A list of dicts, one per hit: a copy of the stored document with the
        relevance score added under the ``_score`` key.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^4", "text"],
                "type": "best_fields",
            }
        }
    }
    if course is not None:
        # Term filter: exact match on the course field, no effect on scoring.
        bool_query["filter"] = {"term": {"course": course}}

    # Keyword arguments rather than `body=`, matching the style already used
    # for `indices.create` in this notebook.
    response = es_client.search(
        index=index_name,
        size=size,
        query={"bool": bool_query},
    )

    # Build new dicts so the hit payloads in `response` are not modified.
    return [
        {**hit["_source"], "_score": hit["_score"]}
        for hit in response["hits"]["hits"]
    ]

In [8]:
# Sample user question used by the remaining cells of the notebook.
query = "How to copy a file to a Docker container?"

In [9]:
# Retrieve the hits for the sample question and display the third one.
# Note: indexing [2] raises IndexError if fewer than three hits come back.
search_results = elastic_search(query)
search_results[2]

{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
 'section': '5. Deploying Machine Learning Models',
 'question': 'How do I debug a docker container?',
 'course': 'machine-learning-zoomcamp',
 '_score': 60.077137}

In [10]:
def build_prompt(query, search_results):
    """Render the LLM prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts that each provide ``question`` and
            ``text`` keys; every entry becomes one ``Q:``/``A:`` pair in the
            context section.

    Returns:
        The filled-in prompt with leading and trailing whitespace stripped.
    """
    # Written line by line so the trailing space after "CONTEXT:" is explicit.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "QUESTION: {question}\n"
        "CONTEXT: \n"
        "{context}"
    )

    faq_entries = "".join(
        f"Q: {entry['question']}\nA: {entry['text']}\n\n"
        for entry in search_results
    )

    return template.format(question=query, context=faq_entries).strip()

In [11]:
# Build the prompt for the sample question; the cell displays its length in characters.
prompt = build_prompt(query, search_results)
len(prompt)

1459

In [12]:
import openai
from openai import OpenAI

# Shared OpenAI client used by llm_1. No credentials are passed here —
# presumably the SDK picks them up from the environment; confirm before running.
client = OpenAI()

def llm_1(prompt, model='gpt-4o'):
    """Send ``prompt`` to the OpenAI chat completions API and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Name of the chat model to call; defaults to ``'gpt-4o'``.

    Returns:
        The message content of the first choice in the response.
    """
    response_1 = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response_1.choices[0].message.content

In [13]:
import tiktoken

# Number of tokens the prompt encodes to under the encoding tiktoken selects for "gpt-4o".
token_count = len(tiktoken.encoding_for_model("gpt-4o").encode(prompt))
token_count

323

In [14]:
import google.generativeai as genai

def llm_2(prompt, model='gemini-1.5-flash-latest'):
    """Send ``prompt`` to a Gemini model and return the generated text.

    Args:
        prompt: Text passed to ``generate_content``.
        model: Name of the Gemini model to instantiate; defaults to
            ``'gemini-1.5-flash-latest'``.

    Returns:
        The ``text`` attribute of the generation response.

    Raises:
        KeyError: If the ``GOOGLE_API_KEY`` environment variable is not set.
    """
    # The key is read from the environment on every call, never hard-coded.
    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
    response_2 = genai.GenerativeModel(model).generate_content(prompt)
    return response_2.text

In [15]:
def rag_1(query):
    """Answer ``query`` with the retrieve -> prompt -> generate pipeline, using ``llm_1``.

    Looks up FAQ entries with ``elastic_search``, turns them into a prompt
    with ``build_prompt`` and returns whatever ``llm_1`` produces for it.
    """
    retrieved = elastic_search(query)
    return llm_1(build_prompt(query, retrieved))

In [16]:
def rag_2(query):
    """Answer ``query`` with the retrieve -> prompt -> generate pipeline, using ``llm_2``.

    Looks up FAQ entries with ``elastic_search``, turns them into a prompt
    with ``build_prompt`` and returns whatever ``llm_2`` produces for it.
    """
    retrieved = elastic_search(query)
    return llm_2(build_prompt(query, retrieved))

In [17]:
# Run the full pipeline for the sample question with the OpenAI-backed generator.
rag_1(query)

'To copy a file to a Docker container, you can use the `docker cp` command. The basic syntax is:\n\n```bash\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```\n\nThis command allows you to transfer files or directories from your local machine into a running Docker container.'

In [18]:
# Run the same pipeline with the Gemini-backed generator.
rag_2(query)

'To copy a file or directory from your local machine to a running Docker container, use the `docker cp` command.  The syntax is:  `docker cp /path/to/local/file_or_directory container_id:/path/in/container`\n'