In [2]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
n = len(documents)

for i in range(n):
    documents[i]['id'] = i

In [5]:
documents

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 0},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': 1},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that th

In [6]:
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text']}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [7]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [8]:
documents[8]

{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
 'section': 'General course-related questions',
 'question': 'Course - Can I get support if I take the course in the self-paced mode?',
 'course': 'data-engineering-zoomcamp',
 'id': '99647355'}

In [9]:
from collections import defaultdict

In [10]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [11]:
len(hashes), len(documents)

(948, 948)

In [12]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [13]:
hashes['ca3dc12d']

[]

In [14]:
import json

In [15]:
def clean_dict(d):
    return {k: v for k, v in d.items() if isinstance(k, (str, int, float, bool, type(None)))}

cleaned_documents = [clean_dict(doc) for doc in documents]

with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(cleaned_documents, f_out, indent=2)

In [16]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "a9c449b5"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [17]:
cleaned_documents [3]


{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '36538384'}

In [18]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [19]:
!pip install python-dotenv





In [20]:
import dotenv
print("dotenv is available")

dotenv is available


In [21]:
from dotenv import load_dotenv
import os

load_dotenv()  # Make sure .env is in the same folder

api_key = os.getenv("OPENAI_API_KEY")

if api_key:
    print("API key loaded:", api_key[:8] + "...")
else:
    print("API key not found")


API key loaded: sk-proj-...


In [22]:
from openai import OpenAI
client = OpenAI()


In [23]:
doc= documents[3]
prompt = prompt_template.format(**doc)

In [24]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?
answer: You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]


In [25]:
response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

json_response = response.choices[0].message.content


In [26]:
print(json_response)

["When will I receive a confirmation email after registering for the bootcamp?", "Is it necessary to register before starting the Data Engineering Bootcamp?", "Do I need to wait for confirmation before starting the bootcamp assignments?", "Is my registration status verified against a list before I can start the course?", "Is the registration process mandatory to participate in the bootcamp?"]


In [27]:
json.loads(json_response)

['When will I receive a confirmation email after registering for the bootcamp?',
 'Is it necessary to register before starting the Data Engineering Bootcamp?',
 'Do I need to wait for confirmation before starting the bootcamp assignments?',
 'Is my registration status verified against a list before I can start the course?',
 'Is the registration process mandatory to participate in the bootcamp?']

In [28]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response
    

In [29]:
from tqdm.auto import tqdm

In [30]:
results = {}


In [31]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [32]:
results

{'a9c449b5': '["What is the exact date and time the course will begin?", "How can I subscribe to the course\'s public Google Calendar?", "What should I do before the course starts?", "How can I join the Telegram channel for course announcements?", "Which platform should I join for course discussions?"]',
 'f8ea111e': '[\n    "What are the prerequisites for enrolling in this course?",\n    "Where can I find the prerequisites for this course?",\n    "Is there a specific GitHub link for course prerequisites?",\n    "How do I check the course requirements?",\n    "Where is the list of prerequisites for the course located?"\n]',
 'e4251647': '[\n    "Is registration required for homework submission in the course?",\n    "Can I enroll in the course once it has already begun?",\n    "Are there any deadlines I should be aware of for the course projects?",\n    "Can homework be submitted if I join the course after it starts?",\n    "What is necessary to keep in mind regarding the course\'s fina

In [None]:
results['eddfc068']

KeyError: '95025497'