In [1]:
import json 

# Read the raw FAQ dump. The encoding is pinned to UTF-8 because the answers
# contain non-ASCII punctuation (curly quotes); without it open() falls back
# to the platform's locale encoding and may mis-decode the file.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

# Flatten the per-course structure into one list of FAQ records.
documents = []

for course in documents_raw:
    # The course name lives on the outer object, not on the individual records.
    course_name = course['course']

    for doc in course['documents']:
        # Tag each record (in place) with the course it belongs to.
        doc['course'] = course_name
        documents.append(doc)

In [2]:
# Peek at the first flattened record to confirm the 'course' key was attached.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
# Assign an id to each member of the array

import hashlib

def generate_document_id(doc):
    """Build a short identifier for a FAQ record.

    The course, the question and the first 10 characters of the answer text
    are joined with '-' and hashed with MD5; the first 8 hex digits of the
    digest are returned.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

# Stamp every record with its generated identifier.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [4]:
# Same record as before, now shown after the 'id' key has been added.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [5]:
# To see how unique the ids are

from collections import defaultdict

# Bucket the records by id; a bucket with more than one record is a collision.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [6]:
# Number of distinct ids vs. number of records; a difference means collisions.
len(hashes), len(documents)

(947, 948)

In [7]:
# Print only the ids that are shared by more than one record.
for hash_id, bucket in hashes.items():
    if len(bucket) <= 1:
        continue
    print(hash_id, len(bucket))

593f7569 2


In [8]:
# We have a collision: both records share the same course and question, and
# their answers start with the same 10 characters, which is the only part of
# the text that goes into the id. This will typically not happen in real-life
# scenarios.
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [9]:
# Save the updated document to json

# Serialise first, then write the resulting text in one go.
with open('documents-with-ids.json', 'wt') as f_out:
    f_out.write(json.dumps(documents, indent=2))

In [10]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


In [11]:
# Now let's generate 5 unique user questions for each record.
# Let's start by writing a prompt. The {section}, {question} and {text}
# placeholders are filled from a record's fields with str.format, and
# .strip() removes the leading/trailing newlines of the literal.

prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [12]:
from openai import OpenAI
# Client used for the chat-completion calls. No credentials are passed here —
# presumably the API key is picked up from the environment (TODO confirm).
client = OpenAI()

In [13]:
def generate_questions(doc, model='gpt-4o'):
    """Ask the chat model for user-style questions about one FAQ record.

    Parameters
    ----------
    doc : dict
        Record whose fields fill the placeholders of ``prompt_template``
        (``section``, ``question`` and ``text``). Extra keys are ignored by
        ``str.format``.
    model : str, optional
        Name of the chat model to query. Defaults to ``'gpt-4o'``, the value
        that was previously hard-coded, so existing calls behave the same.

    Returns
    -------
    The content of the first choice's message, returned unparsed. The prompt
    asks for a JSON array, so decoding is left to the caller.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [14]:
from tqdm.auto import tqdm

# Raw model replies, keyed by document id.
results = {}

for doc in tqdm(documents):
    # Records that share an id are only sent to the model once.
    if doc['id'] not in results:
        results[doc['id']] = generate_questions(doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [31:36<00:00,  2.00s/it]


In [15]:
# Inspect the raw replies: every value is still an unparsed JSON string.
results

{'c02e79ef': '[\n    "What is the exact start date and time of the course?",\n    "How can I keep track of the course schedule on my calendar?",\n    "What must I do to register for the course?",\n    "Where should I go to receive course-related announcements?",\n    "Which platform should I join to communicate with peers during the course?"\n]',
 '1f6520ca': '["What is necessary before joining the course?", "Where can I find information about course requirements?", "Are there any skills needed before starting this course?", "How can I check if I\'m prepared for the course?", "Do I need prior knowledge before this course?"]',
 '7842b56a': '[\n  "Is it possible to enroll in the course after it has commenced?",\n  "Am I allowed to submit homeworks if I join the course late?",\n  "Are there specific deadlines for submitting final projects, even if I start late?",\n  "Can I participate in the course without completing the registration process?",\n  "If I don\'t register on time, can I stil

In [29]:
# Persist the raw replies so the generation step does not have to be re-run.
with open('results.json', 'wt') as f_out:
    f_out.write(json.dumps(results, indent=2))

In [30]:
import pickle

# Reload the raw replies from the JSON file on disk.
with open("results.json", "r") as result_file:
    results = json.loads(result_file.read())

# Keep a pickled copy of the same mapping next to the JSON file.
with open("results.pkl", "wb") as pickle_file:
    pickle.dump(results, pickle_file)

In [31]:
# Spot-check one entry: the value is a JSON-encoded string, not a list yet.
results['c02e79ef']

'[\n    "What is the exact start date and time of the course?",\n    "How can I keep track of the course schedule on my calendar?",\n    "What must I do to register for the course?",\n    "Where should I go to receive course-related announcements?",\n    "Which platform should I join to communicate with peers during the course?"\n]'

In [32]:
# Decode every raw JSON reply; entries are filled one by one, so anything
# parsed before a malformed reply is still available afterwards.
parsed_results = {}

for doc_id in results:
    parsed_results[doc_id] = json.loads(results[doc_id])

In [33]:
# Inspect the decoded replies, keyed by document id.
parsed_results

{'c02e79ef': ['What is the exact start date and time of the course?',
  'How can I keep track of the course schedule on my calendar?',
  'What must I do to register for the course?',
  'Where should I go to receive course-related announcements?',
  'Which platform should I join to communicate with peers during the course?'],
 '1f6520ca': ['What is necessary before joining the course?',
  'Where can I find information about course requirements?',
  'Are there any skills needed before starting this course?',
  "How can I check if I'm prepared for the course?",
  'Do I need prior knowledge before this course?'],
 '7842b56a': ['Is it possible to enroll in the course after it has commenced?',
  'Am I allowed to submit homeworks if I join the course late?',
  'Are there specific deadlines for submitting final projects, even if I start late?',
  'Can I participate in the course without completing the registration process?',
  "If I don't register on time, can I still meet the homework require

In [35]:
# Look-up table from document id to its record (for a shared id the later
# record wins).
doc_index = {record['id']: record for record in documents}

# One (question, course, document id) row per generated question.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [36]:
# Inspect the flattened (question, course, document id) tuples.
final_results

[('What is the exact start date and time of the course?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('How can I keep track of the course schedule on my calendar?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('What must I do to register for the course?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('Where should I go to receive course-related announcements?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('Which platform should I join to communicate with peers during the course?',
  'data-engineering-zoomcamp',
  'c02e79ef'),
 ('What is necessary before joining the course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Where can I find information about course requirements?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Are there any skills needed before starting this course?',
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ("How can I check if I'm prepared for the course?",
  'data-engineering-zoomcamp',
  '1f6520ca'),
 ('Do I need prior knowledge before this cours

In [39]:
# Use pandas to save the results in csv format

import pandas as pd

df = pd.DataFrame(
    data=final_results,
    columns=['question', 'course', 'document'],
)

# index=False keeps the row index out of the CSV file.
df.to_csv('ground-truth-data.csv', index=False)

In [40]:
!head ground-truth-data.csv

question,course,document
What is the exact start date and time of the course?,data-engineering-zoomcamp,c02e79ef
How can I keep track of the course schedule on my calendar?,data-engineering-zoomcamp,c02e79ef
What must I do to register for the course?,data-engineering-zoomcamp,c02e79ef
Where should I go to receive course-related announcements?,data-engineering-zoomcamp,c02e79ef
Which platform should I join to communicate with peers during the course?,data-engineering-zoomcamp,c02e79ef
What is necessary before joining the course?,data-engineering-zoomcamp,1f6520ca
Where can I find information about course requirements?,data-engineering-zoomcamp,1f6520ca
Are there any skills needed before starting this course?,data-engineering-zoomcamp,1f6520ca
How can I check if I'm prepared for the course?,data-engineering-zoomcamp,1f6520ca
