In [1]:
import json
import hashlib
import dotenv
from openai import OpenAI
from collections import defaultdict
from tqdm.auto import tqdm
import pickle

## Step 1: Prepare the documents

In [2]:
# Read the raw FAQ export. The encoding is given explicitly so the file is
# decoded as UTF-8 regardless of the platform's locale default.
with open("documents.json", 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course structure into a single list of records, tagging
# each record with the name of the course it came from. The record dicts are
# the same objects held by docs_raw (they are annotated in place).
documents = []
for course in docs_raw:
    for doc in course['documents']:
        doc['course'] = course['course']
        documents.append(doc)

In [3]:
def generate_document_id(doc):
    """Return a short identifier for an FAQ record.

    The record's course, its question and the first 10 characters of its
    text are joined with dashes, hashed with MD5, and the first 8 hex
    digits of the digest are returned.
    """
    key_material = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(key_material.encode()).hexdigest()
    return digest[:8]

In [4]:
# Stamp every record with its content-derived identifier under the 'id' key.
for record in documents:
    record['id'] = generate_document_id(record)

In [5]:
# Save the flattened, id-annotated records as indented JSON.
with open('documents-with-ids.json', 'wt') as json_file:
    json_file.write(json.dumps(documents, indent=2))

In [6]:
# Prompt sent to the model for each FAQ record. The {section}, {question} and
# {text} placeholders are filled from a record's keys via str.format in
# generate_questions(); .strip() removes the leading and trailing newlines of
# the triple-quoted literal. The model is asked to reply with a JSON array of
# five question strings.
prompt_template = """
You emulate a student taking our course.
Formulate 5 questions this student might ask based on the FAQ record provided. The record should contain the answer to the questions, and the
questions should be complete and not too short. If possible, use as few words as possible from the record.

The record:
section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", "question3", "question4", "question5"]

""".strip()

In [7]:
# Load variables from the ../.env file (path is relative to the working directory).
# NOTE(review): presumably this supplies the API credentials used by the
# OpenAI client created later — confirm.
dotenv.load_dotenv('../.env')

True

In [8]:
# Module-level OpenAI client used by generate_questions(). It is constructed
# with no explicit arguments, so its configuration comes from the library's
# defaults (presumably environment variables — confirm).
ai_client = OpenAI()

In [9]:
def generate_questions(doc, prompt_template):
    """Ask the model to generate questions for one FAQ record.

    Args:
        doc: Mapping whose keys fill the placeholders of ``prompt_template``
            (it is expanded with ``str.format(**doc)``; extra keys are ignored).
        prompt_template: Format string used to build the user message.

    Returns:
        The value decoded from the model's JSON reply. The visible prompt asks
        for a JSON array of question strings, but the shape is not validated here.

    Raises:
        KeyError: If ``prompt_template`` references a key missing from ``doc``.
        json.JSONDecodeError: If the reply is not valid JSON even after an
            optional Markdown code fence has been removed.

    Uses the module-level ``ai_client``; errors raised by the API call propagate.
    """
    prompt = prompt_template.format(**doc)

    resp = ai_client.chat.completions.create(
        model='gpt-4o',
        messages=[{'role': 'user', 'content': prompt}],
    )
    content = resp.choices[0].message.content

    # Models sometimes wrap the answer in a Markdown code fence
    # (``` or ```json) even when told not to. Valid JSON never starts with a
    # backtick, so unwrapping only affects replies that would otherwise fail.
    if isinstance(content, str):
        unwrapped = content.strip()
        if unwrapped.startswith('```'):
            unwrapped = unwrapped.strip('`').strip()
            # Drop an optional language tag left over from the opening fence.
            if unwrapped[:4].lower() == 'json':
                unwrapped = unwrapped[4:]
            content = unwrapped

    return json.loads(content)

In [10]:
# Resume from a previous run when a results cache exists; otherwise start
# with an empty id -> questions mapping.
try:
    with open('results.pkl', 'rb') as cache_file:
        results = pickle.load(cache_file)
except FileNotFoundError:
    # No cache on disk yet.
    results = {}

In [12]:
# Generate questions for every record that does not yet have an entry in
# `results`. The run is best-effort: a failure on one record is reported,
# the progress so far is checkpointed to results.pkl, and the loop moves on.
for doc in tqdm(documents):
    # Named doc_id so the builtin id() is not shadowed for the rest of the session.
    doc_id = doc['id']

    # Already generated (loaded from the cache, or an earlier record with the same id).
    if doc_id in results:
        continue

    try:
        # The questions are also stored on the record itself under 'questions'.
        questions = doc['questions'] = generate_questions(doc, prompt_template)
        results[doc_id] = questions
    except Exception as e:
        # Report the failure instead of dropping it silently; tqdm.write
        # prints without breaking the progress bar.
        tqdm.write(f"Failed to generate questions for document {doc_id}: {e!r}")
        # Checkpoint what has been collected so far.
        with open('results.pkl', 'wb') as f_out:
            pickle.dump(results, f_out)

  0%|          | 0/948 [00:00<?, ?it/s]

In [13]:
# Persist the collected id -> questions mapping so a later run can resume from it.
with open('results.pkl', 'wb') as cache_file:
    pickle.dump(results, cache_file)

In [17]:
# Lookup table from document id to its record; if several records share an
# id, the last one in `documents` wins.
doc_index = dict((entry['id'], entry) for entry in documents)