In [18]:
import json
import pandas as pd

from tqdm.auto import tqdm
from collections import defaultdict
from openai import OpenAI
from dotenv import load_dotenv

In [2]:
with open('documents.json', 'r') as file:
    documents = json.load(file)

documents[10]

{'Category': 'Temporarily unable to work',
 'Question': "Can I get sick pay if I'm self-isolating?",
 'Answer': "Yes Statutory Sick Pay is available if you're self-isolating.",
 'Section': 'general claim benefits'}

In [3]:
import uuid

def generate_document_id(doc):
    # Combine relevant fields to create a base for the UUID, ensuring it captures the unique aspects of the document.
    combined = f"{doc['Category']}-{doc['Question']}-{doc['Answer'][:10]}-{doc['Section']}"
    
    # Generate a UUID based on the combined string
    document_id = str(uuid.uuid5(uuid.NAMESPACE_DNS, combined))
    
    return document_id

# Assign a UUID to each document
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [4]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [5]:
!head documents-with-ids.json

[
  {
    "Category": "Manage existing benefit",
    "Question": "How do I update my benefit information?",
    "Answer": "You can update your benefit information online through your account.",
    "Section": "general claim benefits",
    "id": "30eada08-5708-5c5c-9df8-0f7d5d4dc131"
  },
  {
    "Category": "Manage existing benefit",


In [8]:
uuid_groups = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    # Generate a UUID for each document
    unique_id = uuid.uuid4()
    uuid_groups[unique_id].append(doc)

# Length comparison
print(len(uuid_groups), len(documents))

425 425


In [10]:
uuid_groups

defaultdict(list,
            {UUID('247582b2-29ea-4bac-9ab0-1bb93a3f5819'): [{'Category': 'Manage existing benefit',
               'Question': 'How do I update my benefit information?',
               'Answer': 'You can update your benefit information online through your account.',
               'Section': 'general claim benefits',
               'id': '30eada08-5708-5c5c-9df8-0f7d5d4dc131'}],
             UUID('35053a7c-9ac9-442f-a1a3-1ec8cc58d187'): [{'Category': 'Manage existing benefit',
               'Question': 'What if my circumstances change?',
               'Answer': 'Report changes in circumstances immediately to avoid issues.',
               'Section': 'general claim benefits',
               'id': '51fbbe84-2ac8-5daf-b0a5-8016ccd05b45'}],
             UUID('0d63a17c-610e-4dac-bab3-ffef2bc56cf6'): [{'Category': 'Manage existing benefit',
               'Question': 'Can I appeal a decision?',
               'Answer': 'Yes you can appeal within one month of the decision 

In [20]:
prompt_template = """
You act as a person who's curious about benefits and claims.
Formulate 5 questions this person might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {Section}
question: {Question}
answer: {Answer}
category: {Category}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [21]:
load_dotenv()
client = OpenAI()

In [22]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [23]:
questions = generate_questions(doc)
questions

'["What is the procedure to update my benefits info?",\n"Can I modify my benefit details through an online account?",\n"If I need to change my benefit information, what should I do?",\n"Is it possible to change my benefit information online?",\n"How can I update my benefit data?"]'

In [24]:
results = {}

In [25]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/425 [00:00<?, ?it/s]

In [27]:
import pickle

In [28]:
with open('results.bin', 'wb') as file:
    pickle.dump(results, file)

print("Data has been saved to 'data.pickle'")

Data has been saved to 'data.pickle'


In [30]:
with open('results.bin', 'rb') as file:
    loaded_data = pickle.load(file)

In [32]:
results['9c92da3c-fafb-5711-b057-88cd220754b5']

'["How do I inform you about a change in my earnings?", "What steps should I take to update my income details?", "Where can I submit a change to my salary information?", "How do I notify you of an adjustment in my wages?", "Which platform allows me to report income variations?"]'

In [37]:
results

{'30eada08-5708-5c5c-9df8-0f7d5d4dc131': '[\n    "How can I change my existing benefit details?",\n    "Where can I update my benefit information?",\n    "What is the process to update my benefit information?",\n    "Can I update my benefit details through my account?",\n    "Is it possible to modify my benefit information online?"\n]',
 '51fbbe84-2ac8-5daf-b0a5-8016ccd05b45': '[\n  "What actions should I take if my situation changes?",\n  "How do I handle changes in my living conditions?",\n  "What steps are necessary when my circumstances alter?",\n  "What should be done if there are changes in my circumstances?",\n  "How do I manage changes in my current situation?"\n]',
 '8d000ade-6c2b-571c-aa61-5d38eb463cf8': '[\n  "Is it possible to appeal?",\n  "What is the time limit for appealing?",\n  "When can I challenge a benefit decision?",\n  "How long do I have to file an appeal?",\n  "Can I contest the decision if I disagree?"\n]',
 '9c92da3c-fafb-5711-b057-88cd220754b5': '["How do I i

In [42]:
parsed_json = {}

for doc_id, json_results in results.items():
    parsed_json[doc_id] = json.loads(json_results)

In [43]:
parsed_json

{'30eada08-5708-5c5c-9df8-0f7d5d4dc131': ['How can I change my existing benefit details?',
  'Where can I update my benefit information?',
  'What is the process to update my benefit information?',
  'Can I update my benefit details through my account?',
  'Is it possible to modify my benefit information online?'],
 '51fbbe84-2ac8-5daf-b0a5-8016ccd05b45': ['What actions should I take if my situation changes?',
  'How do I handle changes in my living conditions?',
  'What steps are necessary when my circumstances alter?',
  'What should be done if there are changes in my circumstances?',
  'How do I manage changes in my current situation?'],
 '8d000ade-6c2b-571c-aa61-5d38eb463cf8': ['Is it possible to appeal?',
  'What is the time limit for appealing?',
  'When can I challenge a benefit decision?',
  'How long do I have to file an appeal?',
  'Can I contest the decision if I disagree?'],
 '9c92da3c-fafb-5711-b057-88cd220754b5': ['How do I inform you about a change in my earnings?',
  'W

In [47]:
doc_index = {d['id']: d for d in documents}

{'30eada08-5708-5c5c-9df8-0f7d5d4dc131': {'Category': 'Manage existing benefit',
  'Question': 'How do I update my benefit information?',
  'Answer': 'You can update your benefit information online through your account.',
  'Section': 'general claim benefits',
  'id': '30eada08-5708-5c5c-9df8-0f7d5d4dc131'},
 '51fbbe84-2ac8-5daf-b0a5-8016ccd05b45': {'Category': 'Manage existing benefit',
  'Question': 'What if my circumstances change?',
  'Answer': 'Report changes in circumstances immediately to avoid issues.',
  'Section': 'general claim benefits',
  'id': '51fbbe84-2ac8-5daf-b0a5-8016ccd05b45'},
 '8d000ade-6c2b-571c-aa61-5d38eb463cf8': {'Category': 'Manage existing benefit',
  'Question': 'Can I appeal a decision?',
  'Answer': 'Yes you can appeal within one month of the decision notice.',
  'Section': 'general claim benefits',
  'id': '8d000ade-6c2b-571c-aa61-5d38eb463cf8'},
 '9c92da3c-fafb-5711-b057-88cd220754b5': {'Category': 'Manage existing benefit',
  'Question': 'How can I rep

In [50]:
final_results = []

for doc_id, questions in parsed_json.items():
    section = doc_index[doc_id]['Section']
    for q in questions:
        final_results.append((q, section, doc_id))

In [55]:
df = pd.DataFrame(final_results, columns=['question', 'claims_type', 'document'])

In [56]:
df.to_csv('ground-truth-data.csv', index=False)