## ground-truth-data

In [18]:
import os
import json

# All data paths are anchored on the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), '..'))
data_path = os.path.join(PROJECT_ROOT, 'data', 'data.jsonl')
output_path = os.path.join(PROJECT_ROOT, 'data', 'data.csv')

print(f"Loading document chunks from: {data_path}")

# Read the JSONL file one record per line, dropping the 'embedding' field
# from each record as it is loaded.
documents = []
with open(data_path, 'rt', encoding='utf-8') as f_in:
    for line in f_in:
        # A blank or whitespace-only line is not a JSON document and would
        # make json.loads raise, so skip it instead of aborting the load.
        if not line.strip():
            continue
        doc = json.loads(line)
        doc.pop('embedding', None)
        documents.append(doc)

print(f"Successfully loaded {len(documents)} document chunks (with 'embedding' field removed).")

# Checking first document to confirm
if documents:
    print("\nDocuments keys loaded:")
    print(list(documents[0].keys()))

Loading document chunks from: /workspaces/knowledge-base-assistant/data/data.jsonl
Successfully loaded 217 document chunks (with 'embedding' field removed).

Documents keys loaded:
['document_metadata', 'chunk_id', 'section_title', 'chunk_type', 'page_number', 'content', 'id', 'title', 'url', 'text']


In [20]:
import pandas as pd

df = pd.DataFrame(documents)

# Expand the nested metadata dicts into their own columns, renaming the two
# that would otherwise clash with the chunk-level 'title' and 'url' columns.
metadata_df = pd.json_normalize(df['document_metadata']).rename(
    columns={'title': 'document_title', 'url': 'document_url'}
)
df = df.drop(columns=['document_metadata']).join(metadata_df)

# Drop the embedding column if one is present.
if 'embedding' in df.columns:
    df = df.drop(columns=['embedding'])
    print("'embedding' column removed.")

df.to_csv(output_path, index=False, encoding='utf-8')
print(f"\nSuccessfully saved {len(df)} records to: {output_path}")


Successfully saved 217 records to: /workspaces/knowledge-base-assistant/data/data.csv


In [1]:
# documents[0]

In [2]:
# from collections import defaultdict

In [3]:
# hashes = defaultdict(list)

# for doc in documents:
#     doc_id = doc['id']
#     hashes[doc_id].append(doc)

# len(hashes), len(documents)

In [9]:
from openai import OpenAI
# Shared API client used by the LLM calls in the cells below.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment (e.g. OPENAI_API_KEY) — confirm before running.
client = OpenAI()

In [10]:
# Prompt used to generate five evaluation questions for one document chunk.
# It is rendered with str.format, so {title}, {section_title}, {page_number}
# and {content} are placeholders, while the doubled braces in the OUTPUT FORMAT
# example are escapes that come out as literal single braces.
# .strip() removes the leading and trailing newlines of the triple-quoted literal.
prompt_template = """
You are an AI assistant tasked with creating a high-quality ground-truth dataset for evaluating a Retrieval-Augmented Generation (RAG) system.
Your role is to act as an inquisitive researcher or engineer and generate **5 distinct, realistic, and high-quality questions** that can be **completely answered** using only the information in the provided CONTEXT.

### RULES:
1. **Exactly Five Questions**: Produce precisely 5 questions, no more, no less.
2. **Grounded in Context**: Each question must be answerable *exclusively* from the provided CONTEXT. Do not introduce outside knowledge.
3. **Realistic User Queries**: Formulate questions as if they were asked by a real person seeking information. Avoid exam-style or trivial “fill-in-the-blank” questions.
4. **Clarity and Precision**: Questions should be well-structured, specific, and self-contained. Avoid ambiguity and overly short phrasing.
5. **No Copy-Paste**: Do not copy sentences verbatim from the CONTEXT. Always rephrase naturally, ensuring the question sounds conversational.
6. **Variety of Question Types**: Mix styles and intents (e.g., definitions, comparisons, processes, causes/effects, implications, factual lookups). At least one “how/why” style question is required.
7. **Balanced Scope**: Avoid questions that are either too broad ("Explain everything about…") or too narrow ("What is the third word in…"). Each question should target a meaningful, self-contained piece of information.

### CONTEXT:
---
Source Document: {title}
Section: {section_title} (Page {page_number})
Content: {content}
---

### OUTPUT FORMAT:
Return the result as a single valid parsable JSON (without markdown/code block formatting), exactly in this structure:
{{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
""".strip()

In [11]:
# Render the template for the first document as a sample prompt. The dict is
# unpacked as keyword arguments, so its keys must cover every placeholder.
prompt = prompt_template.format(**documents[0])

In [12]:
def llm(prompt, model='gpt-4o-mini'):
    """Send `prompt` as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only message in the request.
        model: Name of the chat model to call.

    Returns:
        The `content` of the first choice's message in the response.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [40]:
# One-off call to inspect the raw model reply for the sample prompt.
questions = llm(prompt)

In [41]:
# Check that the raw reply parses as JSON and display the parsed result.
json.loads(questions)

{'questions': ['What is the main factor contributing to safety incidents in high-risk industries according to the research mentioned?',
  'Can you explain what Crew Resource Management (CRM) refers to in the context of non-technical skills?',
  'What is the title of the program developed to enhance situational awareness competencies in employees?',
  'How does the Permanent Attention program aim to improve situational awareness among front-line employees?',
  'Which fields did the program draw best practices from to achieve improvements in non-technical skills?']}

In [22]:
import json
import time

MAX_RETRIES = 3


def _parse_questions(raw):
    """Parse `raw` as JSON and return it if it is a dict with a 'questions' list.

    Raises:
        ValueError: if `raw` is not valid JSON (json.JSONDecodeError is a
            ValueError subclass) or does not have the expected structure.
        TypeError: if `raw` is not a str/bytes value (for example None).
    """
    parsed = json.loads(raw)
    if not (isinstance(parsed, dict) and "questions" in parsed):
        raise ValueError("Invalid JSON structure, missing 'questions' key")
    if not isinstance(parsed["questions"], list):
        raise ValueError("Invalid JSON structure, 'questions' is not a list")
    return parsed


def safe_generate_questions(doc, max_retries=MAX_RETRIES):
    """
    Generate questions for `doc` and ensure valid JSON output.

    Each attempt generates a fresh reply and tries to parse it. If that fails,
    the model is asked once to repair the malformed reply and the repaired
    text is parsed immediately, so the repair call actually counts.

    NOTE(review): `generate_questions` is not defined in any visible cell. It
    must already exist in the kernel (presumably it formats `prompt_template`
    with `doc` and calls `llm`) — confirm and restore its defining cell.

    Args:
        doc: Document dict; `doc['id']` is used in the failure message.
        max_retries: Maximum number of generate-then-repair attempts.

    Returns:
        A dict with a 'questions' list. If every attempt fails, returns
        {"questions": []} so the surrounding pipeline can continue.
    """
    for attempt in range(1, max_retries + 1):
        questions_raw = generate_questions(doc)

        try:
            return _parse_questions(questions_raw)
        except (ValueError, TypeError) as e:
            print(f"[Attempt {attempt}/{max_retries}] JSON parsing failed: {e}")

        # Ask the model to strictly fix the format of the reply that failed.
        fix_prompt = f"""
        The following text was supposed to be valid JSON but is malformed:

        {questions_raw}

        Please return ONLY a valid JSON object in the form:
        {{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
        """
        try:
            # Parse the repaired reply right away; previously it was discarded
            # when the next iteration regenerated from scratch.
            return _parse_questions(llm(fix_prompt))
        except (ValueError, TypeError) as e:
            print(f"[Attempt {attempt}/{max_retries}] JSON repair failed: {e}")

        # Wait a bit between retries, but not after the final attempt.
        if attempt < max_retries:
            time.sleep(1)

    # If still failing after retries, log and skip
    print(f"❌ Failed to get valid JSON after {max_retries} attempts for doc {doc['id']}")
    return {"questions": []}  # fallback so pipeline continues

In [23]:
from tqdm.auto import tqdm

In [24]:
# Maps each document id to its list of generated questions. The generation
# loop skips ids already present here, so leaving this cell un-rerun lets an
# interrupted loop pick up where it stopped.
results = {}

In [25]:
# Generate questions for every document that does not have an entry yet.
for doc in tqdm(documents):
    doc_id = doc['id']
    # Ids already in `results` are left untouched so finished work is not redone.
    if doc_id not in results:
        questions = safe_generate_questions(doc)
        results[doc_id] = questions['questions']

  0%|          | 0/217 [00:00<?, ?it/s]

[Attempt 1/3] JSON parsing failed: Expecting ',' delimiter: line 1 column 488 (char 487)


In [32]:
# import json
# import re
# import time
# from tqdm.auto import tqdm

# MAX_RETRIES = 3

# def auto_fix_json(bad_json: str) -> str:
#     """
#     Attempt to fix JSON formatting issues without calling LLM again.
#     - Removes markdown/code fences
#     - Strips extra text before/after JSON
#     - Fixes trailing commas
#     """
#     # Remove code fences (```json ... ```)
#     cleaned = re.sub(r"^```(?:json)?|```$", "", bad_json.strip(), flags=re.MULTILINE).strip()

#     # Extract the first {...} block
#     match = re.search(r"\{.*\}", cleaned, re.DOTALL)
#     if match:
#         cleaned = match.group(0)

#     # Fix trailing commas before ] or }
#     cleaned = re.sub(r",\s*([\]}])", r"\1", cleaned)

#     return cleaned


# def safe_generate_questions(doc, max_retries=MAX_RETRIES):
#     """
#     Generate questions and ensure valid JSON output.
#     Retries with LLM reformatting if JSON decoding fails,
#     and finally tries local auto-fix before giving up.
#     """
#     questions_raw = None
#     for attempt in range(1, max_retries + 1):
#         questions_raw = generate_questions(doc)

#         try:
#             questions = json.loads(questions_raw)
#             if isinstance(questions, dict) and "questions" in questions:
#                 return questions
#             else:
#                 raise ValueError("Invalid JSON structure, missing 'questions' key")

#         except Exception as e:
#             print(f"[Attempt {attempt}/{max_retries}] JSON parsing failed: {e}")

#             # Retry with LLM fix
#             fix_prompt = f"""
#             The following text was supposed to be valid JSON but is malformed:

#             {questions_raw}

#             Please return ONLY a valid JSON object in the form:
#             {{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
#             """
#             response = client.chat.completions.create(
#                 model="gpt-4o-mini",
#                 messages=[{"role": "user", "content": fix_prompt}]
#             )
#             questions_raw = response.choices[0].message.content

#             time.sleep(1)

#     # Final attempt: local auto-fix
#     try:
#         fixed = auto_fix_json(questions_raw)
#         questions = json.loads(fixed)
#         if isinstance(questions, dict) and "questions" in questions:
#             print(f"✅ Auto-fixed JSON for doc {doc['id']}")
#             return questions
#     except Exception as e:
#         print(f"❌ Auto-fix failed for doc {doc['id']}: {e}")

#     # If still bad, return empty structure
#     return {"questions": []}


# results = {}

# for doc in tqdm(documents): 
#     doc_id = doc['id']
#     if doc_id in results:
#         continue

#     questions = safe_generate_questions(doc)
#     results[doc_id] = questions['questions']


In [33]:
# Flat list of (document id, question) tuples, one entry per generated question.
final_results = []

In [27]:
# Flatten {doc_id: [question, ...]} into (doc_id, question) tuples.
# The list is rebuilt from scratch rather than appended to, so re-running this
# cell gives the same result instead of duplicating every row.
final_results = [
    (doc_id, q)
    for doc_id, questions in results.items()
    for q in questions
]

In [28]:
# Peek at the first (document id, question) pair.
final_results[0]

('a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001',
 'What role do non-technical skills play in safety incidents in high-risk industries?')

In [None]:
import pandas as pd

In [29]:
# Tabulate the (id, question) pairs, one row per question.
df_results = pd.DataFrame(
    data=final_results,
    columns=['id', 'question'],
)

In [30]:
# Write next to the other data files, anchored on PROJECT_ROOT like the earlier
# CSV export, so the destination does not depend on the working directory at
# the time this cell runs. The encoding is stated explicitly to match that export.
df_results.to_csv(
    os.path.join(PROJECT_ROOT, 'data', 'ground-truth-retrieval.csv'),
    index=False,
    encoding='utf-8',
)

In [31]:
!head ../data/ground-truth-retrieval.csv

id,question
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001,What role do non-technical skills play in safety incidents in high-risk industries?
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001,Can you explain what the Permanent Attention program aims to achieve?
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001,How does the Permanent Attention program improve situational awareness competencies?
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001,What are the key components that the Permanent Attention program is based on?
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1001,Why is situational awareness considered the most influential non-technical skill in the context of this research?
a-disruptive-approach-to-crm-and-situational-awareness-competenc_PAGE1002,What is the main focus of the paper discussed in the document?
a-disruptive-approach-to-crm-and-situational-aware