# Generate Test Data

Notebook to generate the data required for testing technical knowledge.


## Setup


In [1]:
from openai import OpenAI, OpenAIError
import json
from pathlib import Path
import glob
import uuid
import sys

## Configurations


In [2]:
# System prompt sent as the first message of every conversation (see
# default_messages). It describes the behaviour of the multiple-choice test.
# NOTE(review): the topic list inside the prompt is shorter than TOPICS;
# confirm whether the two are meant to stay in sync.
SYSTEM_CONTEXT = """
You are a multiple-choice test that is assessing knowledge on the following topics:
- Computer Science
- Data Science
- Data Engineering
- Web Development
- AWS
- AI
- ML
- DevOps
- Platform Engineering
- React
- JavaScript
- Python
- Cloud Platforms

The test starts by assessing the breadth of knowledge. After these questions are completed, the user has the option to show the results or to continue on to the depth assessment. 

When assessing the breadth of knowledge ask 5 questions for each topic. Each question should get progressively more difficult. This should make the total question count for the breadth assessment 5 * the number of topics. To complete the breadth assessment all topics must be completed. Don't offer to show the results until all topics have been completed.

When assessing the depth of knowledge ask 50 questions for each topic. Each question should get progressively more difficult based on the accuracy of the previous response. If there are 5 wrong answers in a row, advance to the next topic. A user can choose to select a subset of the topics for the depth assessment.

After all topics have been assessed, a report is shown with the results. The report should include information about both the breadth and depth of knowledge on each topic. The report should include a scale showing the level of knowledge for each topic. The scale should be from 0 - 100. The report should conclude with recommendations for other related topics to test.

The difficulty level of questions should range from 1 - 5, with 1 being the easiest and 5 being the hardest.

To answer a question, a user specifies the letter of the answer they want to select. The user should say "?" if they do not know the answer.
""".strip()

# Topics to generate test data for, kept in alphabetical order. Each topic is
# lower-cased and joined with underscores to form its data file name, e.g.
# "Machine Learning" -> ./data/machine_learning_data.json.
TOPICS = [
    "Artificial Intelligence",
    "Automated Testing",
    "AWS",
    "Cloud Platforms",
    "Cloud Security",
    "Computer Science",
    "Continuous Delivery",
    "Continuous Deployment",
    "Continuous Integration",
    "Cybersecurity",
    "Data Analysis",
    "Data Engineering",
    "Data Science",
    "DevOps",
    "Enterprise Architecture",
    "Git",
    "Infrastructure as Code",
    "JavaScript",
    "Machine Learning",
    "Microservices",
    "Network Security",
    "Penetration Testing",
    "Platform Engineering",
    "Python",
    "React",
    "Site Reliability Engineering",
    "Software Architecture",
    "Software Engineering",
    "Solutions Architecture",
    "Web Development",
]

# Feature flags checked by is_enabled(). Uncomment features to enable them.
# Both are off by default, so running the notebook makes no API calls:
#   QUESTION_GENERATION - generate and transform the question files
#   CHOICE_GENERATION   - generate answer choices for existing questions
ENABLED_FEATURES = [
    # "QUESTION_GENERATION",
    # "CHOICE_GENERATION"
]

In [3]:
# Shared API client, created with no explicit arguments.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) - confirm against the OpenAI client documentation.
client = OpenAI()
# Default conversation prefix: a single system message carrying SYSTEM_CONTEXT.
default_messages = [{"role": "system", "content": SYSTEM_CONTEXT}]
# Model used by ask() unless the caller overrides it.
default_model = "gpt-4-turbo-preview"

## Utilities


In [4]:
def ask(question, messages=default_messages, model=default_model):
    """Send one user question to the chat model and return the conversation.

    Args:
        question: Text of the user message appended after ``messages``.
        messages: Messages sent ahead of the question. The list is copied,
            never mutated, so the shared default is safe to reuse.
        model: Name of the chat model to query.

    Returns:
        A new list containing ``messages``, the user question and the
        assistant reply, in that order. The reply content is whatever the
        API returned for the first choice.

    Raises:
        SystemExit: With status 1 when the OpenAI client raises
            ``OpenAIError``; the error is printed first.
    """
    conversation = [*messages, {"role": "user", "content": question}]

    # Only the API call is guarded, so unrelated errors are not mislabelled.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=conversation,
            # Ask the API for a JSON object so callers can json.loads the reply.
            response_format={"type": "json_object"},
        )
    except OpenAIError as e:
        # Handle all OpenAI API errors
        print(f"Error: {e}")
        # Non-zero status: a bare sys.exit() would report success.
        sys.exit(1)

    complete_message = response.choices[0].message.content
    conversation.append({"role": "assistant", "content": complete_message})
    return conversation


def generate_questions(topics):
    """Generate a question file for every topic that does not have one yet.

    For each topic the model is asked for 100 questions (20 per difficulty
    level) and the raw JSON reply is written to
    ``./data/<topic_key>_data.json``. Topics whose file already exists are
    skipped, so no API call is made for them.

    Args:
        topics: Iterable of topic names, e.g. ``"Machine Learning"``.
    """
    for current_topic in topics:
        current_key = "_".join(word.lower() for word in current_topic.split())
        data_file = Path(f"./data/{current_key}_data.json")

        # Never regenerate (and pay for) a topic that already has data.
        if data_file.exists():
            continue

        # The reply is a JSON object and transform_questions reads its
        # "questions" key, so the prompt names that key explicitly.
        current_messages = ask(
            f"Provide a comprehensive list of questions for the {current_topic} topic. Provide 100 questions total, with 20 questions for each difficulty level. Sort the list by difficulty level from easiest to hardest. The response should be a list of strings. The list only contains questions. Each question should be format as '[difficulty]: [question]'. Eg. '1: What does AI stand for?' Do not include the choices. Do not include the question number. Return a JSON object with a single key \"questions\" whose value is the list."
        )

        # Create ./data on first use instead of failing with FileNotFoundError.
        data_file.parent.mkdir(parents=True, exist_ok=True)
        data_file.write_text(current_messages[-1]["content"])


def generate_choices(topics):
    """Generate multiple-choice answers for questions that have none yet.

    Reads ``./data/<topic_key>_data.json`` for each topic, asks the model for
    four choices for every question whose id is not already present in the
    file's ``choices`` list, and rewrites the file after each new entry.

    Args:
        topics: Iterable of topic names whose data files already exist and
            contain a ``questions`` list of dicts with ``id`` and
            ``question`` keys.
    """
    # These prompt parts do not depend on the topic or question.
    prompt_message = "I'm creating a multiple choice test on a specific topic. Generate 4 choices for the provided question. There should be only one correct choice. Randomize the placement of the correct choice."
    prompt_format_instructions = 'Return the response as JSON. Example: { "choices": ["first example choice", "second example choice", ...otherChoices], "answer": <index of correct choice> }'

    for current_topic in topics:
        current_key = "_".join(word.lower() for word in current_topic.split())
        data_file = Path(f"./data/{current_key}_data.json")
        data = json.loads(data_file.read_text())
        questions = data["questions"]
        choices = data.setdefault("choices", [])
        # Set lookup keeps the "already answered" check O(1) per question.
        answered_ids = {choice["question_id"] for choice in choices}

        for question in questions:
            question_id = question["id"]
            if question_id in answered_ids:
                continue

            print(f"Generating choices ({current_topic}): {question['question']}")
            prompt_details = (
                f"Topic: {current_topic}\n\nQuestion: {question['question']}"
            )
            prompt = f"{prompt_message}\n\n{prompt_details}\n\n{prompt_format_instructions}"
            current_messages = ask(prompt)
            question_choices = json.loads(current_messages[-1]["content"])
            question_choices["question_id"] = question_id
            choices.append(question_choices)
            # Remember the id so a duplicate question is not sent twice.
            answered_ids.add(question_id)
            # Save after every question so an interrupted run keeps its work.
            data_file.write_text(json.dumps(data, indent=2))


def transform_questions(topics):
    """Normalise each topic's questions into dicts and rewrite its data file.

    String questions of the form ``"<difficulty>: <question>"`` become
    ``{"id", "difficulty", "question"}`` dicts with a fresh UUID. Dict
    questions are kept and only have missing keys filled in. The file is
    rewritten with ``topic`` and ``questions`` first; any other top-level
    keys already in the file (such as ``choices``) are preserved.

    Args:
        topics: Iterable of topic names whose data files already exist.

    Raises:
        ValueError: If a string question has no integer difficulty prefix.
        TypeError: If a question is neither a string nor a dict.
    """
    for current_topic in topics:
        current_key = "_".join(word.lower() for word in current_topic.split())
        data_file = Path(f"./data/{current_key}_data.json")
        data = json.loads(data_file.read_text())

        transformed_questions = []
        for question_raw in data["questions"]:
            if isinstance(question_raw, str):
                # Split on the first colon rather than fixed offsets so
                # "1:Text" and "1:  Text" are parsed correctly too.
                difficulty, _, question_text = question_raw.partition(":")
                question_dict = {
                    "id": str(uuid.uuid4()),
                    "difficulty": int(difficulty),
                    "question": question_text.strip(),
                }
            elif isinstance(question_raw, dict):
                question_dict = question_raw
                question_dict.setdefault("id", str(uuid.uuid4()))
                question_dict.setdefault("difficulty", None)
                question_dict.setdefault("question", None)
            else:
                # Fail loudly instead of re-appending the previous question.
                raise TypeError(
                    f"Unsupported question type {type(question_raw).__name__} "
                    f"in {data_file}"
                )

            transformed_questions.append(question_dict)

        # Carry over everything else so re-running does not wipe "choices".
        preserved = {
            key: value
            for key, value in data.items()
            if key not in ("topic", "questions")
        }
        data_file.write_text(
            json.dumps(
                {
                    "topic": current_topic,
                    "questions": transformed_questions,
                    **preserved,
                },
                indent=2,
            )
        )


def read_data(glob_pattern):
    """Load every file matching a glob pattern into a dict keyed by path.

    Files ending in ``.json`` are parsed; all others are kept as text. A file
    that cannot be read is reported and left out. A ``.json`` file that
    cannot be parsed is reported and kept as its raw text.

    Args:
        glob_pattern: Pattern passed to ``glob.glob``.

    Returns:
        Dict mapping each matched path to its parsed JSON or raw text.
    """
    loaded = {}
    for matched_path in glob.glob(glob_pattern):
        wants_json = matched_path.endswith(".json")
        try:
            with open(matched_path, "r") as handle:
                raw_text = handle.read()
            # Store the text first so it survives a JSON parse failure below.
            loaded[matched_path] = raw_text

            if wants_json:
                loaded[matched_path] = json.loads(raw_text)

        except Exception as e:
            print(f"Error reading {matched_path}: {e}")

    return loaded


def is_enabled(feature_name):
    """Report whether a feature is listed in ENABLED_FEATURES.

    Prints a notice when the feature is disabled so skipped cells are
    visible in the notebook output.

    Args:
        feature_name: Feature flag to look up, e.g. "QUESTION_GENERATION".

    Returns:
        True if the feature is enabled, otherwise False.
    """
    enabled = feature_name in ENABLED_FEATURES
    if not enabled:
        print(f"feature disabled: {feature_name}")
    return enabled

## Generate Questions


In [5]:
# Runs only when QUESTION_GENERATION is enabled: create missing question
# files, then normalise every topic's questions into dicts.
if is_enabled("QUESTION_GENERATION"):
    generate_questions(TOPICS)
    transform_questions(TOPICS)

feature disabled: QUESTION_GENERATION


## Generate Choices


In [6]:
# Runs only when CHOICE_GENERATION is enabled: add answer choices to every
# question that does not have them yet. Requires the question files to exist.
if is_enabled("CHOICE_GENERATION"):
    generate_choices(TOPICS)

feature disabled: CHOICE_GENERATION


## Load Data


In [None]:
# Load every JSON data file under ./data, keyed by file path.
data = read_data("./data/*.json")

# Print each file's path and contents for a quick visual check.
for file_path, datum in data.items():
    print(f"Loaded file: {file_path}\n{datum}\n")