<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/dev/notebooks/Generate_QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Generate Q&A Dataset

In [22]:
import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
import random
from datetime import datetime, timedelta
from itertools import combinations
import os

# Create userdata folder in Colab environment
# makedirs(exist_ok=True) does nothing when the folder is already there, so the
# cell can be re-run safely without a separate (race-prone) existence check
os.makedirs("userdata", exist_ok=True)

# API setup
# Read the API key stored under 'GOOGLE_API_KEY' via google.colab's userdata
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)
# Shared model handle; the generate_*_answers functions call
# ai_model.generate_content(...) on it
ai_model = genai.GenerativeModel('gemini-1.5-flash')

In [23]:
def generate_random_timestamp():
    """
    Return a random moment from the past 30 days as a formatted string.

    A whole number of seconds between 0 and 30 days is added to the point in
    time 30 days before now; the result is rendered with the pattern
    '%Y%m%d_%H%M%S' (for example '20241224_212503').
    """
    window = timedelta(days=30)
    # Random offset into the 30-day window, in whole seconds (bounds inclusive)
    offset = timedelta(seconds=random.randint(0, int(window.total_seconds())))
    moment = datetime.now() - window + offset
    return f"{moment:%Y%m%d_%H%M%S}"

## Define modules to process different types of data
### Handler method

In [24]:
def process_question(data):
    """
    Generate spoken answers for the passed question in the JSON data.
    A distinction is made between the different types of questions.

    Args:
        data: Question dict. Its 'type' entry selects the handler and the
            whole dict is forwarded to that handler unchanged.

    Returns:
        Whatever the selected handler returns.

    Raises:
        ValueError: If 'type' is missing or no handler is registered for it.
    """
    type_handlers = {
        "SINGLE_SELECT": handle_single_select,
        "MULTI_SELECT": handle_multi_select,
        "TEXT": handle_text,
        "NUMBER": handle_number,
        "DATE": handle_date,
    }

    data_type = data.get('type')
    handler = type_handlers.get(data_type)

    if handler is None:
        # Raise a regular exception instead of calling exit(): exit() is an
        # interactive-shell helper and would terminate the session rather
        # than report the problem to the caller
        raise ValueError(f"Unhandled data type: {data_type}")

    return handler(data)

### Single Select

In [25]:
def handle_single_select(data):
    """
    Generate spoken answers for every option of a single-select question.

    For each entry in data['options'] one API request is made via
    generate_single_answers; the '§'-separated response is split into
    individual answer texts, each mapped to the option it expresses.

    Example output:
    [
      {"Yeah, sure thing, ...": ["Yes"]},
      {"Nope, I'd rather ...": ["No"]},
      ...
    ]

    Args:
        data: Question dict with a 'question' string and an 'options' list
            whose items carry the option text under the key 'option'.

    Returns:
        List of single-entry dicts {answer_text: [option]}.
    """
    answers = []
    for option in data['options']:
        choice = option['option']
        # Use the question that was passed in rather than a notebook-global
        # loop variable, so the function works for any caller
        response_text = generate_single_answers(data['question'], choice)
        fragments = (fragment.strip() for fragment in response_text.split("§"))

        # Empty fragments (e.g. caused by a trailing '§') are not answers
        answers.extend({text: [choice]} for text in fragments if text)

        time.sleep(3)  # Required in the free version to avoid exceeding API limits
    return answers


def generate_single_answers(question, option):
    """
    API call to generate spoken answers for each option.

    Args:
        question: Question text inserted into the prompt.
        option: Content the generated answers have to convey.

    Returns:
        The stripped response text; the prompt asks for five answers
        separated by '§'.

    Raises:
        RuntimeError: If the request or reading the response text fails.
            The original exception is attached as the cause.
    """
    prompt = f"""
    You are the user of an app and you are responding in a spoken style to the following question.
    You like to talk so you don't just say yes or no but rather answer with a whole sentence.
    Question: '{question}'
    Your answer should contain the following content, e.g. if the content is 'yes', you convey this in your response.
    The content must be stated explicitly in your answer.
    Content of the answer: '{option}'
    The responses should be in the following format and be random so each answer has to be different.
    Generate 5 answers that are split by a § sign and contain only text.
    answer1§answer2§...§answer5
    """
    try:
        response = ai_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts a single argument, so passing the message and the
        # exception separately failed with a TypeError and hid the real error
        raise RuntimeError(f"Error during API call: {e}") from e

### Multi Select

In [26]:
def handle_multi_select(data):
    """
    Generates responses for multi-select questions.

    Up to five random non-empty subsets of the options are chosen; for each
    subset one API request is made via generate_multi_answers and the
    '§'-separated response is split into individual answer texts.

    Example output:
    [
        {"Yeah, that would be MY-SYSTEM and Notion, ...": ["MY-SYSTEM", "Notion"]},
        {"Hmm, I think I'm mainly interested in Notion ...": ["Notion"]},
        ...
    ]

    Args:
        data: Question dict with a 'question' string and an 'options' list
            whose items carry the option text under the key 'option'.

    Returns:
        List of single-entry dicts {answer_text: [selected options]}.
    """
    answers = []
    options = [option['option'] for option in data['options']]

    # Generate all possible combinations of options (non-empty subsets)
    all_combinations = []
    for size in range(1, len(options) + 1):
        all_combinations.extend(combinations(options, size))

    # Only generate answers for a random sample of combinations.
    # random.sample already picks uniformly without replacement, so shuffling
    # the list beforehand adds nothing
    selected_combinations = random.sample(all_combinations, min(5, len(all_combinations)))

    for combo in selected_combinations:
        response_text = generate_multi_answers(data['question'], combo)
        fragments = (fragment.strip() for fragment in response_text.split("§"))

        # Empty fragments (e.g. caused by a trailing '§') are not answers;
        # each answer gets its own list of the selected options
        answers.extend({text: list(combo)} for text in fragments if text)

        time.sleep(3)  # Avoid exceeding API limits in the free version
    return answers


def generate_multi_answers(question, options):
    """
    API call to generate spoken answers for multiple options.

    Args:
        question: Question text inserted into the prompt.
        options: Iterable of option strings; they are joined with ', ' and
            all of them have to appear in the generated answers.

    Returns:
        The stripped response text; the prompt asks for five answers
        separated by '§'.

    Raises:
        RuntimeError: If the request or reading the response text fails.
            The original exception is attached as the cause.
    """
    options_text = ", ".join(options)
    prompt = f"""
    You are the user of an app and you are responding in a spoken style to the following question.
    You like to talk so you don't just list options but rather answer with a whole sentence.
    Question: '{question}'
    Your answer has to contain all of the following text elements explicity to be valid: '{options_text}'.
    The responses should be in the following format and random so each answer has to be different.
    Generate 5 answers that are split by a § sign and contain only text.
    answer1§answer2§...§answer5
    """
    try:
        response = ai_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts a single argument, so passing the message and the
        # exception separately failed with a TypeError and hid the real error
        raise RuntimeError(f"Error during API call: {e}") from e

### Text

In [27]:
def handle_text(data):
    """
    Placeholder for free-text questions; not implemented yet.

    The argument is ignored and None is returned.
    """
    # Todo
    return None

### Number

In [28]:
def handle_number(data):
    """
    Placeholder for numeric questions; not implemented yet.

    The argument is ignored and None is returned.
    """
    # Todo
    return None

### Date

In [29]:
def _parse_seconds(raw):
    """
    Convert one calculation reference to an int, or return None if impossible.

    Thousands-grouped notations such as '86.400', '1.814.400' or '86,400'
    are accepted because the prompt itself shows its examples in that form.
    """
    cleaned = raw.strip()
    groups = cleaned.replace(",", ".").split(".")
    is_grouped = (
        len(groups) > 1
        and 1 <= len(groups[0]) <= 3
        and all(len(group) == 3 for group in groups[1:])
    )
    if is_grouped:
        cleaned = "".join(groups)
    try:
        return int(cleaned)
    except ValueError:
        return None


def handle_date(data):
    """
    Generate spoken answers with a relative time reference for a date question.

    One API request is made via generate_date_answers per entry in
    data['options'] (the option content itself is not used). Each response is
    expected to hold five answer texts followed by five offsets in seconds,
    all separated by '§'. Responses with fewer than ten parts are skipped, as
    are single answers whose text is empty or whose offset is not an integer.

    Example output:
    [
        {"Tomorrow would be good ...": [86400]},
        {"How about in three weeks ...": [1814400]},
        ...
    ]

    Args:
        data: Question dict with a 'question' string and an 'options' list.

    Returns:
        List of single-entry dicts {answer_text: [offset_in_seconds]}.
    """
    answers = []
    for _ in data['options']:
        # Use the question that was passed in rather than a notebook-global
        # loop variable, so the function works for any caller
        response_text = generate_date_answers(data['question'])
        parts = [part.strip() for part in response_text.split("§")]

        if len(parts) < 10:
            # Texts and offsets can only be paired reliably in the 5 + 5 layout
            print(f"Skipped malformed date response for question '{data['question']}'.")
        else:
            for text, raw_seconds in zip(parts[:5], parts[5:10]):
                seconds = _parse_seconds(raw_seconds)
                if not text or seconds is None:
                    print(f"Skipped unusable date answer: '{text}' / '{raw_seconds}'.")
                    continue
                answers.append({text: [seconds]})

        time.sleep(3)  # Required in the free version to avoid exceeding API limits
    return answers


def generate_date_answers(question):
    """
    Generates responses for date questions.

    Args:
        question: Question text inserted into the prompt.

    Returns:
        The stripped response text; the prompt asks for five answers followed
        by five calculation references in seconds, all separated by '§'.

    Raises:
        RuntimeError: If the request or reading the response text fails.
            The original exception is attached as the cause.
    """
    prompt = f"""
    You are the user of an app and you are responding in a spoken style to the following question.
    You like to talk so you don't just say yes or no but rather answer with a whole sentence.
    Question: '{question}'
    Your answer should contain a time reference in the future, such as 'tomorrow', 'in three weeks', etc.
    Additionally you have to give a calculation reference for this in seconds without naming a fixed date, e.g. 'tomorrow'=86.400; 'in three weeks'=1.814.400
    The responses should be in the following format and be random so each answer has to be different.
    Generate 5 answers that are split by a § sign and contain only text.
    After that follow the calculation references seperated by a § sign as Integer values.
    answer1§answer2§...§answer5§calculation1§calculation2§...§calculation5
    """
    try:
        response = ai_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts a single argument, so passing the message and the
        # exception separately failed with a TypeError and hid the real error
        raise RuntimeError(f"Error during API call: {e}") from e

## Run defined modules for each question provided and save to a JSON-file
### Simulate real world questionnaires

In [30]:
# Currently questionnaire 3 & 4 selected
# For each questionnaire: download it, build a pool of generated answers per
# question and write five randomly assembled answer sheets to userdata/
supported_types = ('SINGLE_SELECT', 'MULTI_SELECT', 'DATE')  # Todo: Remove
for questionnaire in [3, 4]:
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'
    # The timeout keeps the cell from hanging forever on a stalled connection
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        # Nothing to process without the file; moving on also avoids working
        # on an undefined `data` or on the previous questionnaire's data
        print(f"Error while retrieving file questionnaire{questionnaire}.json: ", response.status_code)
        continue

    data = response.json()
    print(f"Retrieved file: questionnaire{questionnaire}.json")

    # Generate a set of possible answer texts.
    # One entry per question at the same index as in `data`; questions of an
    # unsupported type get an empty placeholder
    answers_for_questions = list()
    for item in data:
        if item['type'] not in supported_types:
            answers_for_questions.append({})
            continue

        answers_for_questions.append({
            item['id']: process_question(item)
        })
        print(f"Generated answers for question '{item['question']}'.")

    # Generate 5 answer sheets
    for sheet in range(1, 6):
        result = list()
        for idx, item in enumerate(data):
            if item['type'] not in supported_types:
                continue

            # Pick a random answer from the answer pool
            answer_list = answers_for_questions[idx][item['id']]
            if not answer_list:
                # random.choice fails on an empty pool; leave the question out
                continue
            random_answer = random.choice(answer_list)
            answer_key, answer_value = list(random_answer.items())[0]

            result.append({
                "question": item['question'], # question as a String
                "possible_answers": [option['option'] for option in item['options']], # Possible answer Strings
                "answer_text": answer_key, # Answer text of user
                "intended_answer": answer_value # Intended answers to evaluate later
            })

        # Save the sheet to a new JSON file
        output_filename = f"userdata/q{questionnaire}_{generate_random_timestamp()}.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        print(f"Answer sheet {sheet} saved to file: {output_filename}")

Retrieved file: questionnaire3.json
Generated answers for question 'What type of company is it?'.
Generated answers for question 'What is the size of your company?'.
Generated answers for question 'When do you wish to receive a follow-up?'.
Answer sheet 1 saved to file: userdata/q3_20241224_212503.json
Answer sheet 2 saved to file: userdata/q3_20241216_002453.json
Answer sheet 3 saved to file: userdata/q3_20241212_032513.json
Answer sheet 4 saved to file: userdata/q3_20250104_114335.json
Answer sheet 5 saved to file: userdata/q3_20241228_170057.json
Retrieved file: questionnaire4.json
Generated answers for question 'Which language is wanted for communication? '.
Generated answers for question 'What is the type of contact?'.
Generated answers for question 'What is the contact person interested in?'.
Generated answers for question 'When does the contact person wish to receive a follow up?'.
Answer sheet 1 saved to file: userdata/q4_20241226_222337.json
Answer sheet 2 saved to file: userd

### Generate large dataset

In [31]:
# result = list()

# # For each questionnaire (named 1-5)
# for questionnaire in range(1, 6):
#     url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'
#     response = requests.get(url)

#     if response.status_code == 200:
#         data = response.json()
#         print(f"Retrieved file: questionnaire{questionnaire}.json")
#     else:
#         print("Error while parsing a file: ", response.status_code)

#     # Generate dataset with all generated answers
#     for item in data:
#         if item['type'] != 'SINGLE_SELECT' and item['type'] != 'MULTI_SELECT': continue # Todo: Remove

#         answer_list = process_question(item)
#         for answer in answer_list:
#           answer_key, answer_value = list(answer.items())[0]
#           result.append({
#               "question": item['question'], # question as a String
#               "possible_answers": [option['option'] for option in item['options']], # Possible answer Strings
#               "answer_text": answer_key, # Answer text of user
#               "intended_answer": answer_value # Intended answer to evaluate later
#           })
#         print(f"Generated answers for question '{item['question']}'.")

# # Save dataset to a new JSON file
# with open("qa_dataset.json", 'w', encoding='utf-8') as f:
#     json.dump(result, f, ensure_ascii=False, indent=4)
# print("Q&A dataset saved to file: qa_dataset.json")

### Download all created files

In [32]:
# from google.colab import files

# # Download files
# !zip userdata.zip userdata/*
# files.download('userdata.zip')
# files.download('qa_dataset.json')

In [33]:
# Todo: Generate more data based on this dataset
# Should we convert the dataset into a pandas DataFrame instead of a dict?