<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/main/notebooks/Generate_QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Generate Q&A Dataset

In [1]:
import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
import random
from datetime import datetime, timedelta

# Create userdata folder in Colab environment
!mkdir userdata

# API setup
key = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=key)
ai_model = genai.GenerativeModel('gemini-1.5-flash')

## Define modules to process different types of data

In [2]:
def process_question(data):
    """
    Generate spoken answers for the passed question in the JSON data.
    A distinction is made between the different types of questions.

    Args:
        data: Question dict; its 'type' entry selects the handler.

    Returns:
        Whatever the handler registered for the question type returns.

    Raises:
        ValueError: If 'type' is missing or no handler is registered for it.
    """
    type_handlers = {
        "SINGLE_SELECT": handle_single_select,
        "MULTI_SELECT": handle_multi_select,
        "TEXT": handle_text,
        "DATE": handle_date,
        "NUMBER": handle_number,
    }

    data_type = data.get('type')
    handler = type_handlers.get(data_type)

    if handler is None:
        # Raise a regular exception instead of calling exit(), so the caller
        # can catch or report the problem.
        raise ValueError(f"Unhandled data type: {data_type}")

    return handler(data)

In [3]:
def handle_single_select(data, delay_seconds=3):
    """
    Generate spoken answers for every option of a single-select question.

    Args:
        data: Question dict with a 'question' text and an 'options' list;
            each option provides its 'option' text and its 'id'.
        delay_seconds: Pause after each API call. Required in the free
            version to avoid exceeding API limits; pass 0 to disable.

    Returns:
        A list of single-entry dicts, each mapping one generated answer text
        to the id of the option it was generated for.

    Example output:
    [{"Yes, I am ...": "ee0437c0-6335-4b88-8bc5-d4eb8e2c68bf"},
    {"Yes, blabla ...": "ee0437c0-6335-4b88-8bc5-d4eb8e2c68bf"},
    ...,
    {"No, I don't ...": "d357ab84-929f-440a-b9ad-42ff36402a53"},
    {"Not agreed ...": "d357ab84-929f-440a-b9ad-42ff36402a53"},
    ...]
    """
    answers = list()
    for option in data['options']:
        # Read the question from the passed dict, not from a variable that
        # only exists in the calling cell.
        response_text = generate_single_answers(data['question'], option['option'])

        for fragment in response_text.split("§"):
            text = fragment.strip()
            if text:  # skip empty fragments, e.g. produced by a trailing "§"
                answers.append({text: option['id']})

        if delay_seconds:
            time.sleep(delay_seconds)
    return answers

def handle_multi_select(data):
    """
    Placeholder for MULTI_SELECT questions.

    Todo: not implemented yet; `data` is ignored and None is returned.
    """
    return None

def handle_text(data):
    """
    Placeholder for TEXT questions.

    Todo: not implemented yet; `data` is ignored and None is returned.
    """
    return None

def handle_date(data):
    """
    Placeholder for DATE questions.

    Todo: not implemented yet; `data` is ignored and None is returned.
    """
    return None

def handle_number(data):
    """
    Placeholder for NUMBER questions.

    Todo: not implemented yet; `data` is ignored and None is returned.
    """
    return None

## Define module to generate text via API including AI prompts

In [4]:
def generate_single_answers(question, option):
    """
    API call to generate spoken answers for each option.

    Args:
        question: Question text that is inserted into the prompt.
        option: Content the generated answers are asked to convey.

    Returns:
        The response text with surrounding whitespace stripped. The prompt
        requests 5 answers separated by a § sign.

    Raises:
        RuntimeError: If the API call or reading the response text fails;
            the original exception is chained as the cause.
    """
    prompt = f"""
    You are the user of an app and you are responding in a spoken style to the following question.
    You like to talk so you don't just say yes or no but rather answer with a whole sentence.
    Question: "{question}"
    Your answer should contain following content (e.g. if the content of the answer is yes, you convey this in your response):
    Content of the answer: "{option}"
    The responses should be in the following format and be kind of random so that each answer is in a different style.
    Generate 5 answers that are split by a § sign and contain only text.
    answer1§answer2§...§answer5
    """
    try:
        response = ai_model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() takes a single argument, so calling it with the message and
        # the exception would fail with a TypeError and hide the real error.
        # Raise a regular exception with the cause chained instead.
        raise RuntimeError(f"Error during API call: {e}") from e

## Run defined modules for each question provided and save to a JSON-file

In [5]:
def generate_random_timestamp():
    """
    Return a random moment from the past 30 days as a '%Y%m%d_%H%M%S' string.
    """
    window = timedelta(days=30)
    earliest = datetime.now() - window
    # Whole-second offset anywhere between the start of the window and now
    offset = timedelta(seconds=random.randint(0, 30 * 24 * 60 * 60))
    moment = earliest + offset
    return moment.strftime('%Y%m%d_%H%M%S')

### Close to real-world application

In [6]:
# For each questionnaire (named 1-5): download it, generate a pool of spoken
# answers per question, then write 5 answer sheets drawn from those pools.
for questionnaire in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'
    response = requests.get(url)

    if response.status_code != 200:
        # Skip this questionnaire. Otherwise `data` would be undefined on the
        # first pass or still hold the previous questionnaire.
        print("Error while parsing a file: ", response.status_code)
        continue

    data = response.json()
    print(f"Retrieved file: questionnaire{questionnaire}.json")

    # Generate a pool of possible answers (including the option id) per question id
    answers_for_questions = dict()
    for item in data:
        if item['type'] != 'SINGLE_SELECT': continue # Todo: Remove
        answers_for_questions[item['id']] = process_question(item)
        print(f"Generated answers for question '{item['question']}'.")

    # Generate 5 answer sheets
    for sheet in range(1, 6):
        result = list()
        for item in data:
            if item['type'] != 'SINGLE_SELECT': continue # Todo: Remove

            answer_pool = answers_for_questions[item['id']]
            if not answer_pool:
                # Fewer answers were generated than sheets requested
                print(f"No unused answers left for question '{item['question']}'.")
                continue

            # Pick a random answer and remove it from the answer pool,
            # so that no two sheets reuse the same generated answer
            random_answer = answer_pool.pop(random.randrange(len(answer_pool)))
            answer_key, answer_value = next(iter(random_answer.items()))

            result.append({
                "qid": item['id'], # ID of question
                "question": item['question'], # question as text
                "answer": answer_key, # answer text of user
                "check_aid": answer_value # ID of the intended answer to evaluate its correctness later
            })

        # Save to a new JSON file
        output_filename = f"userdata/q{questionnaire}_{generate_random_timestamp()}.json"
        with open(output_filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, ensure_ascii=False, indent=4)
        print(f"Answer sheet {sheet} saved to file: {output_filename}")

Retrieved file: questionnaire1.json
Generated answers for question 'Data processing consent'.
Generated answers for question 'Customer group'.
Answer sheet 1 saved to file: userdata/q1_20241217_162024.json
Answer sheet 2 saved to file: userdata/q1_20241205_151048.json
Answer sheet 3 saved to file: userdata/q1_20241121_091448.json
Answer sheet 4 saved to file: userdata/q1_20241214_043350.json
Answer sheet 5 saved to file: userdata/q1_20241213_040341.json
Retrieved file: questionnaire2.json
Generated answers for question 'Would you like to receive marketing information from via e-mail?'.
Generated answers for question 'What industry are you operating in?'.
Answer sheet 1 saved to file: userdata/q2_20241218_031258.json
Answer sheet 2 saved to file: userdata/q2_20241217_164605.json
Answer sheet 3 saved to file: userdata/q2_20241126_113815.json
Answer sheet 4 saved to file: userdata/q2_20241218_182606.json
Answer sheet 5 saved to file: userdata/q2_20241219_231508.json
Retrieved file: questi

### Large dataset: one step further (keep every generated answer)

In [7]:
# Collects one record per generated answer across all questionnaires
result = list()

# For each questionnaire (named 1-5)
for questionnaire in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'
    response = requests.get(url)

    if response.status_code != 200:
        # Skip this questionnaire. Otherwise `data` would be undefined on the
        # first pass or the previous questionnaire would be processed again.
        print("Error while parsing a file: ", response.status_code)
        continue

    data = response.json()
    print(f"Retrieved file: questionnaire{questionnaire}.json")

    # Generate dataset with all generated answers
    for item in data:
        if item['type'] != 'SINGLE_SELECT': continue # Todo: Remove
        answer_list = process_question(item)
        for answer in answer_list:
            # Each answer is a single-entry dict: {answer text: option id}
            answer_key, answer_value = next(iter(answer.items()))
            result.append({
                "qid": item['id'], # ID of question
                "question": item['question'], # question as text
                "answer": answer_key, # answer text of user
                "check_aid": answer_value # ID of the intended answer to evaluate its correctness later
            })
        print(f"Generated answers for question '{item['question']}'.")

# Save to a new JSON file
with open("qa_dataset.json", 'w', encoding='utf-8') as f:
    json.dump(result, f, ensure_ascii=False, indent=4)
print("Q&A dataset saved to file: qa_dataset.json")

Retrieved file: questionnaire1.json
Generated answers for question 'Data processing consent'.
Generated answers for question 'Customer group'.
Retrieved file: questionnaire2.json
Generated answers for question 'Would you like to receive marketing information from via e-mail?'.
Generated answers for question 'What industry are you operating in?'.
Retrieved file: questionnaire3.json
Generated answers for question 'What type of company is it?'.
Generated answers for question 'What is the size of your company?'.
Retrieved file: questionnaire4.json
Generated answers for question 'Which language is wanted for communication? '.
Retrieved file: questionnaire5.json
Generated answers for question 'Customer type'.
Generated answers for question 'Customer satisfaction'.
Generated answers for question 'Size of the trade fair team (on average)'.
Generated answers for question 'CRM-System'.
Generated answers for question 'Next steps'.
Q&A dataset saved to file: qa_dataset.json


In [8]:
# Todo: Generate more data based on this dataset