<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/philipp/notebooks/Generate_QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Generate Q&A Dataset

### Imports and Setup

In [1]:
import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
import random
from datetime import datetime, timedelta
from itertools import combinations
import os
import ast
import pandas as pd

# Gemini API setup: read the key through Colab's `userdata`, configure the
# client with it, and create the shared model object used by the generators.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

MODEL_NAME = 'gemini-2.0-flash-exp'
model = genai.GenerativeModel(MODEL_NAME)


def pause(seconds=5):
    """
    Required pause in the free version to avoid exceeding API limits.

    Args:
        seconds: How long to block, in seconds. Defaults to 5, the delay
            used when the function is called without arguments.
    """
    time.sleep(seconds)

## Handle different types of data

In [2]:
def process_question(data):
    """
    Generate spoken answers for the passed question.

    A distinction is made between the different types of questions: the
    value stored under 'type' selects the handler that produces the answers.

    Args:
        data: Question record supporting ``.get('type')`` and item access
            for the keys the selected handler reads (e.g. 'question').

    Returns:
        Whatever the selected handler returns for this question.

    Raises:
        ValueError: If the question type has no registered handler.
    """
    type_handlers = {
        "SINGLE_SELECT": handle_single_select,
        "MULTI_SELECT": handle_multi_select,
        "TEXT": handle_text,
        "NUMBER": handle_number,
        "DATE": handle_date,
    }

    data_type = data.get('type')
    handler = type_handlers.get(data_type)

    # Fail fast, before waiting: an unknown type can never be processed.
    # A real exception is raised instead of calling the interactive `exit`
    # helper, which is not meant to be used from program code.
    if handler is None:
        raise ValueError(f"Unhandled data type: {data_type}")

    # Throttle between questions so consecutive API calls stay spaced out.
    time.sleep(10)

    return handler(data)

### Single-Select

In [3]:
def handle_single_select(data):
    """
    Generate spoken answers for every option of a single-select question.

    Example output:
    intended_answer: ['Yes', 'Yes', ..., 'No', 'No', ...]
    context: ['Yeah, sure thing, ...', 'Nope, I'd rather ...', ...]

    Args:
        data: Question record providing 'question' and 'options'.

    Returns:
        Tuple ``(intended_answer, context)`` of two equally long lists:
        the option each answer expresses and the generated answer text.
    """
    intended_answer = []
    context = []

    for option in data['options']:
        response_text = generate_single_answers(data['question'], option)
        # Answers are separated by "§"; empty fragments (e.g. caused by a
        # trailing separator) carry no answer and are dropped.
        answers = [part.strip() for part in response_text.split("§") if part.strip()]

        # Label exactly as many entries as answers came back, so both lists
        # stay aligned even if the model did not return exactly 5 answers.
        intended_answer.extend([option] * len(answers))
        context.extend(answers)

        pause()

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_single_answers(question, option):
    """
    API call to generate spoken-style answers that express one given option.

    Args:
        question: The question text shown to the user.
        option: The answer option every generated response must convey.

    Returns:
        The stripped response text; the prompt asks for 5 answers joined
        by "§".

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must explicitly convey the provided content: '{option}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

### Multi-Select

In [4]:
def handle_multi_select(data):
    """
    Generate spoken answers for random option subsets of a multi-select question.

    Example output:
    intended_answer: [("MY-SYSTEM", "Notion"), ("Notion",), ...]
    context: ['Yeah, that would be MY-SYSTEM and Notion, ...', 'Hmm, I think I'm mainly interested in Notion ...', ...]

    Args:
        data: Question record providing 'question' and 'options'.

    Returns:
        Tuple ``(intended_answer, context)`` of two equally long lists:
        the option combination (a tuple) each answer expresses and the
        generated answer text.
    """
    intended_answer = []
    context = []

    options = data['options']

    # Every non-empty subset of the options.
    # NOTE: the number of subsets doubles with each additional option.
    all_combinations = [
        combo
        for size in range(1, len(options) + 1)
        for combo in combinations(options, size)
    ]

    # Only generate answers for a random sample of combinations;
    # random.sample already picks them in random order.
    selected_combinations = random.sample(all_combinations, min(5, len(all_combinations)))

    for combo in selected_combinations:
        response_text = generate_multi_answers(data['question'], combo)
        # Answers are separated by "§"; empty fragments (e.g. caused by a
        # trailing separator) carry no answer and are dropped.
        answers = [part.strip() for part in response_text.split("§") if part.strip()]

        # Label exactly as many entries as answers came back, so both lists
        # stay aligned even if the model did not return exactly 5 answers.
        intended_answer.extend([combo] * len(answers))
        context.extend(answers)

        pause()

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_multi_answers(question, options):
    """
    API call to generate spoken answers for multiple options.

    Args:
        question: The question text shown to the user.
        options: Iterable of option strings that every generated response
            must mention.

    Returns:
        The stripped response text; the prompt asks for 5 answers joined
        by "§".

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    options_text = ", ".join(options)
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must contain all of the following text elements explicitly to be valid: '{options_text}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

### Text

In [5]:
def handle_text(data):
    """
    Generate spoken answers for a free-text question.

    Example output:
    intended_answer: [None, None, ...]
    context: ['You can only reach me on Tuesdays or Wednesdays.', 'I have no notes to add.', ...]

    Args:
        data: Question record providing 'question'.

    Returns:
        Tuple ``(intended_answer, context)`` of two equally long lists;
        free-text questions have no predefined answer, so every intended
        answer is None.
    """
    response_text = generate_text_answers(data['question'])
    # Answers are separated by "§"; empty fragments (e.g. caused by a
    # trailing separator) carry no answer and are dropped.
    context = [part.strip() for part in response_text.split("§") if part.strip()]

    # One placeholder per answer actually returned, so both lists stay
    # aligned even if the model did not return exactly 5 answers.
    intended_answer = [None] * len(context)

    pause()

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_text_answers(question):
    """
    API call to generate spoken-style answers for text questions.

    Args:
        question: The question text shown to the user.

    Returns:
        The stripped response text; the prompt asks for 5 answers joined
        by "§".

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

### Number

In [6]:
def generate_random_phone_number():
    """
    Build a random phone number string of the form
    '<country code>-<3 digits>-<3 digits>-<4 digits>', e.g. '+49-123-456-7890'.
    """
    prefix = random.choice(["+1", "+44", "+49", "+33", "+91"])
    area = random.randint(100, 999)
    exchange = random.randint(100, 999)
    subscriber = random.randint(1000, 9999)
    return "-".join([prefix, str(area), str(exchange), str(subscriber)])


def handle_number(data):
    """
    Generate spoken answers that each contain a randomly generated phone number.

    Example output:
    intended_answer: ['+1-555-123-4567', '+44-7700-900123', ...]
    context: ['Sure, you can reach me at +1-555-123-4567.', 'My number is +44-7700-900123.', ...]

    Args:
        data: Question record providing 'question'.

    Returns:
        Tuple ``(intended_answer, context)`` of two equally long lists:
        the phone number each answer contains and the generated answer text.
    """
    intended_answer = []
    context = []

    phone_numbers = [generate_random_phone_number() for _ in range(5)]

    for phone_number in phone_numbers:
        response_text = generate_number_answers(data['question'], phone_number)
        # Answers are separated by "§"; empty fragments (e.g. caused by a
        # trailing separator) carry no answer and are dropped.
        answers = [part.strip() for part in response_text.split("§") if part.strip()]

        # Label exactly as many entries as answers came back, so both lists
        # stay aligned even if the model did not return exactly 5 answers.
        intended_answer.extend([phone_number] * len(answers))
        context.extend(answers)

        pause()

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_number_answers(question, option):
    """
    API call to generate spoken-style answers that contain a phone number.

    Args:
        question: The question text shown to the user.
        option: The phone number every generated response must contain.

    Returns:
        The stripped response text; the prompt asks for 5 answers joined
        by "§".

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must contain the following phone number explicitly to be valid: '{option}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

### Date

In [7]:
def handle_date(data):
    """
    Generate spoken answers with a relative time reference for a date question.

    Example output:
    intended_answer: ['86400', '1814400', ...]
    context: ['Tomorrow would be good ...', 'How about in three weeks ...', ...]

    The model is asked for the answers followed by equally many offsets in
    seconds, all joined by "§". The offsets are returned as the strings
    the model produced; they are not converted to integers.

    Args:
        data: Question record providing 'question'.

    Returns:
        Tuple ``(intended_answer, context)`` of two equally long lists:
        the offset belonging to each answer and the generated answer text.

    Raises:
        ValueError: If the response cannot be split into equally many
            answers and offsets.
    """
    response_text = generate_date_answers(data['question'])
    # The model sometimes wraps the whole response in quotes.
    response_text = response_text.strip('"').strip("'")
    fragments = [part.strip() for part in response_text.split("§")]
    # Empty fragments (e.g. from a trailing separator) carry no content.
    fragments = [part for part in fragments if part]

    # The first half are the answers, the second half their offsets. Split
    # by the actual count instead of a fixed 5 so both lists always match.
    if not fragments or len(fragments) % 2:
        raise ValueError(
            f"Expected equally many answers and calculations, got "
            f"{len(fragments)} fragments for question: '{data['question']}'"
        )
    half = len(fragments) // 2
    context = fragments[:half]
    intended_answer = fragments[half:]

    pause()

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_date_answers(question):
    """
    Generates responses for date questions.

    Args:
        question: The question text shown to the user.

    Returns:
        The stripped response text; the prompt asks for 5 answers followed
        by 5 offsets in seconds, all joined by "§".

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your answer must contain a time reference in the future, such as 'tomorrow', 'in three weeks', etc.
        Additionally you have to give a calculation reference as an Integer value for this
        in seconds without naming a fixed date, e.g. 'tomorrow'=86400; 'in three weeks'=1814400
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5§calculation1§calculation2§...§calculation5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

## Import data from the provided questionnaires

In [8]:
# Load the five provided questionnaires and stack them into one DataFrame.
dfs = []

for number in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{number}.json'
    frame = pd.read_json(url)

    # Each option is a record; keep only the value stored under 'option'.
    frame['options'] = frame['options'].apply(
        lambda entries: [entry['option'] for entry in entries]
    )

    # The 'id' column is not needed (and may be absent).
    frame = frame.drop(columns=['id'], errors='ignore')

    # These question types have no predefined options.
    has_no_options = frame['type'].isin(['TEXT', 'NUMBER', 'DATE'])
    frame.loc[has_no_options, 'options'] = None

    dfs.append(frame)

df = pd.concat(dfs, ignore_index=True)

df.head()

Unnamed: 0,type,question,options
0,SINGLE_SELECT,Data processing consent,"[Yes, No]"
1,SINGLE_SELECT,Customer group,"[End User, Wholesaler, Distributor, Consultant..."
2,MULTI_SELECT,Products interested in,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A..."
3,MULTI_SELECT,What kind of follow up is planned,"[Email, Phone, Schedule a Visit, No action]"
4,MULTI_SELECT,Who to copy in follow up,"[Stephan Maier, Joachim Wagner, Erik Schneider..."


## Generate more questions

### Helper functions for each type of question

In [9]:
def generate_select_question():
    """
    API call to generate a new select question together with its options.

    Returns:
        The stripped response text; the prompt asks for the format
        ``[question, [option1, option2, ..., optionN]]``.

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = """
        Generate a question that could be asked to an app user in a business context, designed as
        either a single-choice or multiple-choice question. Provide the question
        and an array of answer options in the following format:
        [question, [option1, option2, ..., optionN]]
        Respond strictly in this format without additional explanations, comments, or text.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e


def generate_text_question():
    """
    API call to generate a new open text entry question.

    Returns:
        Tuple ``(question, None)``: the stripped response text and None in
        place of answer options.

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = """
        Generate a question that could be asked to an app user in a business context, designed as
        an open text entry question. Return the generated question without
        additional explanations, comments, or text.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip(), None
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e


def generate_date_question():
    """
    API call to generate a new question asking for a date in the future.

    Returns:
        Tuple ``(question, None)``: the stripped response text and None in
        place of answer options.

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = """
        Generate a question asking an app user in a business context to provide a date in the future.
        Respond with only the question text, without any additional
        explanations, comments, or extraneous content.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip(), None
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e


def generate_number_question():
    """
    API call to generate a new question asking for a phone number.

    Returns:
        Tuple ``(question, None)``: the stripped response text and None in
        place of answer options.

    Raises:
        RuntimeError: If the API call fails or the response has no text.
    """
    prompt = """
        Generate a question asking an app user in a business context to provide a phone number.
        Respond with only the question text, without any additional
        explanations, comments, or extraneous content.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip(), None
    except Exception as e:
        # `exit` accepts at most one argument, so the former two-argument
        # call raised a TypeError and hid the real error; chain it instead.
        raise RuntimeError(f"Error during API call: {e}") from e

In [10]:
# Ask the model for additional questions of every supported type.
ext_data = []
n_questions_per_type = 10

# Maps each question type to a callable returning (question, options).
# Both select types share one generator whose text response is parsed
# into [question, [options...]].
question_factories = {
    "SINGLE_SELECT": lambda: ast.literal_eval(generate_select_question()),
    "MULTI_SELECT": lambda: ast.literal_eval(generate_select_question()),
    "TEXT": generate_text_question,
    "NUMBER": generate_number_question,
    "DATE": generate_date_question,
}

for question_type, make_question in question_factories.items():
    for _ in range(n_questions_per_type):
        question, options = make_question()
        ext_data.append(dict(type=question_type, question=question, options=options))

        pause()

    # Longer break after each batch of questions.
    time.sleep(30)
    print(f"Generated {n_questions_per_type} questions of type: {question_type}")

ext_df = pd.DataFrame(ext_data)

ext_df.head()

Generated 10 questions of type: SINGLE_SELECT


ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 759.19ms


Generated 10 questions of type: MULTI_SELECT
Generated 10 questions of type: TEXT
Generated 10 questions of type: NUMBER
Generated 10 questions of type: DATE


Unnamed: 0,type,question,options
0,SINGLE_SELECT,What is your primary role in your organization?,"[Executive, Manager, Team Lead, Individual Con..."
1,SINGLE_SELECT,Which of the following best describes your pri...,"[Schedule a meeting, Track project progress, C..."
2,SINGLE_SELECT,Which of these best describes your primary goa...,"[Track project progress, Manage my team's task..."
3,SINGLE_SELECT,Which of the following best describes your pri...,"[Generate a report, Update customer informatio..."
4,SINGLE_SELECT,What is your primary goal for using this proje...,"[Task Management, Collaboration, Resource Allo..."


## Create user inputs (context) for each question

In [11]:
def generate_random_timestamp():
    """
    Return a random datetime between 30 days ago and now (second resolution
    offset from the window start).
    """
    window = timedelta(days=30)
    window_start = datetime.now() - window
    offset = timedelta(seconds=random.randint(0, int(window.total_seconds())))
    return window_start + offset

In [12]:
# Combine the questionnaire questions with the generated ones.
df = pd.concat([df, ext_df], ignore_index=True)

new_cols = ['intended_answer', 'context']

# process_question yields an (intended_answer, context) pair of lists per
# question; wrapping it in a Series spreads the pair over the two new columns.
df[new_cols] = df.apply(
    lambda row: pd.Series(process_question(row)), axis=1
)

# One random timestamp per question. It is assigned before the explode
# below, so all answers generated for one question share that timestamp.
df['timestamp'] = [generate_random_timestamp() for _ in range(len(df))]

# Expand the per-question lists into one row per (intended_answer, context)
# pair; both lists of a row must have the same length for this to work.
df = df.explode(new_cols, ignore_index=True)

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 532.40ms


Generated context for question: 'Data processing consent'
Generated context for question: 'Customer group'
Generated context for question: 'Products interested in'
Generated context for question: 'What kind of follow up is planned'
Generated context for question: 'Who to copy in follow up'
Generated context for question: 'Would you like to receive marketing information from via e-mail?'
Generated context for question: 'What industry are you operating in?'
Generated context for question: 'What products are you interested in?'
Generated context for question: 'Notes'
Generated context for question: 'What type of company is it?'
Generated context for question: 'What is the size of your company?'
Generated context for question: 'When do you wish to receive a follow-up?'
Generated context for question: 'Any additional notes?'
Generated context for question: 'Which language is wanted for communication? '
Generated context for question: 'What is the type of contact?'
Generated context for ques

ERROR:tornado.access:503 POST /v1beta/models/gemini-2.0-flash-exp:generateContent?%24alt=json%3Benum-encoding%3Dint (127.0.0.1) 633.04ms


Generated context for question: 'What is your primary goal for using this app today?'
Generated context for question: 'What is your primary goal for using this app today?'
Generated context for question: 'Which of these best describes your primary goal for using this app today?'
Generated context for question: 'Which of the following best describes your primary reason for using our project management app today?'
Generated context for question: 'What is the primary purpose of your visit today?'
Generated context for question: 'What is your primary goal for using this project management app?'
Generated context for question: 'Which of the following best describes your primary use of this app for your team?'
Generated context for question: 'Which department do you primarily work in?'
Generated context for question: 'What is the primary reason you are using our project management app today?'
Generated context for question: 'What is your primary goal for using this app today?'
Generated cont

In [13]:
# Write the finished dataset to disk as a JSON array with one object per row.
output_file = 'qa_dataset.json'
df.to_json(output_file, orient='records', indent=4)