<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/main/notebooks/Generate_QA_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Task 1: Generate Q&A Dataset

In [32]:
import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests
import random
from datetime import datetime, timedelta
from itertools import combinations
import os
import pandas as pd

# Gemini API Setup
# The key is looked up through Colab's `userdata` helper instead of being
# hard-coded in the notebook.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
# Shared model handle; the generate_*_answers functions call
# model.generate_content(...) on it.
model = genai.GenerativeModel('gemini-1.5-flash')

# Huggingface Token
# NOTE(review): HF_TOKEN is not referenced in the visible part of this
# notebook - confirm whether it is still needed.
HF_TOKEN = userdata.get('HF_API_KEY')

In [33]:
def generate_random_timestamp():
    """
    Return a randomly chosen moment from the past 30 days as a string.

    The offset into the 30-day window is drawn with whole-second resolution
    and the result is formatted as '%Y%m%d_%H%M%S'.
    """
    window_start = datetime.now() - timedelta(days=30)
    offset = timedelta(seconds=random.randint(0, 30 * 24 * 60 * 60))
    return (window_start + offset).strftime('%Y%m%d_%H%M%S')

## Define modules to process different types of data
### Handler method

In [34]:
def process_question(data):
    """
    Generate spoken answers for the passed question.
    A distinction is made between the different types of questions.

    Args:
        data: Question record (dict-like, e.g. a DataFrame row) whose 'type'
            entry selects the handler; the record is passed on unchanged.

    Returns:
        Whatever the selected handler returns: a tuple
        (intended_answer, context) of two lists.

    Raises:
        ValueError: If 'type' is missing or not one of the supported types.
    """
    type_handlers = {
        "SINGLE_SELECT": handle_single_select,
        "MULTI_SELECT": handle_multi_select,
        "TEXT": handle_text,
        "NUMBER": handle_number,
        "DATE": handle_date,
    }

    data_type = data.get('type')
    handler = type_handlers.get(data_type)

    if handler is None:
        # Raise instead of calling exit(): exit() is a helper for the
        # interactive interpreter and is not a reliable way to abort
        # processing from inside a notebook cell.
        raise ValueError(f"Unhandled data type: {data_type}")

    return handler(data)

### Single Select

In [35]:
def handle_single_select(data):
    """
    Generate spoken-style answers for every option of a single-select question.

    Args:
        data: Question record (dict-like) providing 'question' and 'options'.

    Returns:
        Tuple (intended_answer, context) of two equally long lists:
        context[i] is a generated answer and intended_answer[i] is the
        option that answer was generated for.

    Example output:
    intended_answer: ['Yes', 'Yes', ..., 'No', 'No', ...]
    context: ['Yeah, sure thing, ...', 'Nope, I'd rather ...', ...]
    """
    intended_answer = []
    context = []

    for option in data['options']:
        response_text = generate_single_answers(data['question'], option)
        # Split on the separator and drop empty fragments (e.g. caused by a
        # trailing '§' in the reply)
        texts_array = [
            answer.strip()
            for answer in response_text.split("§")
            if answer.strip()
        ]

        # Label exactly as many answers as were returned; the reply is not
        # guaranteed to contain 5 segments, and both lists must stay the
        # same length for the later multi-column explode.
        intended_answer.extend([option] * len(texts_array))
        context.extend(texts_array)

        time.sleep(3) # Required in the free version to avoid exceeding API limits

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_single_answers(question, option):
    """
    API call to generate spoken-style answers for a single-select option.

    Args:
        question: Question text shown to the user.
        option: The option the generated answers must convey.

    Returns:
        The stripped model reply; the prompt asks for 5 answers joined by '§'.

    Raises:
        RuntimeError: If the API call or reading the reply text fails.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must explicitly convey the provided content: '{option}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts at most one argument, so the former two-argument
        # call raised a TypeError that hid the real error. Raise a chained
        # exception that carries the message and the original cause.
        raise RuntimeError(f"Error during API call: {e}") from e

### Multi Select

In [36]:
def handle_multi_select(data):
    """
    Generate spoken-style answers for random option subsets of a multi-select question.

    Up to 5 distinct non-empty subsets of the options are sampled, and
    answers are generated for each of them.

    Args:
        data: Question record (dict-like) providing 'question' and 'options'.

    Returns:
        Tuple (intended_answer, context) of two equally long lists:
        context[i] is a generated answer and intended_answer[i] is the
        tuple of options that answer was generated for.

    Example output:
    intended_answer: [("MY-SYSTEM", "Notion"), ("Notion",), ...]
    context: ['Yeah, that would be MY-SYSTEM and Notion, ...', 'Hmm, I think I'm mainly interested in Notion ...', ...]
    """
    intended_answer = []
    context = []

    # Generate all possible combinations of options (non-empty subsets)
    options = data['options']
    all_combinations = []
    for r in range(1, len(options) + 1):
        all_combinations.extend(combinations(options, r))

    # Only generate answers for a random sample of combinations.
    # random.sample already picks in random order, so no shuffle is needed.
    selected_combinations = random.sample(all_combinations, min(5, len(all_combinations)))

    for combo in selected_combinations:
        response_text = generate_multi_answers(data['question'], combo)
        # Split on the separator and drop empty fragments (e.g. caused by a
        # trailing '§' in the reply)
        texts_array = [
            answer.strip()
            for answer in response_text.split("§")
            if answer.strip()
        ]

        # Label exactly as many answers as were returned; the reply is not
        # guaranteed to contain 5 segments, and both lists must stay the
        # same length for the later multi-column explode.
        intended_answer.extend([combo] * len(texts_array))
        context.extend(texts_array)

        time.sleep(3) # Avoid exceeding API limits in the free version

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_multi_answers(question, options):
    """
    API call to generate spoken answers for multiple options.

    Args:
        question: Question text shown to the user.
        options: Iterable of option strings that every answer must mention.

    Returns:
        The stripped model reply; the prompt asks for 5 answers joined by '§'.

    Raises:
        RuntimeError: If the API call or reading the reply text fails.
    """
    options_text = ", ".join(options)
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must contain all of the following text elements explicitly to be valid: '{options_text}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts at most one argument, so the former two-argument
        # call raised a TypeError that hid the real error. Raise a chained
        # exception that carries the message and the original cause.
        raise RuntimeError(f"Error during API call: {e}") from e

### Text

In [37]:
def handle_text(data):
    """
    Generate spoken-style free-text answers for a text question.

    Args:
        data: Question record (dict-like) providing 'question'.

    Returns:
        Tuple (intended_answer, context) of two equally long lists. Free-text
        questions have no predefined answer, so intended_answer holds one
        None per generated answer in context.

    Example output:
    intended_answer: [None, None, ...]
    context: ['You can only reach me on Tuesdays or Wednesdays.', 'I have no notes to add.', ...]
    """
    response_text = generate_text_answers(data['question'])
    # Split on the separator and drop empty fragments (e.g. caused by a
    # trailing '§' in the reply)
    context = [
        answer.strip()
        for answer in response_text.split("§")
        if answer.strip()
    ]

    # One placeholder per answer actually returned; the reply is not
    # guaranteed to contain 5 segments, and both lists must stay the same
    # length for the later multi-column explode.
    intended_answer = [None] * len(context)

    time.sleep(3) # Avoid exceeding API limits in the free version

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_text_answers(question):
    """
    API call to generate spoken-style answers for text questions.

    Args:
        question: Question text shown to the user.

    Returns:
        The stripped model reply; the prompt asks for 5 answers joined by '§'.

    Raises:
        RuntimeError: If the API call or reading the reply text fails.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts at most one argument, so the former two-argument
        # call raised a TypeError that hid the real error. Raise a chained
        # exception that carries the message and the original cause.
        raise RuntimeError(f"Error during API call: {e}") from e

### Number

In [38]:
def generate_random_phone_number():
    """
    Build a random phone number string: '<country code>-<3 digits>-<3 digits>-<4 digits>'.

    The country code is picked from a fixed list of five prefixes.
    """
    prefix = random.choice(["+1", "+44", "+49", "+33", "+91"])
    area = random.randint(100, 999)
    exchange = random.randint(100, 999)
    subscriber = random.randint(1000, 9999)
    return "-".join([prefix, str(area), str(exchange), str(subscriber)])


def handle_number(data):
    """
    Generate spoken-style answers that each mention a random phone number.

    Five phone numbers are created with generate_random_phone_number(), and
    answers are generated for each of them.

    Args:
        data: Question record (dict-like) providing 'question'.

    Returns:
        Tuple (intended_answer, context) of two equally long lists:
        context[i] is a generated answer and intended_answer[i] is the
        phone number that answer was generated for.

    Example output:
    intended_answer: ['+1-555-123-4567', '+44-7700-900123', ...]
    context: ['Sure, you can reach me at +1-555-123-4567.', 'My number is +44-7700-900123.', ...]
    """
    intended_answer = []
    context = []

    phone_numbers = [generate_random_phone_number() for _ in range(5)]

    for option in phone_numbers:
        response_text = generate_number_answers(data['question'], option)
        # Split on the separator and drop empty fragments (e.g. caused by a
        # trailing '§' in the reply)
        texts_array = [
            answer.strip()
            for answer in response_text.split("§")
            if answer.strip()
        ]

        # Label exactly as many answers as were returned; the reply is not
        # guaranteed to contain 5 segments, and both lists must stay the
        # same length for the later multi-column explode.
        intended_answer.extend([option] * len(texts_array))
        context.extend(texts_array)

        time.sleep(3) # Required in the free version to avoid exceeding API limits

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_number_answers(question, option):
    """
    API call to generate spoken-style answers containing a phone number.

    Args:
        question: Question text shown to the user.
        option: Phone number that every answer must contain.

    Returns:
        The stripped model reply; the prompt asks for 5 answers joined by '§'.

    Raises:
        RuntimeError: If the API call or reading the reply text fails.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your response must contain the following phone number explicitly to be valid: '{option}'.
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts at most one argument, so the former two-argument
        # call raised a TypeError that hid the real error. Raise a chained
        # exception that carries the message and the original cause.
        raise RuntimeError(f"Error during API call: {e}") from e

### Date

In [39]:
def handle_date(data):
    """
    Generate spoken-style answers with a relative time reference for a date question.

    The model reply is expected to hold the answers first, followed by the
    same number of offsets in seconds, all joined by '§'.

    Args:
        data: Question record (dict-like) providing 'question'.

    Returns:
        Tuple (intended_answer, context) of two equally long lists:
        context[i] is a generated answer and intended_answer[i] is the
        matching offset in seconds, kept as the string the model returned.

    Raises:
        ValueError: If the reply cannot be split into two equally long halves.

    Example output:
    intended_answer: ['86400', '1814400', ...]
    context: ['Tomorrow would be good ...', 'How about in three weeks ...', ...]
    """
    response_text = generate_date_answers(data['question'])
    # The reply for this prompt sometimes arrives wrapped in quotes; remove
    # them before splitting
    response_text = response_text.strip('"').strip("'")
    # Split on the separator and drop empty fragments (e.g. caused by a
    # trailing '§' in the reply)
    texts_array = [
        answer.strip()
        for answer in response_text.split("§")
        if answer.strip()
    ]

    # Split at the midpoint instead of a fixed index 5, so answers and
    # offsets stay paired even if the model returns a different count.
    if len(texts_array) % 2 != 0:
        raise ValueError(
            f"Expected as many calculations as answers for question "
            f"'{data['question']}', got {len(texts_array)} segments in total"
        )
    half = len(texts_array) // 2
    context = texts_array[:half]
    intended_answer = texts_array[half:]

    time.sleep(3) # Required in the free version to avoid exceeding API limits

    print(f"Generated context for question: '{data['question']}'")

    return intended_answer, context


def generate_date_answers(question):
    """
    Generates responses for date questions.

    Args:
        question: Question text shown to the user.

    Returns:
        The stripped model reply; the prompt asks for 5 answers followed by
        5 offsets in seconds, all joined by '§'.

    Raises:
        RuntimeError: If the API call or reading the reply text fails.
    """
    prompt = f"""
        You are an app user responding to the following question in a conversational, spoken style.
        You enjoy talking, so you respond with full sentences rather than a simple 'yes' or 'no'.
        Question: '{question}'
        Your answer must contain a time reference in the future, such as 'tomorrow', 'in three weeks', etc.
        Additionally you have to give a calculation reference as an Integer value for this
        in seconds without naming a fixed date, e.g. 'tomorrow'=86400; 'in three weeks'=1814400
        Generate 5 unique and varied responses, formatted as: 'answer1§answer2§...§answer5§calculation1§calculation2§...§calculation5'.
        Return only the generated responses in the specified format, without any additional explanation or comments.
    """
    try:
        response = model.generate_content(prompt)
        return response.text.strip()
    except Exception as e:
        # exit() accepts at most one argument, so the former two-argument
        # call raised a TypeError that hid the real error. Raise a chained
        # exception that carries the message and the original cause.
        raise RuntimeError(f"Error during API call: {e}") from e

## Run the defined modules for each provided question and save the results to a JSON file

In [40]:
# Load questionnaires 1-5 from the project repository and merge them into a
# single DataFrame.
dfs = []

for questionnaire in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'

    # The 'id' column is not needed; ignore questionnaires that lack it
    df = pd.read_json(url).drop(columns=['id'], errors='ignore')

    # Each option is a dict; keep only the value stored under 'option'
    df['options'] = df['options'].map(
        lambda entries: [entry['option'] for entry in entries]
    )

    # TEXT, NUMBER and DATE questions get no options
    without_options = df['type'].isin(['TEXT', 'NUMBER', 'DATE'])
    df.loc[without_options, 'options'] = None

    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

df.head()

Unnamed: 0,type,question,options
0,SINGLE_SELECT,Data processing consent,"[Yes, No]"
1,SINGLE_SELECT,Customer group,"[End User, Wholesaler, Distributor, Consultant..."
2,MULTI_SELECT,Products interested in,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A..."
3,MULTI_SELECT,What kind of follow up is planned,"[Email, Phone, Schedule a Visit, No action]"
4,MULTI_SELECT,Who to copy in follow up,"[Stephan Maier, Joachim Wagner, Erik Schneider..."


In [41]:
# Show all free-text questions
df.loc[df['type'] == "TEXT"]

Unnamed: 0,type,question,options
8,TEXT,Notes,
12,TEXT,Any additional notes?,


In [42]:
# Drop every question titled exactly "Notes" (other questions such as
# 'Any additional notes?' are kept).
# .copy() makes the filtered result an independent frame, so the column
# assignment in the next cell does not trigger a SettingWithCopyWarning.
df = df[df['question'] != "Notes"].copy()

In [43]:
new_cols = ['intended_answer', 'context']

# process_question returns (intended_answer, context) for a row; wrapping the
# pair in a Series spreads the two lists over two columns.
generated = df.apply(lambda row: pd.Series(process_question(row)), axis=1)
df[new_cols] = generated

# Turn the per-question lists into one row per generated answer
df = df.explode(new_cols, ignore_index=True)

Generated context for question: 'Data processing consent'
Generated context for question: 'Customer group'
Generated context for question: 'Products interested in'
Generated context for question: 'What kind of follow up is planned'
Generated context for question: 'Who to copy in follow up'
Generated context for question: 'Would you like to receive marketing information from via e-mail?'
Generated context for question: 'What industry are you operating in?'
Generated context for question: 'What products are you interested in?'
Generated context for question: 'What type of company is it?'
Generated context for question: 'What is the size of your company?'
Generated context for question: 'When do you wish to receive a follow-up?'
Generated context for question: 'Any additional notes?'
Generated context for question: 'Which language is wanted for communication? '
Generated context for question: 'What is the type of contact?'
Generated context for question: 'What is the contact person intere

In [44]:
# Write the generated samples to a new JSON file, one object per row
df.to_json('qa_dataset.json', indent=4, orient='records')

# Extending the dataset using AI

In [49]:
# import google.generativeai as genai

# model = genai.GenerativeModel("gemini-2.0-flash-exp")

In [48]:
# prompt = """
#     You are given a dataset with the following columns:
#     - 'type': Specifies the type of input. Possible values are ['SINGLE_SELECT', 'MULTI_SELECT', 'TEXT', 'NUMBER', 'DATE'].
#     - 'question': The question to be answered by an app user, which is a simple string.
#     - 'options': The possible answer options, which can either be a string, an array of strings, or None.
#     - 'intended_answer': The correct answer(s), which could be an integer, a string, or an array of values. If 'options' is not None, 'intended_answer' should be a subset of 'options'.
#     - 'context': Provides context, which helps explain how to answer the 'question' in a natural language manner, based on the 'intended_answer'.

#     Your task is to extend this dataset with similar pairs of samples. Return a JSON file with new data.
# """

# dataset = genai.upload_file('qa_dataset.json')

# result = model.generate_content(
#     [prompt, dataset],
#     generation_config=genai.GenerationConfig(
#         response_mime_type="application/json"),
# )
