<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/branch1/Capstone_Project_Group8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Capstone Project: Questionnaires

In [2]:
import json
import time
import google.generativeai as genai
from google.colab import userdata
import requests

# Read the Gemini API key from the Colab secret store and configure the client.
key = userdata.get('GOOGLE_API_KEY')

genai.configure(api_key=key)

# Model handle used by generate_answer (looked up there as a global).
ai_model = genai.GenerativeModel('gemini-1.5-flash')

# Download the first questionnaire from the project repository.
url = 'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire1.json'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    data1 = response.json()
    print("Retrieved file 'questionnaire1.json'")
else:
    # Stop here: later cells need `data1`, and carrying on would only surface
    # as an unrelated NameError further down the notebook.
    raise RuntimeError(
        f"Error while retrieving 'questionnaire1.json': HTTP {response.status_code}"
    )

Retrieved file 'questionnaire1.json'


In [3]:
def process_json(data, verbose=False, delay=3, answer_fn=None):
    """
    Generate spoken answers for each option in the JSON data.

    Every item that has an 'options' key gets one generator call per option.
    The reply is split on the § sign, each part is stripped, and the non-empty
    parts are stored as a list under the option's 'text' key. `data` is
    modified in place and also returned.

    Args:
        data: iterable of question dicts with a 'question' key and, optionally,
            an 'options' list whose entries have an 'option' key.
        verbose: if True, print a line for every option that got answers.
        delay: seconds to pause after each generator call; 0 disables the pause.
        answer_fn: callable (question, option) -> str or None used to produce
            the raw reply. Defaults to generate_answer.

    Returns:
        The same `data` object. Options whose generator call returned nothing
        are left without a 'text' key.
    """
    if answer_fn is None:
        answer_fn = generate_answer

    for item in data:
        if 'options' not in item:
            continue
        for option in item['options']:
            response_text = answer_fn(item['question'], option['option'])
            if response_text:
                answers = [answer.strip() for answer in response_text.split("§")]
                # A leading/trailing § yields empty fragments; those are not answers.
                option['text'] = [answer for answer in answers if answer]
                if verbose:
                    print(f"Answers generated for question '{item['question']}' and option '{option['option']}'")
            # Pause after every call, failed ones included, so that an error
            # response does not lead to an immediate burst of further requests.
            # Required in the free version to avoid exceeding API limits.
            if delay > 0:
                time.sleep(delay)
    return data


def generate_answer(question, option):
    """
    Ask the generative model for spoken-style answers to one question/option pair.

    The prompt requests 5 answers separated by a § sign. The call goes through
    the global `ai_model`.

    Args:
        question: question text inserted into the prompt.
        option: answer content the generated responses should convey.

    Returns:
        The model's response text with surrounding whitespace removed, or None
        if anything raised during the call (the error is printed).
    """
    request_text = f"""
    You are the user of an app and you are responding in a spoken style to the following question.
    You like to talk so you don't just say yes or no but rather answer with a whole sentence.
    Question: "{question}"
    Your answer should contain following content (e.g. if the content of the answer is yes, you convey this in your response):
    Content of the answer: "{option}"
    The responses should be in the following format and be kind of random so that each answer is in a different style.
    Generate 5 answers that are split by a § sign and contain only text.
    answer1§answer2§...§answer5
    """
    try:
        reply = ai_model.generate_content(request_text)
        cleaned = reply.text.strip()
    except Exception as err:
        # Best effort: report the problem and let the caller skip this option.
        print("Error during API call: ", err)
        return None
    else:
        return cleaned

In [4]:
def predict_answers(processed_data, verbose=False):
    """
    Predict answers for each option in the JSON data.

    For every generated answer text, each option of the question is scored:
    an option whose label occurs (case-insensitively) in the text gets 1.0,
    otherwise the 'score' returned by the global `qa_pipeline` for the context
    "<text> <option>" is used. The highest-scoring option is the prediction.

    Items without an 'options' key and options without a 'text' key (which is
    what process_json leaves behind when generation fails) are skipped instead
    of raising KeyError.

    Args:
        processed_data: iterable of question dicts as returned by process_json.
        verbose: if True, print each text with its correct and predicted option.

    Returns:
        Fraction of texts whose predicted option equals the option they were
        generated for, or 0 if no texts were evaluated.
    """
    correct_count = 0
    total_count = 0

    for item in processed_data:
        options = item.get("options")
        if not options:
            continue
        question = item["question"]

        for option in options:
            correct_option = option["option"]
            for text in option.get("text", []):
                # Normalize once per text instead of once per candidate option.
                normalized_text = text.strip().lower()
                predictions = []

                for other_option in options:
                    label = other_option["option"]
                    # NOTE: plain substring test, so a short label also matches
                    # when it appears inside a longer word of the text.
                    if label.strip().lower() in normalized_text:
                        predictions.append((label, 1.0))
                    else:
                        result = qa_pipeline(question=question, context=f"{text} {label}")
                        predictions.append((label, result["score"]))

                # max() returns the first of several equally scored options.
                predicted_option = max(predictions, key=lambda x: x[1])[0]

                if verbose:
                    print(f"Text: {text}")
                    print(f"Correct: {correct_option}, Predicted: {predicted_option}\n")
                if predicted_option == correct_option:
                    correct_count += 1
                total_count += 1

    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

In [5]:
# Generate the answer variants for every option of questionnaire 1.
# process_json fills `data1` in place and returns that same object.
processed_data = process_json(data=data1, verbose=True)

Answers generated for question 'Data processing consent' and option 'Yes'
Answers generated for question 'Data processing consent' and option 'No'
Answers generated for question 'Customer group' and option 'End User'
Answers generated for question 'Customer group' and option 'Wholesaler, Distributor'
Answers generated for question 'Customer group' and option 'Consultant, Planner, Architect'
Answers generated for question 'Customer group' and option 'R&D'
Answers generated for question 'Products interested in' and option 'MY-SYSTEM'
Answers generated for question 'Products interested in' and option 'Notion'
Answers generated for question 'Products interested in' and option 'JTS'
Answers generated for question 'Products interested in' and option 'JS EcoLine'
Answers generated for question 'Products interested in' and option 'AKW100'
Answers generated for question 'Products interested in' and option 'AX100'
Answers generated for question 'What kind of follow up is planned' and option 'Ema

In [6]:
# Persist the processed questionnaire (with the generated answers) as UTF-8 JSON.
with open('q1_processed.json', mode='w', encoding='utf-8') as output_file:
    json.dump(processed_data, output_file, indent=4, ensure_ascii=False)

In [7]:
from transformers import pipeline

# Question-answering pipeline; predict_answers looks it up as a global.
qa_pipeline = pipeline(task="question-answering", model="deepset/roberta-base-squad2")

# Score the generated answers of questionnaire 1 and report the result in percent.
accuracy = predict_answers(processed_data=processed_data, verbose=True)
print(f"Accuracy: {accuracy * 100:.2f}%")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

Text: Yeah, sure, go ahead and process my data, I'm cool with that.
Correct: Yes, Predicted: Yes

Text: Absolutely!  I'm happy for you to use my data.
Correct: Yes, Predicted: Yes

Text: Yes, data processing consent granted.  No problem at all!
Correct: Yes, Predicted: Yes

Text: Okay, yes, you have my consent to process the data.
Correct: Yes, Predicted: Yes

Text: Yep, that's a yes from me!  Process away.
Correct: Yes, Predicted: Yes

Text: Nope, I'd rather not give consent for data processing, thanks.
Correct: No, Predicted: No

Text: Nah, I'm good on that whole data processing consent thing.  I'm a little wary of it.
Correct: No, Predicted: No

Text: No way, I'm not comfortable with my data being processed.
Correct: No, Predicted: No

Text: I'm gonna have to say no to data processing consent, I'm not feeling it today.
Correct: No, Predicted: No

Text: No, I don't consent to data processing.
Correct: No, Predicted: No

Text: Oh, definitely, I'd say the customer group is the end user

In [9]:
# For each questionnaire: download it, generate answers, then score the predictions.
for questionnaire in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{questionnaire}.json'
    # A timeout keeps the loop from hanging indefinitely on an unresponsive server.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Error while parsing a file: ", response.status_code)
        # Skip this questionnaire. Falling through would either hit an undefined
        # `data` or silently re-evaluate the previous questionnaire's data and
        # report it under this questionnaire's number.
        continue

    data = response.json()
    print(f"Retrieved file 'questionnaire{questionnaire}.json'")

    processed_data = process_json(data)

    accuracy = predict_answers(processed_data)
    print(f"Accuracy for questionnaire {questionnaire}: {accuracy * 100:.2f}%")

Retrieved file 'questionnaire1.json'


KeyboardInterrupt: 