<a href="https://colab.research.google.com/github/croco22/CapstoneProjectTDS/blob/philipp/notebooks/Dashboard_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extra Task 3: Live Demo

## Imports and Setup

In [1]:
%%capture
!pip install gradio
!pip install git+https://github.com/openai/whisper.git

import time
import os
from datetime import datetime

import pandas as pd
import gradio as gr
import whisper
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
from google.colab import userdata


# API setup: both credentials are read from the Colab secrets store.
genai.configure(api_key=userdata.get('GOOGLE_API_KEY'))
model = genai.GenerativeModel('gemini-2.0-flash-exp')

# Export the Hugging Face token through the HF_TOKEN environment variable so
# the Hugging Face libraries can pick it up; a bare `userdata.get(...)` call
# would look the secret up and then throw the value away.
os.environ['HF_TOKEN'] = userdata.get('HF_TOKEN')


def generate_text(prompt):
    """Send ``prompt`` to the configured Gemini model and return its reply.

    Args:
        prompt: Text passed to ``model.generate_content``.

    Returns:
        The response text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If the API call or reading the response fails. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        response = model.generate_content(
            prompt,
            generation_config=genai.GenerationConfig(
                temperature=2.0,
            )
        )
        time.sleep(5)  # Avoid exceeding API limits
        return response.text.strip()
    except Exception as e:
        # `exit()` takes a single argument, so calling it with a message AND
        # the exception raised a TypeError that masked the real failure.
        # Raise a descriptive error instead and keep the original as the cause.
        raise RuntimeError(f"Error during API call: {e}") from e

## Load data from the provided questionnaires

In [2]:
# One frame per questionnaire file; they are stacked into `df` at the end.
dfs = []

for q in range(1, 6):
    url = f'https://raw.githubusercontent.com/croco22/CapstoneProjectTDS/refs/heads/main/questionnaires/questionnaire{q}.json'
    temp_df = pd.read_json(url)

    # Flatten every options entry down to the value stored under its 'option' key
    temp_df['options'] = temp_df['options'].apply(
        lambda entries: [entry['option'] for entry in entries]
    )

    # Drop the options of free-form question types: they are irrelevant there
    # and do not contribute meaningfully to the dataset
    is_free_form = temp_df['type'].isin(['TEXT', 'NUMBER', 'DATE'])
    temp_df.loc[is_free_form, 'options'] = None

    # Remember which questionnaire each row came from
    temp_df['questionnaire'] = f"Questionnaire {q}"

    dfs.append(temp_df)

df = pd.concat(dfs, ignore_index=True)

## Rephrase questions if necessary

In [3]:
def rephrase_question(text):
    """Ask the LLM to rewrite ``text`` as a well-formed question.

    ``text`` is embedded in a fixed instruction prompt that is handed to
    ``generate_text``; whatever that call returns is returned unchanged.
    """
    instruction = f"""Reformulate the following statement into a clear, concise,
        and grammatically correct question that maintains its original meaning.
        If the text is already a question, preserve its intent without altering
        content or facts. The statement is: '{text}'.
        Return the generated question without additional explanations,
        comments, or text.
    """
    rephrased = generate_text(instruction)
    return rephrased


# One LLM call per row (see generate_text), so this step is slow.
rephrased_questions = df['question'].apply(rephrase_question)

# Keep only what the chatbot needs and add an empty slot for the user's answer.
df = (
    df
    .assign(rephrased_question=rephrased_questions)
    .loc[:, ['questionnaire', 'rephrased_question', 'options']]
    .assign(answer=None)
)

df.head()

Unnamed: 0,questionnaire,rephrased_question,options,answer
0,Questionnaire 1,Is consent given for data processing?,"[Yes, No]",
1,Questionnaire 1,What is the customer group?,"[End User, Wholesaler, Distributor, Consultant...",
2,Questionnaire 1,What products are you interested in?,"[MY-SYSTEM, Notion, JTS, JS EcoLine, AKW100, A...",
3,Questionnaire 1,What kind of follow-up is planned?,"[Email, Phone, Schedule a Visit, No action]",
4,Questionnaire 1,Who should be copied on the follow-up?,"[Stephan Maier, Joachim Wagner, Erik Schneider...",


## Evaluate context
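For each user message, the code below picks the unanswered question that the message most likely answers. Every candidate question gets two scores: the confidence of a question-answering model and the cosine similarity between the embeddings of the message and the question. The two scores are multiplied, and the question with the highest product is selected.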

In [None]:
# Checkpoints behind the two scoring models used by evaluate_context().
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

qa_pipeline = pipeline(task="question-answering", model=QA_MODEL_NAME)
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [36]:
def evaluate_context(context, unanswered_questions):
    """Pick the unanswered question that ``context`` most plausibly answers.

    Each candidate is scored as the product of the question-answering
    pipeline's ``score`` and the cosine similarity between the embeddings of
    ``context`` and the question.

    Args:
        context: The user's message.
        unanswered_questions: Iterable of question strings that are still open.

    Returns:
        The question with the highest combined score, or ``None`` when there
        are no candidates, ``context`` is not a non-blank string, or no
        candidate reaches a combined score above 0.
    """
    questions = list(unanswered_questions)
    # Nothing to compare: skip the model calls instead of feeding them empty input.
    if not questions or not isinstance(context, str) or not context.strip():
        return None

    # Embed all questions in one batch rather than one model call per question.
    context_embedding = embedding_model.encode([context])
    question_embeddings = embedding_model.encode(questions)
    similarities = cosine_similarity(context_embedding, question_embeddings)[0]

    best_question = None
    highest_score = 0
    for question, similarity in zip(questions, similarities):
        qa_result = qa_pipeline(question=question, context=context)
        combined_score = qa_result['score'] * similarity

        if combined_score > highest_score:
            highest_score = combined_score
            best_question = question

    return best_question

## Gradio application

In [56]:
def evaluate_message(message, unanswered_questions, df_v):
    """Match ``message`` to an open question and record it as that question's answer.

    Args:
        message: The user's message text.
        unanswered_questions: List of open questions, or a ``gr.State``
            wrapping such a list.
        df_v: DataFrame with ``rephrased_question`` and ``answer`` columns.

    Returns:
        ``(uq_list, df_v)``: the list of still-open questions and the answers
        frame. Both are modified in place when a question was matched and are
        returned untouched otherwise.
    """
    if isinstance(unanswered_questions, gr.State):
        uq_list = unanswered_questions.value
    else:
        uq_list = unanswered_questions

    real_question = evaluate_context(message, uq_list)
    if real_question is None:
        # No question matched the message. Removing None from the list would
        # raise ValueError, so leave everything as it is.
        return uq_list, df_v

    # Store what the user actually said instead of a debug placeholder.
    # TODO: map the message onto the question's predefined options.
    # NOTE: rows are matched by question text only, so an identical question
    # in another questionnaire receives the same answer.
    df_v.loc[df_v['rephrased_question'] == real_question, 'answer'] = message
    uq_list.remove(real_question)
    return uq_list, df_v


def set_response(unanswered_questions):
    """Build the assistant's next message from the questions that are still open.

    Accepts either a plain list of questions or a ``gr.State`` wrapping one.
    Returns a completion notice when the list is empty, otherwise a prompt
    followed by the open questions, one per line.
    """
    pending = (
        unanswered_questions.value
        if isinstance(unanswered_questions, gr.State)
        else unanswered_questions
    )

    if len(pending) > 0:
        listing = '\n'.join(pending)
        return "Please answer the following questions:\n" + listing

    return "Thank you! You have completed all the questions in this questionnaire. Please submit your responses by clicking the 'Send' button at the bottom."

In [57]:
def start_chat(selected_questionnaire, history):
    """Open the chat for the chosen questionnaire.

    Looks up the questionnaire's questions in the global ``df``, appends the
    assistant's opening message to ``history`` and returns six values: four
    visibility updates, the updated chat history, and the open questions
    wrapped in a ``gr.State``.
    """
    in_selected = df['questionnaire'] == selected_questionnaire
    questions = df.loc[in_selected, 'rephrased_question'].tolist()

    opening = {"role": "assistant", "content": set_response(questions)}
    history.append(opening)

    hide_first = gr.update(visible=False)
    hide_second = gr.update(visible=False)
    show_history = gr.update(value=history, visible=True)
    show_fourth = gr.update(visible=True)
    show_fifth = gr.update(visible=True)
    open_questions = gr.State(questions)

    return (
        hide_first,
        hide_second,
        show_history,
        show_fourth,
        show_fifth,
        open_questions,
    )


# Loaded Whisper models, keyed by model name, so the weights are read only once.
_WHISPER_MODELS = {}


def _transcribe_audio(audio_path):
    """Transcribe an audio file with Whisper and return the recognised text."""
    if "turbo" not in _WHISPER_MODELS:
        _WHISPER_MODELS["turbo"] = whisper.load_model("turbo")
    return _WHISPER_MODELS["turbo"].transcribe(audio_path)["text"].strip()


def _extract_message_text(message):
    """Return the user's text from a MultimodalTextbox payload (spoken or typed)."""
    if isinstance(message, str):
        return message
    payload = message or {}

    # Recordings are delivered in the "files" list; the legacy "audio" key is
    # still honoured so existing callers keep working.
    audio = payload.get("audio")
    if audio is None:
        files = payload.get("files") or []
        audio = files[0] if files else None
    if isinstance(audio, dict):
        # Some Gradio versions pass file entries as dicts with a "path" key.
        audio = audio.get("path")

    if audio is not None:
        return _transcribe_audio(audio)
    return payload.get("text") or ""


def add_message(history, message, unanswered_questions, df_state):
    """Handle one submitted chat message.

    The message (typed text or a voice recording) is turned into text,
    appended to ``history`` and evaluated against the open questions.

    Args:
        history: Chat history as a list of ``{"role", "content"}`` dicts.
        message: Payload of the multimodal textbox.
        unanswered_questions: List of open questions, or a ``gr.State``
            wrapping one.
        df_state: Answers DataFrame, or a ``gr.State`` wrapping one.

    Returns:
        ``(history, textbox update, open-questions state, answers state)``.
    """
    if isinstance(df_state, gr.State):
        df_v = df_state.value
    else:
        df_v = df_state

    text = _extract_message_text(message)

    if not text.strip():
        # Nothing usable was submitted: clear the input and keep all state.
        if isinstance(unanswered_questions, gr.State):
            pending = unanswered_questions.value
        else:
            pending = unanswered_questions
        return history, gr.MultimodalTextbox(value=None), gr.State(pending), gr.State(df_v)

    history.append({"role": "user", "content": text})
    uq_list, df_new = evaluate_message(text, unanswered_questions, df_v)

    if len(uq_list) == 0:
        # Keep the collected answers in the state: replacing them with an
        # empty dict would make the subsequent download fail.
        return history, gr.update(visible=False), gr.State([]), gr.State(df_new)
    return history, gr.MultimodalTextbox(value=None), gr.State(uq_list), gr.State(df_new)


def bot(history, unanswered_questions):
    """Stream the assistant's next message into the chat, one character at a time.

    Appends an empty assistant entry to ``history``, then grows its content
    character by character, yielding ``history`` after every step.
    """
    reply_text = set_response(unanswered_questions)
    reply = {"role": "assistant", "content": ""}
    history.append(reply)

    # Typewriter effect: reveal the reply gradually
    for ch in reply_text:
        reply["content"] = reply["content"] + ch
        time.sleep(0.01)
        yield history


def download(selected_questionnaire, df_state):
    """Write the answers of one questionnaire to a timestamped JSON file.

    Args:
        selected_questionnaire: Value of the ``questionnaire`` column to export.
        df_state: Answers DataFrame, or a ``gr.State`` wrapping one.

    Returns:
        The name of the written file (``answers_<YYYYmmdd_HHMMSS>.json`` in the
        working directory), or ``None`` when the state holds no usable
        DataFrame and there is nothing to export.
    """
    if isinstance(df_state, gr.State):
        df_v = df_state.value
    else:
        df_v = df_state

    # The state may not hold a usable frame (e.g. after it was reset to an
    # empty placeholder); indexing it would raise, so export nothing instead.
    required_columns = {'questionnaire', 'rephrased_question', 'answer'}
    if not isinstance(df_v, pd.DataFrame) or not required_columns.issubset(df_v.columns):
        return None

    current_timestamp = datetime.now()
    formatted_timestamp = current_timestamp.strftime("%Y%m%d_%H%M%S")
    filename = f"answers_{formatted_timestamp}.json"

    filtered_df = df_v[df_v['questionnaire'] == selected_questionnaire].copy()
    filtered_df = filtered_df[['rephrased_question', 'answer']].rename(columns={"rephrased_question": "question"})
    filtered_df.to_json(filename, orient='records', indent=4)

    return filename


def reset_state():
    """Return the six updates that bring the interface back to its start screen.

    Two components are shown again, two are cleared and hidden, one is hidden,
    and the open-questions state is replaced by an empty list.
    """
    show_first = gr.update(visible=True)
    show_second = gr.update(visible=True)
    cleared_history = gr.update(value=[], visible=False)
    cleared_input = gr.update(value={}, visible=False)
    hide_fifth = gr.update(visible=False)
    no_open_questions = gr.State([])

    return (
        show_first,
        show_second,
        cleared_history,
        cleared_input,
        hide_fifth,
        no_open_questions,
    )

In [59]:
# Build the Gradio interface: a start screen (dropdown + start button) and a
# chat screen (chatbot + input box + send button) that starts out hidden.
with gr.Blocks(theme='Nymbo/Nymbo_Theme') as app:
    gr.Markdown("""
        # Questionnaire Chatbot 👨‍💻🚀
        ### by Philipp Landeck
        <br>
    """)

    # State holders that are passed in and out of the callbacks below:
    # the questions that are still open and the answers DataFrame.
    unanswered_questions = gr.State(value=[])
    df_state = gr.State(value=df)

    # Start screen: questionnaire selection, pre-set to the first questionnaire in df
    dropdown = gr.Dropdown(
        choices=list(df['questionnaire'].unique()),
        # multiselect=True,
        label="Choose a questionnaire",
        interactive=True,
        visible=True,
        value=df['questionnaire'].iloc[0]
    )

    start_button = gr.Button("Start", visible=True)

    # Chat screen: hidden until start_chat() makes it visible
    chatbot = gr.Chatbot(
        show_label=False,
        type="messages",
        visible=False
    )

    # Input box for typed messages, with the microphone as its only extra source
    chat_input = gr.MultimodalTextbox(
        interactive=True,
        sources=['microphone'],
        placeholder="Enter message or record voice...",
        show_label=False,
        autofocus=True,
        visible=False
    )

    # download() is supplied as the button's value, with the dropdown and the
    # answers state as its inputs
    send_button = gr.DownloadButton("Send", value=download, visible=False, inputs=[dropdown, df_state])

    # Start button -> start_chat(): its six return values map onto these six outputs, in order
    start_button.click(
        start_chat, [dropdown, chatbot], [dropdown, start_button, chatbot, chat_input, send_button, unanswered_questions]
    )

    # Submitting the input box -> add_message(): records the message and updates both states
    chat_msg = chat_input.submit(
        add_message, [chatbot, chat_input, unanswered_questions, df_state], [chatbot, chat_input, unanswered_questions, df_state]
    )

    # Chained after add_message(): bot() streams the assistant's next message into the chat
    chat_msg.then(bot, [chatbot, unanswered_questions], chatbot)

    # Send button -> reset_state(): its six return values map onto these six outputs, in order.
    # NOTE(review): df_state is not among the outputs, so recorded answers are not cleared here.
    send_button.click(
        reset_state, None, [dropdown, start_button, chatbot, chat_input, send_button, unanswered_questions]
    )


# share=True requests a public URL; with debug=True the cell keeps running so
# that errors and logs stay visible (see the launch output below).
app.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://d8f519c0d342a4a8f1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


      questionnaire                                 rephrased_question  \
0   Questionnaire 1              Is consent given for data processing?   
1   Questionnaire 1                        What is the customer group?   
2   Questionnaire 1               What products are you interested in?   
3   Questionnaire 1                 What kind of follow-up is planned?   
4   Questionnaire 1             Who should be copied on the follow-up?   
5   Questionnaire 2           Do you want to receive marketing emails?   
6   Questionnaire 2                What industry are you operating in?   
7   Questionnaire 2               What products are you interested in?   
8   Questionnaire 2                   What notes are you referring to?   
9   Questionnaire 3                        What type of company is it?   
10  Questionnaire 3                  What is the size of your company?   
11  Questionnaire 3                When do you want to be followed up?   
12  Questionnaire 3                   

