In [1]:
%load_ext autoreload
%autoreload 2

import sys
# Make the sibling source directory importable; presumably this is where the
# `benchmark` and `utils_ext` packages imported below live.
sys.path.append("../src")

In [2]:
import logging
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import widgets

from benchmark import (
    INVALID_ANSWER,
    NO_ANSWER,
    VALID_ANSWER,
    aggregate_responses,
    calibration_curve,
    create_fig_accuracy_distribution,
    create_fig_calibration_curve,
    create_fig_calibration_ece,
    create_fig_confidence_distribution,
    create_fig_informativeness_diversity,
    create_fig_meaningfulness_kldiv,
    create_subplots,
    detect_names_from_dict,
    empirical_distr,
    extract_predictions,
    kl_div,
    load_predictions,
    load_responses,
    load_responses_all,
    plot_annotation,
    plot_calibration_curve,
    plot_confidence_histogram,
    plot_heatmap,
    save_fig,
    save_predictions,
)
from utils_ext.plot import get_figlayout
from utils_ext.tools import setup_logging
from utils_ext.widgets import (
    FileExplorerWidget,
    build_widget_outputs,
    display_table,
)

# Disable matplotlib's interactive mode; figures are displayed or saved
# explicitly by the cells below.
plt.ioff()
# Project helper (implementation not shown here); presumably installs the logging configuration.
setup_logging()

logger = logging.getLogger(__name__)

# Root directory the cells below read responses/predictions from and write plots to.
PATH_OUTPUT = "../results"

In [3]:
# Shared dict handed to the response loaders as `dataset_cache`; presumably lets
# them reuse datasets that were already loaded -- confirm in `benchmark`.
DATASET_CACHE = {}

## Load results

In [None]:
# OPTION 1: load responses and extract predictions
# This is the only cell that defines `responses_all`, which the "Answer statistics"
# cell needs in addition to the predictions.
responses_all = load_responses_all(f"{PATH_OUTPUT}/responses", dataset_cache=DATASET_CACHE)
# `sample=1000` presumably limits how many predictions are kept -- TODO confirm in `benchmark`.
y_true_all, y_pred_all = extract_predictions(responses_all, sample=1000)

# One-off exports, kept commented out; OPTION 2 reads from the "predictions_sampled" directory.
# save_predictions(responses_all, f"{PATH_OUTPUT}/predictions", sample=None)
# save_predictions(responses_all, f"{PATH_OUTPUT}/predictions_sampled", sample=1000)

In [4]:
# OPTION 2: load predictions only
# Rebinds `y_true_all` / `y_pred_all` (overwriting OPTION 1's values if that cell was run);
# it does not define `responses_all`.
y_true_all, y_pred_all = load_predictions(f"{PATH_OUTPUT}/predictions_sampled")

## Answer statistics

In [None]:
def display_responses_all_answer_stats(responses_all, save=None):
    """Plot how many responses fall into each answer category.

    Two figures are produced:

    1. A grid of bar charts (rows: methods, columns: models) showing, per
       dataset, the number of valid / no-answer / invalid responses. Each
       subplot is annotated with the totals over all datasets.
    2. Two heatmaps (methods x models) with the fraction of valid answers and
       the fraction of "no answer" responses over all datasets. Cells without
       any response are NaN.

    Args:
        responses_all: Nested mapping ``responses_all[dataset][model][method]``.
            A leaf is either ``None`` (combination unavailable; counted as zero
            responses) or a mapping from ``VALID_ANSWER`` / ``NO_ANSWER`` /
            ``INVALID_ANSWER`` to a collection supporting ``len()``.
        save: Directory that receives ``answer_statistics.png`` and
            ``answer_statistics_heatmap.png``. When falsy, the figures are
            shown instead of saved.
    """
    dataset_names, model_names, method_names = detect_names_from_dict(responses_all)
    categories = [VALID_ANSWER, NO_ANSWER, INVALID_ANSWER]

    # compute
    # answer_stats[model, method] maps every dataset, in `dataset_names` order, to
    # its (valid, no answer, invalid) counts. Unavailable combinations are stored
    # as zeros right away so that the bar order is the same in every subplot.
    answer_stats = {}
    answer_stats_over_dataset = {}
    for model_name in model_names:
        for method_name in method_names:
            per_dataset = {}
            for dataset_name in dataset_names:
                responses = responses_all[dataset_name][model_name][method_name]
                if responses is None:
                    per_dataset[dataset_name] = (0, 0, 0)
                else:
                    per_dataset[dataset_name] = tuple(len(responses[category]) for category in categories)
            answer_stats[model_name, method_name] = per_dataset
            # Explicit sums (rather than zip(*...)) keep the result a 3-tuple
            # even when there are no datasets.
            answer_stats_over_dataset[model_name, method_name] = tuple(
                sum(counts[k] for counts in per_dataset.values()) for k in range(len(categories))
            )

    # plot
    fig1, ax1 = create_subplots(method_names, model_names, sharey=True)
    for i, method_name in enumerate(method_names):
        for j, model_name in enumerate(model_names):
            n_valid_answer, n_no_answer, n_invalid_answer = answer_stats_over_dataset[model_name, method_name]
            n_total = n_valid_answer + n_no_answer + n_invalid_answer

            # One row per dataset, one column per answer category.
            df = pd.DataFrame.from_dict(answer_stats[model_name, method_name], columns=categories, orient="index")
            df.plot.bar(ax=ax1[i, j], color={VALID_ANSWER: "green", NO_ANSWER: "orange", INVALID_ANSWER: "red"})
            ax1[i, j].legend(loc="upper right")

            text = f"valid:     {n_valid_answer}/{n_total}"
            text += f"\nno answer: {n_no_answer}/{n_total}"
            text += f"\ninvalid:   {n_invalid_answer}/{n_total}"
            plot_annotation(ax1[i, j], text)
    if save:
        save_fig(fig1, f"{save}/answer_statistics.png")
    else:
        # NOTE(review): the documented signature of plt.show has no figure
        # parameter; what this positional argument does depends on the active
        # backend. Kept as in the original.
        plt.show(fig1)

    # Fractions over all datasets; NaN marks pairs without any response.
    percentage_valid_answer = np.full((len(method_names), len(model_names)), np.nan)
    percentage_no_answer = np.full((len(method_names), len(model_names)), np.nan)
    for i, method_name in enumerate(method_names):
        for j, model_name in enumerate(model_names):
            n_valid_answer, n_no_answer, n_invalid_answer = answer_stats_over_dataset[model_name, method_name]
            n_total = n_valid_answer + n_no_answer + n_invalid_answer
            if n_total > 0:
                percentage_valid_answer[i, j] = n_valid_answer / n_total
                percentage_no_answer[i, j] = n_no_answer / n_total
    fig2, ax2 = plt.subplots(**get_figlayout(ncols=2, width=7, ratio=(len(method_names), len(model_names))), layout="constrained")
    ax2[0].set_title("% valid answer")
    plot_heatmap(ax2[0], percentage_valid_answer, method_names, model_names, vmin=0, vmax=1, cmap="RdYlGn")
    ax2[1].set_title("% no answer")
    plot_heatmap(ax2[1], percentage_no_answer, method_names, model_names, vmin=0, vmax=1, cmap="RdYlGn_r")
    if save:
        save_fig(fig2, f"{save}/answer_statistics_heatmap.png")
    else:
        plt.show(fig2)

# Needs `responses_all`, which only the OPTION 1 loading cell defines.
display_responses_all_answer_stats(responses_all, save=f"{PATH_OUTPUT}/plots")

## Overall analysis

In [None]:
def display_responses_all_over_datasets(y_true_all, y_pred_all, n_bins=20, save=None):
    """Compare methods and models with predictions pooled over all datasets.

    For every (method, model) pair the predictions of all datasets are pooled
    with ``aggregate_responses`` and summarised into ``scores``, a dict of
    ``(n_methods, n_models)`` arrays:

    * ``ece``: sum over non-empty calibration bins of
      ``bin_count / n * |prob_true - prob_pred|``;
    * ``accuracy`` / ``confidence``: mean of ``y_true`` / ``y_pred``;
    * ``confidence_n_distinct``: number of distinct values in ``y_pred``;
    * ``confidence_variance``: ``np.std(y_pred)``, i.e. a standard deviation
      despite the key name (the key is kept because the whole dict is handed
      to the plotting helpers, which may read it);
    * ``kl_div_over_dataset``: mean over datasets of ``kl_div`` between the
      binned per-dataset predictions and the binned pooled predictions.

    The figures themselves are produced by the ``create_fig_*`` helpers.

    Args:
        y_true_all: Nested mapping ``[dataset][model][method]``; presumably the
            correctness labels -- confirm against ``extract_predictions``.
        y_pred_all: Same layout; presumably the stated confidences.
        n_bins: Number of bins for the calibration curve and the empirical
            distributions.
        save: Forwarded unchanged to the ``create_fig_*`` helpers.
    """
    dataset_names, model_names, method_names = detect_names_from_dict(y_true_all)

    # compute
    shape = (len(method_names), len(model_names))
    scores = {
        score_name: np.zeros(shape)
        for score_name in (
            "ece",
            "accuracy",
            "confidence",
            "confidence_n_distinct",
            "confidence_variance",
            "kl_div_over_dataset",
        )
    }
    for i, method_name in enumerate(method_names):
        for j, model_name in enumerate(model_names):
            y_true = aggregate_responses(y_true_all, dataset_names, model_name, method_name)
            y_pred = aggregate_responses(y_pred_all, dataset_names, model_name, method_name)

            prob_true, prob_pred, _, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
            # Empty bins are excluded from the sum via `where`.
            scores["ece"][i, j] = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
            scores["accuracy"][i, j] = np.mean(y_true)
            scores["confidence"][i, j] = np.mean(y_pred)
            scores["confidence_n_distinct"][i, j] = len(np.unique(y_pred))
            scores["confidence_variance"][i, j] = np.std(y_pred)

            # The pooled reference distribution is the same for every dataset, so
            # it is binned once from `y_pred` instead of being re-aggregated per
            # dataset. Assumes aggregate_responses/empirical_distr/kl_div are pure.
            pooled_distr = empirical_distr(y_pred, n_bins)
            scores["kl_div_over_dataset"][i, j] = np.mean([
                kl_div(
                    empirical_distr(y_pred_all[dataset_name][model_name][method_name], n_bins),
                    pooled_distr,
                )
                for dataset_name in dataset_names
            ])

    # plot
    # The two integers index into [dataset_names, model_names, method_names];
    # presumably they select the row and column axes of the grid -- TODO confirm.
    create_fig_accuracy_distribution("all_over_datasets", y_true_all, [dataset_names, model_names, method_names], 2, 1, n_bins, save=save)
    create_fig_confidence_distribution("all_over_datasets", y_pred_all, [dataset_names, model_names, method_names], 2, 1, n_bins, save=save)
    create_fig_calibration_curve("all_over_datasets", y_true_all, y_pred_all, [dataset_names, model_names, method_names], 2, 1, n_bins, save=save)

    create_fig_calibration_ece("all_over_datasets", scores, method_names, model_names, save=save)
    create_fig_informativeness_diversity("all_over_datasets", scores, method_names, model_names, save=save)
    create_fig_meaningfulness_kldiv("all_over_datasets", scores, method_names, model_names, save=save)

# `save` is not passed here, so the plotting helpers receive save=None.
display_responses_all_over_datasets(y_true_all, y_pred_all)

In [None]:
def display_responses_all_over_models(y_true_all, y_pred_all, n_bins=20, save=None):
    """Compare methods and datasets with predictions pooled over all models.

    For every (method, dataset) pair the predictions of all models are pooled
    with ``aggregate_responses`` and summarised into ``scores``, a dict of
    ``(n_methods, n_datasets)`` arrays:

    * ``ece``: sum over non-empty calibration bins of
      ``bin_count / n * |prob_true - prob_pred|``;
    * ``accuracy`` / ``confidence``: mean of ``y_true`` / ``y_pred``;
    * ``confidence_n_distinct``: number of distinct values in ``y_pred``;
    * ``confidence_variance``: ``np.std(y_pred)``, i.e. a standard deviation
      despite the key name;
    * ``kl_div_over_dataset``: mean over models of ``kl_div`` between the
      binned predictions of (dataset, model, method) and the binned predictions
      of (model, method) pooled over all datasets.

    The figures themselves are produced by the ``create_fig_*`` helpers.

    Args:
        y_true_all: Nested mapping ``[dataset][model][method]``; presumably the
            correctness labels -- confirm against ``extract_predictions``.
        y_pred_all: Same layout; presumably the stated confidences.
        n_bins: Number of bins for the calibration curve and the empirical
            distributions.
        save: Forwarded unchanged to the ``create_fig_*`` helpers.
    """
    dataset_names, model_names, method_names = detect_names_from_dict(y_true_all)

    # compute
    shape = (len(method_names), len(dataset_names))
    scores = {
        score_name: np.zeros(shape)
        for score_name in (
            "ece",
            "accuracy",
            "confidence",
            "confidence_n_distinct",
            "confidence_variance",
            "kl_div_over_dataset",
        )
    }
    for i, method_name in enumerate(method_names):
        # The over-datasets reference distribution of each model does not depend
        # on the dataset, so it is built once per method rather than once per
        # (method, dataset). Assumes aggregate_responses/empirical_distr/kl_div
        # are pure.
        pooled_distr = {
            model_name: empirical_distr(
                aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins
            )
            for model_name in model_names
        }
        for j, dataset_name in enumerate(dataset_names):
            y_true = aggregate_responses(y_true_all, dataset_name, model_names, method_name)
            y_pred = aggregate_responses(y_pred_all, dataset_name, model_names, method_name)

            prob_true, prob_pred, _, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
            # Empty bins are excluded from the sum via `where`.
            scores["ece"][i, j] = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
            scores["accuracy"][i, j] = np.mean(y_true)
            scores["confidence"][i, j] = np.mean(y_pred)
            scores["confidence_n_distinct"][i, j] = len(np.unique(y_pred))
            scores["confidence_variance"][i, j] = np.std(y_pred)
            scores["kl_div_over_dataset"][i, j] = np.mean([
                kl_div(
                    empirical_distr(y_pred_all[dataset_name][model_name][method_name], n_bins),
                    pooled_distr[model_name],
                )
                for model_name in model_names
            ])

    # plot
    # The two integers index into [dataset_names, model_names, method_names];
    # presumably they select the row and column axes of the grid -- TODO confirm.
    create_fig_accuracy_distribution("all_over_models", y_true_all, [dataset_names, model_names, method_names], 2, 0, n_bins, save=save)
    create_fig_confidence_distribution("all_over_models", y_pred_all, [dataset_names, model_names, method_names], 2, 0, n_bins, save=save)
    create_fig_calibration_curve("all_over_models", y_true_all, y_pred_all, [dataset_names, model_names, method_names], 2, 0, n_bins, save=save)

    create_fig_calibration_ece("all_over_models", scores, method_names, dataset_names, save=save)
    create_fig_informativeness_diversity("all_over_models", scores, method_names, dataset_names, save=save)
    create_fig_meaningfulness_kldiv("all_over_models", scores, method_names, dataset_names, save=save)

# `save` is not passed here, so the plotting helpers receive save=None.
display_responses_all_over_models(y_true_all, y_pred_all)

In [None]:
def display_responses_all_over_methods(y_true_all, y_pred_all, n_bins=20, save=None):
    """Compare datasets and models with predictions pooled over all methods.

    For every (dataset, model) pair the predictions of all methods are pooled
    with ``aggregate_responses`` and summarised into ``scores``, a dict of
    ``(n_datasets, n_models)`` arrays:

    * ``ece``: sum over non-empty calibration bins of
      ``bin_count / n * |prob_true - prob_pred|``;
    * ``accuracy`` / ``confidence``: mean of ``y_true`` / ``y_pred``;
    * ``confidence_n_distinct``: number of distinct values in ``y_pred``;
    * ``confidence_variance``: ``np.std(y_pred)``, i.e. a standard deviation
      despite the key name;
    * ``kl_div_over_dataset``: mean over methods of ``kl_div`` between the
      binned predictions of (dataset, model, method) and the binned predictions
      of (model, method) pooled over all datasets.

    The figures themselves are produced by the ``create_fig_*`` helpers.

    Args:
        y_true_all: Nested mapping ``[dataset][model][method]``; presumably the
            correctness labels -- confirm against ``extract_predictions``.
        y_pred_all: Same layout; presumably the stated confidences.
        n_bins: Number of bins for the calibration curve and the empirical
            distributions.
        save: Forwarded unchanged to the ``create_fig_*`` helpers.
    """
    dataset_names, model_names, method_names = detect_names_from_dict(y_true_all)

    # compute
    shape = (len(dataset_names), len(model_names))
    scores = {
        score_name: np.zeros(shape)
        for score_name in (
            "ece",
            "accuracy",
            "confidence",
            "confidence_n_distinct",
            "confidence_variance",
            "kl_div_over_dataset",
        )
    }
    # The over-datasets reference distribution depends only on (model, method),
    # so it is built once up front instead of once per dataset. Assumes
    # aggregate_responses/empirical_distr/kl_div are pure.
    pooled_distr = {
        (model_name, method_name): empirical_distr(
            aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins
        )
        for model_name in model_names
        for method_name in method_names
    }
    for i, dataset_name in enumerate(dataset_names):
        for j, model_name in enumerate(model_names):
            y_true = aggregate_responses(y_true_all, dataset_name, model_name, method_names)
            y_pred = aggregate_responses(y_pred_all, dataset_name, model_name, method_names)

            prob_true, prob_pred, _, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
            # Empty bins are excluded from the sum via `where`.
            scores["ece"][i, j] = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
            scores["accuracy"][i, j] = np.mean(y_true)
            scores["confidence"][i, j] = np.mean(y_pred)
            scores["confidence_n_distinct"][i, j] = len(np.unique(y_pred))
            scores["confidence_variance"][i, j] = np.std(y_pred)
            scores["kl_div_over_dataset"][i, j] = np.mean([
                kl_div(
                    empirical_distr(y_pred_all[dataset_name][model_name][method_name], n_bins),
                    pooled_distr[model_name, method_name],
                )
                for method_name in method_names
            ])

    # plot
    # The two integers index into [dataset_names, model_names, method_names];
    # presumably they select the row and column axes of the grid -- TODO confirm.
    create_fig_accuracy_distribution("all_over_methods", y_true_all, [dataset_names, model_names, method_names], 0, 1, n_bins, save=save)
    create_fig_confidence_distribution("all_over_methods", y_pred_all, [dataset_names, model_names, method_names], 0, 1, n_bins, save=save)
    create_fig_calibration_curve("all_over_methods", y_true_all, y_pred_all, [dataset_names, model_names, method_names], 0, 1, n_bins, save=save)

    create_fig_calibration_ece("all_over_methods", scores, dataset_names, model_names, save=save)
    create_fig_informativeness_diversity("all_over_methods", scores, dataset_names, model_names, save=save)
    create_fig_meaningfulness_kldiv("all_over_methods", scores, dataset_names, model_names, save=save)

# `save` is not passed here, so the plotting helpers receive save=None.
display_responses_all_over_methods(y_true_all, y_pred_all)

## Per-model analysis

In [None]:
def display_responses_per_model(y_true_all, y_pred_all, model_name, n_bins=20, save=None):
    """Analyse a single model across all methods and datasets.

    ``scores`` is a dict of ``(n_methods, n_datasets)`` arrays computed from the
    predictions of ``model_name``:

    * ``ece``: sum over non-empty calibration bins of
      ``bin_count / n * |prob_true - prob_pred|``;
    * ``accuracy`` / ``confidence``: mean of ``y_true`` / ``y_pred``;
    * ``confidence_n_distinct``: number of distinct values in ``y_pred``;
    * ``confidence_variance``: ``np.std(y_pred)``, i.e. a standard deviation
      despite the key name;
    * ``kl_div_over_dataset``: ``kl_div`` between the binned predictions of the
      dataset and the binned predictions pooled over all datasets.

    The figures themselves are produced by the ``create_fig_*`` helpers under
    the name ``per_model/<model_name>``.

    Args:
        y_true_all: Nested mapping ``[dataset][model][method]``; presumably the
            correctness labels -- confirm against ``extract_predictions``.
        y_pred_all: Same layout; presumably the stated confidences.
        model_name: Key of the model to analyse.
        n_bins: Number of bins for the calibration curve and the empirical
            distributions.
        save: Forwarded unchanged to the ``create_fig_*`` helpers.
    """
    dataset_names, _, method_names = detect_names_from_dict(y_true_all)

    # compute
    shape = (len(method_names), len(dataset_names))
    scores = {
        score_name: np.zeros(shape)
        for score_name in (
            "ece",
            "accuracy",
            "confidence",
            "confidence_n_distinct",
            "confidence_variance",
            "kl_div_over_dataset",
        )
    }
    for i, method_name in enumerate(method_names):
        # The pooled reference distribution is shared by all datasets of this
        # method, so it is built once here instead of once per dataset. Assumes
        # aggregate_responses/empirical_distr/kl_div are pure.
        pooled_distr = empirical_distr(
            aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins
        )
        for j, dataset_name in enumerate(dataset_names):
            y_true = y_true_all[dataset_name][model_name][method_name]
            y_pred = y_pred_all[dataset_name][model_name][method_name]
            prob_true, prob_pred, _, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
            # Empty bins are excluded from the sum via `where`.
            scores["ece"][i, j] = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
            scores["accuracy"][i, j] = np.mean(y_true)
            scores["confidence"][i, j] = np.mean(y_pred)
            scores["confidence_n_distinct"][i, j] = len(np.unique(y_pred))
            scores["confidence_variance"][i, j] = np.std(y_pred)
            scores["kl_div_over_dataset"][i, j] = kl_div(empirical_distr(y_pred, n_bins), pooled_distr)

    # plot
    # The two integers index into [dataset_names, model_name, method_names];
    # presumably they select the row and column axes of the grid -- TODO confirm.
    create_fig_accuracy_distribution(f"per_model/{model_name}", y_true_all, [dataset_names, model_name, method_names], 2, 0, n_bins, save=save)
    create_fig_confidence_distribution(f"per_model/{model_name}", y_pred_all, [dataset_names, model_name, method_names], 2, 0, n_bins, save=save)
    create_fig_calibration_curve(f"per_model/{model_name}", y_true_all, y_pred_all, [dataset_names, model_name, method_names], 2, 0, n_bins, save=save)

    create_fig_calibration_ece(f"per_model/{model_name}", scores, method_names, dataset_names, save=save)
    create_fig_informativeness_diversity(f"per_model/{model_name}", scores, method_names, dataset_names, save=save)
    create_fig_meaningfulness_kldiv(f"per_model/{model_name}", scores, method_names, dataset_names, save=save)

# Run the per-model analysis for every benchmarked model, in the original order.
# `save` is left at its default (None), unlike the per-method cell further down.
for model_name in [
    "gemma1.1-2b-it",
    "gemma1.1-7b-it",
    "llama3-8b-instruct",
    "llama3-70b-instruct",
    "qwen1.5-7b-chat",
    "qwen1.5-32b-chat",
    "qwen1.5-72b-chat",
    "qwen1.5-110b-chat",
    "gpt3.5-turbo",
    "gpt4o-mini",
    "gpt4o",
]:
    display_responses_per_model(y_true_all, y_pred_all, model_name, n_bins=10)

## Per-method analysis

In [None]:
def display_responses_per_method(y_true_all, y_pred_all, method_name, n_bins=20, save=None):
    """Analyse a single method across all datasets and models.

    ``scores`` is a dict of ``(n_datasets, n_models)`` arrays computed from the
    predictions of ``method_name``:

    * ``ece``: sum over non-empty calibration bins of
      ``bin_count / n * |prob_true - prob_pred|``;
    * ``accuracy`` / ``confidence``: mean of ``y_true`` / ``y_pred``;
    * ``confidence_n_distinct``: number of distinct values in ``y_pred``;
    * ``confidence_variance``: ``np.std(y_pred)``, i.e. a standard deviation
      despite the key name;
    * ``kl_div_over_dataset``: ``kl_div`` between the binned predictions of the
      dataset and the binned predictions of the same model pooled over all
      datasets.

    The figures themselves are produced by the ``create_fig_*`` helpers under
    the name ``per_method/<method_name>``.

    Args:
        y_true_all: Nested mapping ``[dataset][model][method]``; presumably the
            correctness labels -- confirm against ``extract_predictions``.
        y_pred_all: Same layout; presumably the stated confidences.
        method_name: Key of the method to analyse.
        n_bins: Number of bins for the calibration curve and the empirical
            distributions.
        save: Forwarded unchanged to the ``create_fig_*`` helpers.
    """
    dataset_names, model_names, _ = detect_names_from_dict(y_true_all)

    # compute
    shape = (len(dataset_names), len(model_names))
    scores = {
        score_name: np.zeros(shape)
        for score_name in (
            "ece",
            "accuracy",
            "confidence",
            "confidence_n_distinct",
            "confidence_variance",
            "kl_div_over_dataset",
        )
    }
    # The pooled reference distribution depends only on the model, so it is
    # built once per model instead of once per (dataset, model). Assumes
    # aggregate_responses/empirical_distr/kl_div are pure.
    pooled_distr = {
        model_name: empirical_distr(
            aggregate_responses(y_pred_all, dataset_names, model_name, method_name), n_bins
        )
        for model_name in model_names
    }
    for i, dataset_name in enumerate(dataset_names):
        for j, model_name in enumerate(model_names):
            y_true = y_true_all[dataset_name][model_name][method_name]
            y_pred = y_pred_all[dataset_name][model_name][method_name]
            prob_true, prob_pred, _, bin_count = calibration_curve(y_true, y_pred, n_bins=n_bins)
            # Empty bins are excluded from the sum via `where`.
            scores["ece"][i, j] = np.sum(bin_count / len(y_true) * np.abs(prob_true - prob_pred), where=bin_count > 0)
            scores["accuracy"][i, j] = np.mean(y_true)
            scores["confidence"][i, j] = np.mean(y_pred)
            scores["confidence_n_distinct"][i, j] = len(np.unique(y_pred))
            scores["confidence_variance"][i, j] = np.std(y_pred)
            scores["kl_div_over_dataset"][i, j] = kl_div(empirical_distr(y_pred, n_bins), pooled_distr[model_name])

    # plot
    # The two integers index into [dataset_names, model_names, method_name];
    # presumably they select the row and column axes of the grid -- TODO confirm.
    create_fig_accuracy_distribution(f"per_method/{method_name}", y_true_all, [dataset_names, model_names, method_name], 0, 1, n_bins, save=save)
    create_fig_confidence_distribution(f"per_method/{method_name}", y_pred_all, [dataset_names, model_names, method_name], 0, 1, n_bins, save=save)
    create_fig_calibration_curve(f"per_method/{method_name}", y_true_all, y_pred_all, [dataset_names, model_names, method_name], 0, 1, n_bins, save=save)

    create_fig_calibration_ece(f"per_method/{method_name}", scores, dataset_names, model_names, save=save)
    create_fig_informativeness_diversity(f"per_method/{method_name}", scores, dataset_names, model_names, save=save)
    create_fig_meaningfulness_kldiv(f"per_method/{method_name}", scores, dataset_names, model_names, save=save)

# Run the per-method analysis for every confidence-elicitation method, in the
# original order, passing {PATH_OUTPUT}/plots as the `save` location.
for method_name in [
    "basic_1s",
    "basic_1s_probscore",
    "basic_1s_1shot",
    "basic_1s_5shot",
    "advanced_1s",
    "advanced_1s_probscore",
    "combo_1s_v2",
    "tian2023just_1s_top1",
    "tian2023just_1s_top4",
    "xiong2023can_vanilla",
]:
    display_responses_per_method(y_true_all, y_pred_all, method_name, n_bins=10, save=f"{PATH_OUTPUT}/plots")

## Responses viewer

In [None]:
def run_responses_viewer():
    """Build and display an interactive viewer for one responses file.

    A ``FileExplorerWidget`` (rooted at ``{PATH_OUTPUT}/responses``) selects a
    file; on every observed change the file is loaded and rendered as a
    confidence histogram, a calibration curve, answer counts, and four tables
    (valid/correct, valid/incorrect, no answer, invalid). Takes no arguments
    and returns nothing; all output goes through the displayed widgets.
    """
    def display_responses(*args):
        """Observer callback: load and render the currently selected file.

        The observer arguments are ignored; the selection is read from
        ``fe_path.value``.
        """
        # Nothing selected yet.
        if fe_path.value is None:
            return

        # The selected path is taken to end in <dataset>/<model>/<method>.<ext>;
        # everything before those three components is the responses root.
        path_components = fe_path.value.split(os.sep)
        path_responses = os.sep.join(path_components[:-3])
        dataset_name = path_components[-3]
        model_name = path_components[-2]
        method_name = os.path.splitext(path_components[-1])[0]

        # load dataset and responses
        # Loading happens inside `out_log` so that anything it prints is captured there.
        with out_log:
            out_log.clear_output(wait=True)
            responses = load_responses(path_responses, dataset_name, model_name, method_name, dataset_cache=DATASET_CACHE)
            # Split the valid answers by correctness: exactly 1 counts as correct,
            # anything below 1 as incorrect.
            responses_valid_correct = [(prompt, response) for prompt, response in responses[VALID_ANSWER] if response["is_correct"] == 1]
            responses_valid_incorrect = [(prompt, response) for prompt, response in responses[VALID_ANSWER] if response["is_correct"] < 1]
        # display plot
        with out_responses:
            out_responses.clear_output(wait=True)

            # Only valid answers enter the histogram and the calibration curve.
            y_true = [response["is_correct"] for _, response in responses[VALID_ANSWER]]
            y_pred = [response["confidence"] for _, response in responses[VALID_ANSWER]]

            fig, ax = plt.subplots(**get_figlayout(ncols=2, width=4), layout="constrained")
            fig.suptitle(f"{dataset_name} / {model_name} / {method_name}")
            ax[0].set(
                title="Histogram of confidence scores",
                xlabel="confidence",
                ylabel="count",
            )
            ax[0].title.set_fontsize("medium")
            plot_confidence_histogram(ax[0], y_pred, n_bins=20)
            ax[1].set(
                title="Calibration curve of confidence scores",
                xlabel="confidence",
                ylabel="accuracy",
            )
            ax[1].title.set_fontsize("medium")
            plot_calibration_curve(ax[1], y_true, y_pred, n_bins=20)
            plt.show(fig)

            # Textual summary of the answer categories, printed below the figure.
            n_valid_answer = len(responses[VALID_ANSWER])
            n_valid_answer_correct = len(responses_valid_correct)
            n_valid_answer_incorrect = len(responses_valid_incorrect)
            n_no_answer = len(responses[NO_ANSWER])
            n_invalid_answer = len(responses[INVALID_ANSWER])
            n_total = n_valid_answer + n_no_answer + n_invalid_answer
            print(f"valid answers:   {n_valid_answer}/{n_total}")
            print(f"  correct:         {n_valid_answer_correct}/{n_valid_answer}")
            print(f"  incorrect:       {n_valid_answer_incorrect}/{n_valid_answer}")
            print(f"no answers:      {n_no_answer}/{n_total}")
            print(f"invalid answers: {n_invalid_answer}/{n_total}")
        # display responses
        # One output area per category; each is cleared before its table is redrawn.
        with out_responses_tables["valid_correct"]:
            out_responses_tables["valid_correct"].clear_output(wait=True)
            display_responses_table(responses_valid_correct)
        with out_responses_tables["valid_incorrect"]:
            out_responses_tables["valid_incorrect"].clear_output(wait=True)
            display_responses_table(responses_valid_incorrect)
        with out_responses_tables["no_answer"]:
            out_responses_tables["no_answer"].clear_output(wait=True)
            display_responses_table(responses[NO_ANSWER])
        with out_responses_tables["invalid"]:
            out_responses_tables["invalid"].clear_output(wait=True)
            display_responses_table(responses[INVALID_ANSWER])

    def display_responses_table(responses):
        """Render ``(prompt, response)`` pairs as a left-aligned table.

        ``responses`` is an iterable of ``(prompt, response)`` pairs; the keys
        read from each are the ones listed in the dict below.
        """
        # Flatten each pair into one row; the raw model outputs in
        # response["responses"] are joined with a dashed separator line.
        responses = [
            {
                "id": prompt["id"],
                "prompt": prompt["content"],
                "correct answer": prompt["correct_answer"],
                "response": "\n----------\n".join(response["responses"]),
                "answer": response["answer"],
                "confidence": response["confidence"],
                "is_correct": response["is_correct"],
            } for prompt, response in responses
        ]
        # Newlines become <br> so that line breaks are kept in the rendered table.
        responses = pd.DataFrame(responses).replace("\n", "<br>", regex=True)
        display_table(responses, html_align="left")

    # Widgets: the file selector plus the output areas written to by the callback.
    fe_path = FileExplorerWidget(
        ["path", "dataset", "model", "method"],
        default=f"{PATH_OUTPUT}/responses",
    )
    # No trait names are given, so the callback is registered for the widget's
    # default observed changes (project widget; details not visible here).
    fe_path.observe(display_responses)
    out_log = widgets.Output()
    out_responses = widgets.Output()
    out_responses_tables = build_widget_outputs(["valid_correct", "valid_incorrect", "no_answer", "invalid"], layout={"max_height": "500px", "overflow": "auto"})

    # `display` is the IPython builtin available in notebook cells.
    display(widgets.VBox([
        fe_path,
        out_log,
        out_responses,
        widgets.HTML("<b>valid answers (correct)</b>"),
        out_responses_tables["valid_correct"],
        widgets.HTML("<b>valid answers (incorrect)</b>"),
        out_responses_tables["valid_incorrect"],
        widgets.HTML("<b>no answers</b>"),
        out_responses_tables["no_answer"],
        widgets.HTML("<b>invalid answers</b>"),
        out_responses_tables["invalid"],
    ]))

# Interactive only: shows the viewer widgets; nothing is saved to disk by this cell.
run_responses_viewer()