# Evaluation demo
This notebook shows how to evaluate intent recognition, including the macro F1 scores for each subset.
Download the predicted scores from our baseline models at [this link](https://cornell.box.com/s/5g5q7tnak1le5cxa3nv69xep6o7e7uwi) and save them to `ROOT`.

In [1]:
import os
import warnings
from collections import defaultdict
from typing import List

import numpy as np
import pandas as pd
import torch
from sklearn.exceptions import UndefinedMetricWarning

# silence sklearn's undefined-metric warnings before the evaluation helpers are imported
warnings.filterwarnings(action='ignore', category=UndefinedMetricWarning)

from eval_utils import eval_all_metrics, SUBSET2IDS

ROOT = ""  # folder where you placed the downloaded score files

In [12]:
def get_allresults_df(root: str) -> pd.DataFrame:
    """Organize the evaluation results of the baseline models in a table.

    For each of the three baseline model ablations, the saved scores of five
    runs (``{model_type}_{run_num}.pth`` inside ``root``) are loaded and
    evaluated with ``eval_all_metrics``. Every metric is scaled by 100 and
    summarized over the runs as ``"mean +- std"`` (``np.std`` with its default
    settings).

    Note: a later cell of this notebook defines another function with the same
    name but a different signature; once that cell has run, this version is
    shadowed.

    Args:
        root: folder containing the score files. An empty string refers to
            the current working directory.

    Returns:
        A DataFrame with one row per model: a ``model`` column, one
        ``val-{subset}`` column per key of ``SUBSET2IDS``, and one column per
        metric returned by ``eval_all_metrics`` whose name does not end in
        ``"none"``.
    """
    summary_fmt = "{:.2f} +- {:.2f}"
    data_dict = defaultdict(list)
    # these are the three baseline model ablations
    for model_type in ["image", "image_cam", "image_hs_cam"]:
        data_dict["model"].append(model_type)
        all_f1s = defaultdict(list)

        # get results for each run
        for run_num in range(5):
            # os.path.join keeps an empty `root` relative to the working
            # directory instead of pointing at the filesystem root ("/...").
            score_path = os.path.join(root, f"{model_type}_{run_num}.pth")
            # NOTE(review): torch.load unpickles the file; only load score
            # files obtained from a trusted source.
            d_dict = torch.load(score_path)
            f1_dict = eval_all_metrics(
                d_dict["val_scores"], d_dict["test_scores"],
                d_dict["val_targets"], d_dict["test_targets"]
            )
            for k, v in f1_dict.items():
                if np.ndim(v) == 0:
                    # any scalar (Python or NumPy, float or int)
                    all_f1s[k].append(v * 100)
                else:
                    # array-like metric: keep it as one row so runs can be stacked
                    all_f1s[k].append(np.array(v)[np.newaxis, :] * 100)

        # one row per run; the original comment notes a 5 x 28 shape
        val_f1s = np.vstack(all_f1s["val_none"])

        for e_type, c_ids in SUBSET2IDS.items():
            # average the columns listed for this subset within each run
            e_f1s = np.mean(np.hstack([val_f1s[:, c:c+1] for c in c_ids]), 1)
            data_dict[f"val-{e_type}"].append(
                summary_fmt.format(np.mean(e_f1s), np.std(e_f1s))
            )

        for k, values in all_f1s.items():
            # "*none" entries are array-valued and are only used for the subsets above
            if not k.endswith("none"):
                data_dict[k].append(
                    summary_fmt.format(np.mean(values), np.std(values))
                )
    return pd.DataFrame(data_dict)

In [13]:
# Baseline table: these are the same results reported in README.md
# (each cell is "mean +- std" over the runs, scaled by 100).
get_allresults_df(ROOT)

Unnamed: 0,model,val-easy,val-medium,val-hard,val-object,val-context,val-other,val_micro,val_samples,val_macro,test_micro,test_samples,test_macro
0,image,54.64 +- 2.54,24.92 +- 1.18,10.71 +- 1.33,25.58 +- 2.51,30.16 +- 2.97,21.34 +- 0.74,31.36 +- 1.16,29.91 +- 1.73,23.03 +- 0.79,30.23 +- 0.73,28.45 +- 1.71,22.77 +- 0.59
1,image_cam,57.10 +- 1.84,25.68 +- 1.24,12.72 +- 2.31,28.15 +- 1.94,28.62 +- 2.13,22.60 +- 1.40,32.87 +- 1.13,32.46 +- 1.18,24.42 +- 0.95,32.07 +- 0.84,30.91 +- 1.27,24.37 +- 0.65
2,image_hs_cam,58.86 +- 2.56,26.30 +- 1.42,13.11 +- 2.15,29.66 +- 2.19,32.48 +- 1.34,22.61 +- 0.48,32.94 +- 1.16,33.61 +- 0.92,25.07 +- 0.52,31.28 +- 0.36,31.39 +- 0.78,23.98 +- 0.85


In [11]:
def get_allresults_df(root: str, model_types: List[str], tags: List[str]) -> pd.DataFrame:
    """Organize the evaluation results of the given models in a table.

    For every model in ``model_types``, the saved scores of each run tag
    (``{model_type}_{run_tag}.pt`` inside ``root``) are loaded and evaluated
    with ``eval_all_metrics``. The ``threshold`` entry of each result is
    printed; every other metric is scaled by 100 and summarized over the tags
    as ``"mean +- std"`` (``np.std`` with its default settings).

    Args:
        root: folder containing the score files. An empty string refers to
            the current working directory.
        model_types: file-name prefixes of the models to evaluate.
        tags: file-name suffixes identifying the runs of each model.

    Returns:
        A DataFrame with one row per model: a ``model`` column, one
        ``val-{subset}`` column per key of ``SUBSET2IDS``, and one column per
        metric returned by ``eval_all_metrics`` whose name is not
        ``"threshold"`` and does not end in ``"none"``.
    """
    summary_fmt = "{:.2f} +- {:.2f}"
    data_dict = defaultdict(list)
    for model_type in model_types:
        data_dict["model"].append(model_type)
        all_f1s = defaultdict(list)

        # get results for each run
        for run_tag in tags:
            # os.path.join keeps an empty `root` relative to the working
            # directory instead of pointing at the filesystem root ("/...").
            score_path = os.path.join(root, f"{model_type}_{run_tag}.pt")
            # NOTE(review): torch.load unpickles the file; only load score
            # files obtained from a trusted source.
            d_dict = torch.load(score_path)
            f1_dict = eval_all_metrics(
                d_dict["val_scores"], d_dict["test_scores"],
                d_dict["val_targets"], d_dict["test_targets"]
            )
            print(f"{model_type}_{run_tag} best F1 threshold: {f1_dict['threshold']}")
            for k, v in f1_dict.items():
                if k == "threshold":
                    # reported above; it is not a score, so keep it out of the table
                    continue
                if np.ndim(v) == 0:
                    # any scalar (Python or NumPy, float or int)
                    all_f1s[k].append(v * 100)
                else:
                    # array-like metric: keep it as one row so runs can be stacked
                    all_f1s[k].append(np.array(v)[np.newaxis, :] * 100)

        # one row per run tag
        val_f1s = np.vstack(all_f1s["val_none"])

        for e_type, c_ids in SUBSET2IDS.items():
            # average the columns listed for this subset within each run
            e_f1s = np.mean(np.hstack([val_f1s[:, c:c+1] for c in c_ids]), 1)
            data_dict[f"val-{e_type}"].append(
                summary_fmt.format(np.mean(e_f1s), np.std(e_f1s))
            )

        for k, values in all_f1s.items():
            # "*none" entries are array-valued and are only used for the subsets above
            if not k.endswith("none"):
                data_dict[k].append(
                    summary_fmt.format(np.mean(values), np.std(values))
                )
    return pd.DataFrame(data_dict)

In [33]:
# Model variants to evaluate; each name is the file-name prefix of its score files
# (get_allresults_df loads "<model>_<tag>.pt").
models = [
    "resnet50",
    "resnet50_aug",
    "resnet50_loc",
    "resnet50_aug_loc",
    "virtex",
    "virtex_loc",
    "swin_tiny",
    "swin_small",
    "resnet50_aug_ht",
    "resnet50_aug_ht_loc",
    "virtex_ht",
    "virtex_ht_loc",
    "swin_tiny_ht",
]
# Run tags used as file-name suffixes; results are aggregated (mean +- std) over them.
# NOTE(review): presumably checkpoints selected by best validation metric plus
# fixed epochs 9 and 19 — TODO confirm.
tags = [
    "best_macro_f1",
    "best_micro_f1",
    "best_samples_f1",
    "9",
    "19",
]
# NOTE(review): reads from the literal "scores" folder rather than ROOT.
get_allresults_df("scores", models, tags)

resnet50_best_macro_f1 best F1 threshold: 0.1
resnet50_best_micro_f1 best F1 threshold: 0.05
resnet50_best_samples_f1 best F1 threshold: 0.1
resnet50_9 best F1 threshold: 0.1
resnet50_19 best F1 threshold: 0.05
resnet50_aug_best_macro_f1 best F1 threshold: 0.15
resnet50_aug_best_micro_f1 best F1 threshold: 0.1
resnet50_aug_best_samples_f1 best F1 threshold: 0.1
resnet50_aug_9 best F1 threshold: 0.1
resnet50_aug_19 best F1 threshold: 0.15
resnet50_loc_best_macro_f1 best F1 threshold: 0.1
resnet50_loc_best_micro_f1 best F1 threshold: 0.1
resnet50_loc_best_samples_f1 best F1 threshold: 0.1
resnet50_loc_9 best F1 threshold: 0.1
resnet50_loc_19 best F1 threshold: 0.1
resnet50_aug_loc_best_macro_f1 best F1 threshold: 0.1
resnet50_aug_loc_best_micro_f1 best F1 threshold: 0.1
resnet50_aug_loc_best_samples_f1 best F1 threshold: 0.1
resnet50_aug_loc_9 best F1 threshold: 0.1
resnet50_aug_loc_19 best F1 threshold: 0.1
virtex_best_macro_f1 best F1 threshold: 0.05
virtex_best_micro_f1 best F1 thresh

Unnamed: 0,model,val-easy,val-medium,val-hard,val-object,val-context,val-other,val_micro,val_samples,val_macro,test_micro,test_samples,test_macro
0,resnet50,63.79 +- 2.04,30.66 +- 3.13,13.93 +- 2.35,36.18 +- 1.50,33.67 +- 3.42,24.73 +- 1.45,37.24 +- 3.35,41.27 +- 2.90,28.23 +- 1.47,37.36 +- 3.47,40.50 +- 2.62,28.73 +- 1.61
1,resnet50_aug,68.27 +- 2.44,35.74 +- 1.33,15.64 +- 3.30,37.63 +- 2.33,45.77 +- 4.19,28.55 +- 1.35,43.09 +- 1.69,45.46 +- 1.81,32.05 +- 0.64,41.57 +- 2.93,43.19 +- 3.13,31.31 +- 1.49
2,resnet50_loc,65.62 +- 2.98,30.85 +- 2.49,14.03 +- 1.25,34.72 +- 2.87,38.12 +- 4.88,25.29 +- 1.48,39.42 +- 2.81,43.20 +- 2.48,28.57 +- 1.23,38.23 +- 2.69,40.95 +- 2.04,28.41 +- 0.65
3,resnet50_aug_loc,68.79 +- 1.80,35.23 +- 0.93,19.22 +- 2.16,37.90 +- 1.16,42.46 +- 2.07,30.35 +- 1.21,43.76 +- 1.61,46.71 +- 1.39,33.11 +- 0.92,43.02 +- 1.13,45.14 +- 0.93,32.91 +- 0.43
4,virtex,66.91 +- 1.82,34.51 +- 1.46,20.28 +- 3.29,37.60 +- 0.30,34.78 +- 4.97,30.97 +- 0.95,41.35 +- 2.05,45.29 +- 0.78,32.90 +- 0.34,40.89 +- 2.67,44.22 +- 1.10,32.31 +- 1.05
5,virtex_loc,68.13 +- 1.65,36.68 +- 2.05,15.69 +- 1.84,38.36 +- 2.80,40.26 +- 1.06,29.61 +- 0.58,44.68 +- 1.39,47.57 +- 0.86,32.56 +- 0.98,44.54 +- 1.27,46.40 +- 0.90,32.62 +- 0.98
6,swin_tiny,70.43 +- 2.27,39.57 +- 3.34,18.50 +- 2.41,40.45 +- 1.75,41.00 +- 6.03,32.88 +- 1.44,45.40 +- 3.49,48.53 +- 2.89,35.35 +- 1.65,44.54 +- 3.40,46.92 +- 2.96,33.08 +- 1.98
7,swin_small,70.49 +- 2.54,42.81 +- 1.49,16.14 +- 4.39,43.21 +- 2.13,45.31 +- 2.56,32.73 +- 1.32,47.38 +- 1.43,50.93 +- 1.12,36.25 +- 1.16,46.15 +- 1.10,48.48 +- 0.93,34.80 +- 0.71
8,resnet50_aug_ht,68.48 +- 3.73,33.99 +- 2.15,15.55 +- 3.77,35.14 +- 2.17,41.22 +- 1.08,28.55 +- 0.71,42.41 +- 2.68,45.85 +- 1.62,31.10 +- 0.33,41.55 +- 3.77,43.79 +- 3.11,30.91 +- 2.30
9,resnet50_aug_ht_loc,68.55 +- 2.75,34.44 +- 2.18,13.70 +- 2.13,35.83 +- 2.22,41.88 +- 2.51,27.61 +- 0.93,42.34 +- 2.55,45.39 +- 2.14,30.69 +- 1.29,41.90 +- 3.01,44.17 +- 2.39,31.62 +- 1.59
