### 0. Evaluation 과정을 옮긴 것 (분석 모듈 추가한 거 없음)

In [1]:
import pprint
from tqdm import tqdm, trange
import numpy as np
import os
from collections import OrderedDict, defaultdict
from utils.basic_utils import AverageMeter

import torch
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
from torch.utils.data import DataLoader

from qd_detr.config import TestOptions
from qd_detr.model import build_model
from qd_detr.span_utils import span_cxw_to_xx
from qd_detr.start_end_dataset import StartEndDataset, start_end_collate, prepare_batch_inputs
from qd_detr.start_end_dataset_audio import \
    StartEndDataset_audio, start_end_collate_audio, prepare_batch_inputs_audio
from qd_detr.postprocessing_qd_detr import PostProcessorDETR
from standalone_eval.eval import *
from utils.basic_utils import save_jsonl, save_json, load_jsonl
from utils.temporal_nms import temporal_nms

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Option values for this evaluation run, keyed by option name.
# The section headings below only group related names for readability;
# insertion order is the same as in the original literal.
opt_dict = dict(
    # --- dataset / run bookkeeping ---
    dset_name="hl",
    dset_domain=None,
    eval_split_name="val",
    debug=False,
    data_ratio=1.0,
    results_root="results",
    exp_id="exp",
    seed=2018,
    device=0,
    num_workers=4,
    no_pin_memory=False,
    # --- training schedule ---
    lr=0.0001,
    lr_drop=400,
    wd=0.0001,
    n_epoch=200,
    max_es_cnt=200,
    bsz=32,
    eval_bsz=100,
    grad_clip=0.1,
    eval_untrained=False,
    resume=None,
    resume_all=False,
    start_epoch=None,
    # --- input data and features ---
    max_q_l=32,
    max_v_l=75,
    clip_length=2,
    max_windows=5,
    train_path="data/highlight_train_release.jsonl",
    eval_path="data/highlight_val_release.jsonl",
    no_norm_vfeat=False,
    no_norm_tfeat=False,
    v_feat_dirs=["../features/slowfast_features"],
    t_feat_dir="../features/clip_text_features/",
    a_feat_dir=None,
    v_feat_dim=2304,
    t_feat_dim=512,
    a_feat_dim=None,
    ctx_mode="video_tef",
    use_cliptext=None,
    text_ratio=0.5,
    # --- model ---
    position_embedding="sine",
    enc_layers=2,
    dec_layers=2,
    dim_feedforward=1024,
    hidden_dim=256,
    input_dropout=0.5,
    dropout=0.1,
    txt_drop_ratio=0,
    use_txt_pos=False,
    nheads=8,
    num_queries=10,
    pre_norm=False,
    n_input_proj=2,
    contrastive_hdim=64,
    temperature=0.07,
    # --- losses and matching costs ---
    lw_saliency=1.0,
    saliency_margin=0.2,
    aux_loss=True,
    span_loss_type="l1",
    contrastive_align_loss=False,
    set_cost_span=10,
    set_cost_giou=1,
    set_cost_class=4,
    span_loss_coef=10,
    giou_loss_coef=1,
    label_loss_coef=4,
    eos_coef=0.1,
    contrastive_align_loss_coef=0.0,
    # --- post-processing of predictions ---
    no_sort_results=False,
    max_before_nms=10,
    max_after_nms=10,
    conf_thd=0.0,
    nms_thd=-1,
    # --- location of the experiment being evaluated ---
    results_dir="results/hl-video_tef-exp-2024_01_23_17_11_15",
)

In [3]:
from argparse import Namespace

# Wrap the option dictionary so every entry is reachable as an attribute
# (opt.<name>), the same way parsed command-line arguments would be.
opt = Namespace()
vars(opt).update(opt_dict)

In [4]:
# Point `resume` at the "model_best.ckpt" file inside the results directory,
# then echo the resulting path as the cell output.
opt.resume = f"{opt.results_dir}/model_best.ckpt"
opt.resume

'results/hl-video_tef-exp-2024_01_23_17_11_15/model_best.ckpt'

In [5]:
from utils.basic_utils import mkdirp, load_json, save_json, make_zipfile, dict_to_markdown

# File and directory names used inside an experiment's results directory.
saved_option_filename = "opt.json"
ckpt_filename = "model.ckpt"
tensorboard_log_dir = "tensorboard_log"
train_log_filename = "train.log.txt"
eval_log_filename = "eval.log.txt"

# The saved options file is read from the directory that holds the checkpoint.
opt.model_dir = os.path.dirname(opt.resume)
saved_options = load_json(os.path.join(opt.model_dir, saved_option_filename))

# Overwrite the current options with the saved ones, except for the names
# listed here, which keep the value already set on `opt`.
# ("max_before_nms" and "max_after_nms" are deliberately NOT excluded.)
for arg in saved_options:
    if arg in ("results_root", "num_workers", "nms_thd", "debug",
               "max_pred_l", "min_pred_l",
               "resume", "resume_all", "no_sort_results"):
        continue
    setattr(opt, arg, saved_options[arg])

args = vars(opt)
# Display settings
print(dict_to_markdown(args, max_str_len=120))

# Paths derived from the results directory.
opt.ckpt_filepath = os.path.join(opt.results_dir, ckpt_filename)
opt.train_log_filepath = os.path.join(opt.results_dir, train_log_filename)
opt.eval_log_filepath = os.path.join(opt.results_dir, eval_log_filename)
opt.tensorboard_log_dir = os.path.join(opt.results_dir, tensorboard_log_dir)

# A negative device index selects the CPU; any other index selects CUDA.
opt.device = torch.device("cpu" if opt.device < 0 else "cuda")
opt.pin_memory = not opt.no_pin_memory

# Flags derived from the context-mode string.
opt.use_tef = "tef" in opt.ctx_mode
opt.use_video = "video" in opt.ctx_mode
# Without "video" the feature size starts from 0; "tef" adds 2 to it.
opt.v_feat_dim = (opt.v_feat_dim if opt.use_video else 0) + (2 if opt.use_tef else 0)

|                             | 0                                                            |
|:----------------------------|:-------------------------------------------------------------|
| dset_name                   | hl                                                           |
| dset_domain                 |                                                              |
| eval_split_name             | val                                                          |
| debug                       | False                                                        |
| data_ratio                  | 1.0                                                          |
| results_root                | results                                                      |
| exp_id                      | exp                                                          |
| seed                        | 2018                                                         |
| device                      | 0                 

In [6]:
from qd_detr.inference import *

In [7]:
# The dataset is built from the file at opt.eval_path, so it has to be set.
assert opt.eval_path is not None

# Labels are loaded only when the evaluation split is "val".
loadlabel = opt.eval_split_name == 'val'

# Constructor arguments, taken from the restored options.
eval_dataset_kwargs = dict(
    dset_name=opt.dset_name,
    data_path=opt.eval_path,
    v_feat_dirs=opt.v_feat_dirs,
    q_feat_dir=opt.t_feat_dir,
    a_feat_dir=opt.a_feat_dir,
    q_feat_type="last_hidden_state",
    max_q_l=opt.max_q_l,
    max_v_l=opt.max_v_l,
    ctx_mode=opt.ctx_mode,
    data_ratio=opt.data_ratio,
    # The options store "no_norm_*" flags; the dataset takes the positive form.
    normalize_v=not opt.no_norm_vfeat,
    normalize_t=not opt.no_norm_tfeat,
    clip_len=opt.clip_length,
    max_windows=opt.max_windows,
    load_labels=loadlabel,
    span_loss_type=opt.span_loss_type,
    # Fixed to 0 here regardless of the value stored in the options.
    txt_drop_ratio=0,
    dset_domain=opt.dset_domain,
)
eval_dataset = StartEndDataset_audio(**eval_dataset_kwargs)

In [8]:
# Load the saved predictions for this split from the model directory.
save_submission_filename = f"hl_{opt.eval_split_name}_submission.jsonl"
submission = load_jsonl(f"{opt.model_dir}/{save_submission_filename}")

# The dataset's records serve as the ground truth for the evaluation.
ground_truth = eval_dataset.data

# Score the predictions; verbose output follows the debug flag, and
# match_number is switched off in debug mode.
metrics = eval_submission(
    submission,
    eval_dataset.data,
    verbose=opt.debug,
    match_number=not opt.debug,
)

short: [0, 10], 429/1550=27.68 examples.
middle: [10, 30], 957/1550=61.74 examples.
long: [30, 150], 574/1550=37.03 examples.
full: [0, 150], 1550/1550=100.00 examples.


In [9]:
# Display the metrics object returned by eval_submission.
metrics

OrderedDict([('brief',
              OrderedDict([('MR-full-R1@0.5', 59.87),
                           ('MR-full-R1@0.7', 43.87),
                           ('MR-full-mAP', 38.64),
                           ('MR-full-mAP@0.5', 60.07),
                           ('MR-full-mAP@0.75', 38.38),
                           ('MR-long-mAP', 44.15),
                           ('MR-middle-mAP', 41.12),
                           ('MR-short-mAP', 6.78),
                           ('HL-min-Fair-mAP', 73.62),
                           ('HL-min-Fair-Hit1', 73.35),
                           ('HL-min-Good-mAP', 62.49),
                           ('HL-min-Good-Hit1', 71.16),
                           ('HL-min-VeryGood-mAP', 38.12),
                           ('HL-min-VeryGood-Hit1', 60.45)])),
             ('HL-min-Fair', {'HL-mAP': 73.62, 'HL-Hit1': 73.35}),
             ('HL-min-Good', {'HL-mAP': 62.49, 'HL-Hit1': 71.16}),
             ('HL-min-VeryGood', {'HL-mAP': 38.12, 'HL-Hit1': 60.45}),
   

In [10]:
# Named length ranges as [lower, upper] bounds; `names` and `l_ranges` are
# index-aligned. NOTE(review): bounds are presumably in seconds — confirm.
names = ["short", "middle", "long", "full"]
l_ranges = [[0, 10], [10, 30], [30, 150], [0, 150]]

# Index of the range to analyse (3 -> "full").
moment_len_idx = 3
name, l_range = names[moment_len_idx], l_ranges[moment_len_idx]

# Restrict predictions and ground truth to the selected range.
_submission, _ground_truth = get_data_by_range(submission, ground_truth, l_range)

# Report how many ground-truth examples remain in the selected range.
n_selected, n_total = len(_ground_truth), len(ground_truth)
print(f"{name}: {l_range}, {n_selected}/{n_total}={100*n_selected/n_total:.2f} examples.")

full: [0, 150], 1550/1550=100.00 examples.


In [11]:
from functools import partial

# Caps on the number of windows used per query; None means "use all of them".
max_gt_windows = None
max_pred_windows = 10

# IoU thresholds 0.5 ... 0.95, rounded to two decimals so they print cleanly as keys.
iou_thds = [float(f"{thd:.2f}") for thd in np.linspace(0.5, 0.95, 10)]

# Group predicted windows by query id. A key is only created when a window is
# appended, so queries without any predicted window do not appear.
pred_qid2data = defaultdict(list)
for pred_entry in _submission:
    qid = pred_entry["qid"]
    # Slicing with a None bound keeps every window.
    for window in pred_entry["pred_relevant_windows"][:max_pred_windows]:
        pred_qid2data[qid].append({
            "video-id": qid,  # in order to use the API
            "t-start": window[0],
            "t-end": window[1],
            "score": window[2],
        })

# Group ground-truth windows by query id in the same record format (no score).
gt_qid2data = defaultdict(list)
for gt_entry in _ground_truth:
    qid = gt_entry["qid"]
    for window in gt_entry["relevant_windows"][:max_gt_windows]:
        gt_qid2data[qid].append({
            "video-id": qid,
            "t-start": window[0],
            "t-end": window[1],
        })

# One [qid, ground-truth windows, predicted windows] triple per predicted query.
# Indexing the defaultdict yields an empty list for queries without ground truth.
data_triples = [
    [pred_qid, gt_qid2data[pred_qid], pred_windows]
    for pred_qid, pred_windows in pred_qid2data.items()
]

compute_ap_from_triple = partial(
    compute_average_precision_detection_wrapper, tiou_thresholds=iou_thds)

# Each call returns a (qid, scores) pair; collect them into a qid -> scores mapping.
qid2ap_list = {
    ap_qid: ap_scores
    for ap_qid, ap_scores in map(compute_ap_from_triple, data_triples)
}

ap_array = np.array(list(qid2ap_list.values()))  # (#queries, #thd)
ap_thds = ap_array.mean(axis=0)  # mAP at different IoU thresholds.

# Threshold (as string) -> mean AP, plus the average over all thresholds.
iou_thd2ap = {str(thd): ap for thd, ap in zip(iou_thds, ap_thds)}
iou_thd2ap["average"] = np.mean(ap_thds)
# Formatting: express as percentages rounded to two decimals.
iou_thd2ap = {key: float(f"{100 * value:.2f}") for key, value in iou_thd2ap.items()}


In [12]:
# Display the mapping from IoU threshold (and "average") to mAP in percent.
iou_thd2ap

{'0.5': 60.07,
 '0.55': 54.97,
 '0.6': 51.69,
 '0.65': 47.31,
 '0.7': 43.09,
 '0.75': 38.38,
 '0.8': 33.11,
 '0.85': 26.82,
 '0.9': 19.99,
 '0.95': 10.99,
 'average': 38.64}

In [13]:
# Display the per-query scores: one array per qid, one entry per IoU threshold.
qid2ap_list

{2579: array([1., 1., 1., 1., 0., 0., 0., 0., 0., 0.]),
 5071: array([1., 1., 0., 0., 0., 0., 0., 0., 0., 0.]),
 5342: array([1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 1.  , 0.25]),
 8636: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 8749: array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0.]),
 2638: array([0.33333333, 0.33333333, 0.33333333, 0.33333333, 0.33333333,
        0.        , 0.        , 0.        , 0.        , 0.        ]),
 5979: array([1., 1., 1., 1., 1., 1., 1., 1., 0., 0.]),
 6223: array([1. , 1. , 1. , 1. , 1. , 0.5, 0.5, 0.5, 0. , 0. ]),
 8737: array([0.25, 0.25, 0.25, 0.25, 0.1 , 0.  , 0.  , 0.  , 0.  , 0.  ]),
 8783: array([0.1, 0.1, 0.1, 0. , 0. , 0. , 0. , 0. , 0. , 0. ]),
 781: array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 3359: array([0.08333333, 0.08333333, 0.08333333, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ]),
 3503: array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 4625: array([1., 1., 1., 1., 1., 1.,