In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
import json
from scipy.stats import mode
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

%matplotlib inline

In [2]:
result_files = glob.glob('../results/dvlog-baseline-model-size-ablation/*.csv')

dfs = []
for file in result_files:
    df = pd.read_csv(file)

    # get seconds_per_window
    df['seconds_per_window'] = int(file.split(':')[-2].split('-')[3])
    df['presence_threshold'] = float(file.split(':')[-2].split('-')[1])
    df['num_layers'] = int(file.split(':')[-2].split('-')[5])
    df['num_heads'] = int(file.split(':')[-2].split('-')[6][-1:])
    df['head_dim'] = int(file.split(':')[-2].split('-')[7][-2:])
    df['run_id'] = int(file.split(':')[-2].split('-')[9])
    df['filename'] = file.split('/')[-1]
    df['evaluator'] = file.split('/')[-1].split(':')[0]

    dfs.append(df)

dfs = pd.concat(dfs)


In [3]:
dfs.head()

Unnamed: 0,f1,recall,precision,auc,accuracy,name,dataset,dataset_kind,model,prediction_kind,...,num_heads,head_dim,run_id,filename,evaluator,modalities,model_args.num_layers,model_args.self_attn_num_heads,model_args.self_attn_dim_head,f1_weighted
0,0.590476,0.601942,0.579439,0.560962,0.544974,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,last,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,,,,,
1,0.743119,0.786408,0.704348,0.78494,0.703704,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,,,,,
0,0.664,0.805825,0.564626,0.590088,0.555556,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,last,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,"['audio_embeddings', 'face_embeddings']",8.0,8.0,32.0,
1,0.733624,0.815534,0.666667,0.762362,0.677249,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,"['audio_embeddings', 'face_embeddings']",8.0,8.0,32.0,
2,0.733624,0.815534,0.666667,0.663581,0.677249,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,threshold,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,"['audio_embeddings', 'face_embeddings']",8.0,8.0,32.0,


In [4]:
temporal_eval = dfs[dfs['evaluator'] == 'temporal-evaluator']

In [5]:
mean_eval = dfs[dfs['prediction_kind'] == 'mean']
mean_eval = mean_eval[mean_eval['num_heads'] == 8]
mean_eval = mean_eval[mean_eval['dataset_kind'] == 'test']
mean_eval.sort_values(by=['seconds_per_window', 'presence_threshold', 'run_id'], inplace=True)

In [6]:
grouped_mean = mean_eval.groupby(['seconds_per_window', 'presence_threshold', 'num_layers', 'num_heads', 'head_dim']).agg(
    {'run_id': 'count', 'f1': ['mean', 'std'], 'precision': ['mean', 'std'], 'recall': ['mean', 'std'], 'accuracy': ['mean', 'std']}).reset_index()
grouped_mean

Unnamed: 0_level_0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_id,f1,f1,precision,precision,recall,recall,accuracy,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,mean,std,mean,std,mean,std,mean,std
0,1,0.25,8,8,32,3,0.712833,0.066233,0.718564,0.047708,0.728155,0.163326,0.689594,0.026631
1,1,0.5,8,8,32,3,0.708243,0.05187,0.705925,0.00409,0.71521,0.099169,0.68254,0.033042
2,1,0.75,8,8,32,3,0.748737,0.011808,0.706449,0.05731,0.802589,0.053472,0.705467,0.038276
3,2,0.25,8,8,32,3,0.745678,0.018477,0.702861,0.035662,0.799353,0.072869,0.703704,0.019077
4,2,0.5,8,8,32,3,0.695957,0.064588,0.714293,0.058413,0.699029,0.160415,0.675485,0.035229
5,2,0.75,8,8,32,3,0.752484,0.007379,0.667034,0.025983,0.867314,0.066086,0.689594,0.011014
6,3,0.25,8,8,32,3,0.720157,0.04819,0.708506,0.040266,0.744337,0.123318,0.689594,0.0261
7,3,0.5,8,8,32,3,0.751901,0.021258,0.670197,0.026935,0.860841,0.073513,0.691358,0.021383
8,3,0.75,8,8,32,3,0.751051,0.025666,0.70355,0.023964,0.805825,0.035005,0.708995,0.029459
9,4,0.25,8,8,32,3,0.746082,0.04261,0.695707,0.035194,0.805825,0.067961,0.70194,0.046227


In [7]:
grouped_mean.columns = grouped_mean.columns.map(''.join)
grouped_mean.sort_values(by=['f1mean'], ascending=False)

Unnamed: 0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_idcount,f1mean,f1std,precisionmean,precisionstd,recallmean,recallstd,accuracymean,accuracystd
25,9,0.5,8,8,32,3,0.764859,0.012449,0.703505,0.040423,0.84466,0.077061,0.717813,0.013315
26,9,0.75,8,8,32,3,0.764474,0.019809,0.741106,0.027464,0.79085,0.040817,0.735816,0.021497
23,8,0.75,8,8,32,3,0.76356,0.018412,0.718521,0.022514,0.81877,0.073513,0.724868,0.005291
11,4,0.75,8,8,32,3,0.76266,0.012336,0.716906,0.027892,0.815534,0.016816,0.723104,0.021383
18,7,0.25,8,8,32,3,0.760204,0.021113,0.728573,0.024657,0.796117,0.044491,0.726631,0.021383
22,8,0.5,8,8,32,3,0.759512,0.012188,0.762549,0.008881,0.757282,0.033632,0.738977,0.00611
5,2,0.75,8,8,32,3,0.752484,0.007379,0.667034,0.025983,0.867314,0.066086,0.689594,0.011014
27,10,0.25,8,8,32,3,0.752165,0.026042,0.705985,0.050541,0.815534,0.100896,0.708995,0.024246
7,3,0.5,8,8,32,3,0.751901,0.021258,0.670197,0.026935,0.860841,0.073513,0.691358,0.021383
17,6,0.75,8,8,32,3,0.751367,0.028589,0.731831,0.031999,0.776699,0.079469,0.72134,0.021383


In [8]:
def compute_metrics(labels, predictions):
    f1 = f1_score(labels, predictions)
    f1_weighted = f1_score(labels, predictions, average='weighted')
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    accuracy = accuracy_score(labels, predictions)

    return f1, f1_weighted, precision, recall, accuracy

In [21]:
json_preds_files = glob.glob('../results/dvlog-baseline-model-size-ablation/*over-time:test.json')

json_preds = []

for file in json_preds_files:
    print(file)
    preds = defaultdict(lambda: {'mode_preds': None, 'mode_preds_threshold': None, 'true_label': None})

    with open(file) as f:
        preds_over_time = json.load(f)
    
    for key in preds_over_time.keys():
        if 'pred' in key:
            continue

        preds[key]['mode_preds'] = int(mode(np.round(preds_over_time[key]['preds']))[0][0])

        if 'preds_threshold' in preds_over_time[key].keys() and len(preds_over_time[key]['preds_threshold']) > 0:
            preds[key]['mode_preds_threshold'] = int(mode(np.round(preds_over_time[key]['preds_threshold']))[0][0])

        preds[key]['true_label'] = int(preds_over_time[key]['true_label'])

    sorted_keys = sorted(preds.keys())

    y_preds_mode = np.array([preds[key]['mode_preds'] for key in sorted_keys])
    true_labels = np.array([preds[key]['true_label'] for key in sorted_keys])

    print(y_preds_mode[:10])
    print(true_labels[:10])
    y_preds_mode_threshold = np.array([preds[key]['mode_preds_threshold'] for key in sorted_keys])
    print(y_preds_mode_threshold[:10])

    if y_preds_mode_threshold[0] == None:
        f1_threshold, f1_weighted_threshold, precision_threshold, recall_threshold, accuracy_threshold = 0, 0, 0, 0, 0
    
    else:
        f1_threshold, f1_weighted_threshold, precision_threshold, recall_threshold, accuracy_threshold = compute_metrics(true_labels, y_preds_mode_threshold)

    true_labels = np.array([preds[key]['true_label'] for key in sorted_keys])

    f1, f1_weighted, precision, recall, accuracy = compute_metrics(true_labels, y_preds_mode)
    
    results = {
        'seconds_per_window': [int(file.split(':')[-3].split('-')[3])] * 2,
        'presence_threshold': [float(file.split(':')[-3].split('-')[1])] * 2,
        'num_layers': [int(file.split(':')[-3].split('-')[5])] * 2,
        'num_heads': [int(file.split(':')[-3].split('-')[6][-1:])] * 2,
        'head_dim': [int(file.split(':')[-3].split('-')[7][-2:])] * 2,
        'run_id': [int(file.split(':')[-3].split('-')[9])] * 2,
        'filename': [file.split('/')[-1]] * 2,
        'evaluator': [file.split('/')[-1].split(':')[0]] * 2,
        'f1': [f1, f1_threshold],
        'f1_weighted': [f1_weighted, f1_weighted_threshold],
        'precision': [precision, precision_threshold],
        'recall': [recall, recall_threshold],
        'accuracy': [accuracy, accuracy_threshold],
        'prediction_kind': ['mode', 'mode_threshold'],

    }

    results = pd.DataFrame.from_dict(results)
    json_preds.append(results)

json_all_preds = pd.concat(json_preds)
json_all_preds

../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.50-spw-8-nl-8-nh8-hd32-run-2:over-time:test.json
[0 1 1 0 1 0 0 0 1 0]
[1 1 1 0 0 0 1 1 1 1]
[0 1 1 0 1 0 0 0 1 0]
../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.75-spw-6-nl-8-nh8-hd32-run-1:over-time:test.json
[0 1 1 0 1 0 1 1 1 1]
[1 1 1 0 0 0 1 1 1 1]
[0 1 1 0 1 0 1 1 1 1]
../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.50-spw-5-nl-8-nh8-hd32-run-1:over-time:test.json
[0 1 1 0 1 0 0 1 1 0]
[1 1 1 0 0 0 1 1 1 1]
[0 1 1 0 1 0 0 1 1 0]
../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.25-spw-8-nl-8-nh8-hd32-run-2:over-time:test.json
[0 1 1 0 0 0 1 1 1 0]
[1 1 1 0 0 0 1 1 1 1]
[0 1 1 0 0 0 1 1 1 0]
../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.25-spw-9-nl-8-n

ValueError: Classification metrics can't handle a mix of binary and unknown targets