In [27]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
import json
from scipy.stats import mode
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

%matplotlib inline

In [28]:
# Load every evaluation CSV of the ablation and annotate each frame with the
# hyper-parameters encoded in its file name. The second-to-last ':'-separated
# field is expected to look like 'pt-0.25-spw-4-nl-8-nh8-hd32-run-2'.
result_files = glob.glob('../results/dvlog-baseline-model-size-ablation/*.csv')

frames = []
for file in result_files:
    df = pd.read_csv(file)

    filename = file.split('/')[-1]
    # Tokenise the run configuration once instead of re-splitting the path
    # for every derived column.
    config = file.split(':')[-2].split('-')

    df['seconds_per_window'] = int(config[3])
    df['presence_threshold'] = float(config[1])
    df['num_layers'] = int(config[5])
    # Drop the 'nh' / 'hd' prefixes rather than keeping a fixed number of
    # trailing characters, so multi-digit values such as 'nh16' or 'hd128'
    # are parsed correctly (the old slicing would have read them as 6 / 28).
    df['num_heads'] = int(config[6][len('nh'):])
    df['head_dim'] = int(config[7][len('hd'):])
    df['run_id'] = int(config[9])
    df['filename'] = filename
    df['evaluator'] = filename.split(':')[0]

    frames.append(df)

# One combined table; the per-file row index is kept as-is.
dfs = pd.concat(frames)


In [29]:
# Preview the first five rows of the combined results table.
dfs.head(n=5)

Unnamed: 0,f1,recall,precision,auc,accuracy,name,dataset,dataset_kind,model,prediction_kind,...,num_heads,head_dim,run_id,filename,evaluator,modalities,model_args.num_layers,model_args.self_attn_num_heads,model_args.self_attn_dim_head,f1_weighted
0,0.763636,0.857143,0.688525,0.775964,0.723404,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,validation,baseline,mean over 10 runs,...,4,64,3,majority-evaluator:dvlog-baseline-model-size-a...,majority-evaluator,,,,,
0,0.672199,0.786408,0.586957,0.612441,0.582011,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,last,...,4,64,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,,,,,
1,0.744395,0.805825,0.691667,0.774554,0.698413,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,...,4,64,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,,,,,
0,0.561576,0.553398,0.57,0.539851,0.529101,dvlog-baseline-model-size-ablation:pt-0.75-spw...,d-vlog,test,baseline,last,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,"['audio_embeddings', 'face_embeddings']",8.0,8.0,32.0,
1,0.772727,0.825243,0.726496,0.797358,0.73545,dvlog-baseline-model-size-ablation:pt-0.75-spw...,d-vlog,test,baseline,mean,...,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,"['audio_embeddings', 'face_embeddings']",8.0,8.0,32.0,


In [30]:
# Keep only the rows whose 'evaluator' column is 'temporal-evaluator'.
is_temporal = dfs['evaluator'] == 'temporal-evaluator'
temporal_eval = dfs[is_temporal]

In [31]:
# Rows with prediction_kind 'mean', 8 attention heads and the test split,
# ordered by window length, presence threshold and run id.
# Built as a single mask followed by a non-in-place sort: sorting a filtered
# slice with inplace=True can raise SettingWithCopyWarning and makes the cell
# harder to re-run safely.
mean_eval = dfs[
    (dfs['prediction_kind'] == 'mean')
    & (dfs['num_heads'] == 8)
    & (dfs['dataset_kind'] == 'test')
].sort_values(by=['seconds_per_window', 'presence_threshold', 'run_id'])

In [32]:
# Aggregate over runs for every configuration: number of runs plus the
# mean and standard deviation of each metric.
grouped_mean = (
    mean_eval
    .groupby(['seconds_per_window', 'presence_threshold', 'num_layers', 'num_heads', 'head_dim'])
    .agg({
        'run_id': 'count',
        **{metric: ['mean', 'std'] for metric in ('f1', 'precision', 'recall', 'accuracy')},
    })
    .reset_index()
)
grouped_mean

Unnamed: 0_level_0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_id,f1,f1,precision,precision,recall,recall,accuracy,accuracy
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,count,mean,std,mean,std,mean,std,mean,std
0,1,0.25,8,8,32,3,0.712833,0.066233,0.718564,0.047708,0.728155,0.163326,0.689594,0.026631
1,1,0.5,8,8,32,3,0.708243,0.05187,0.705925,0.00409,0.71521,0.099169,0.68254,0.033042
2,1,0.75,8,8,32,3,0.748737,0.011808,0.706449,0.05731,0.802589,0.053472,0.705467,0.038276
3,2,0.25,8,8,32,3,0.745678,0.018477,0.702861,0.035662,0.799353,0.072869,0.703704,0.019077
4,2,0.5,8,8,32,3,0.695957,0.064588,0.714293,0.058413,0.699029,0.160415,0.675485,0.035229
5,2,0.75,8,8,32,3,0.752484,0.007379,0.667034,0.025983,0.867314,0.066086,0.689594,0.011014
6,3,0.25,8,8,32,3,0.720157,0.04819,0.708506,0.040266,0.744337,0.123318,0.689594,0.0261
7,3,0.5,8,8,32,3,0.751901,0.021258,0.670197,0.026935,0.860841,0.073513,0.691358,0.021383
8,3,0.75,8,8,32,3,0.751051,0.025666,0.70355,0.023964,0.805825,0.035005,0.708995,0.029459
9,4,0.25,8,8,32,3,0.746082,0.04261,0.695707,0.035194,0.805825,0.067961,0.70194,0.046227


In [33]:
# Collapse the two-level column header into flat names ('f1' + 'mean' -> 'f1mean'),
# then show the configurations ranked by mean F1, best first.
grouped_mean.columns = grouped_mean.columns.map(lambda levels: ''.join(levels))
grouped_mean.sort_values(by='f1mean', ascending=False)

Unnamed: 0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_idcount,f1mean,f1std,precisionmean,precisionstd,recallmean,recallstd,accuracymean,accuracystd
25,9,0.5,8,8,32,3,0.764859,0.012449,0.703505,0.040423,0.84466,0.077061,0.717813,0.013315
26,9,0.75,8,8,32,3,0.764474,0.019809,0.741106,0.027464,0.79085,0.040817,0.735816,0.021497
23,8,0.75,8,8,32,3,0.76356,0.018412,0.718521,0.022514,0.81877,0.073513,0.724868,0.005291
11,4,0.75,8,8,32,3,0.76266,0.012336,0.716906,0.027892,0.815534,0.016816,0.723104,0.021383
18,7,0.25,8,8,32,3,0.760204,0.021113,0.728573,0.024657,0.796117,0.044491,0.726631,0.021383
22,8,0.5,8,8,32,3,0.759512,0.012188,0.762549,0.008881,0.757282,0.033632,0.738977,0.00611
5,2,0.75,8,8,32,3,0.752484,0.007379,0.667034,0.025983,0.867314,0.066086,0.689594,0.011014
27,10,0.25,8,8,32,3,0.752165,0.026042,0.705985,0.050541,0.815534,0.100896,0.708995,0.024246
7,3,0.5,8,8,32,3,0.751901,0.021258,0.670197,0.026935,0.860841,0.073513,0.691358,0.021383
17,6,0.75,8,8,32,3,0.751367,0.028589,0.731831,0.031999,0.776699,0.079469,0.72134,0.021383


In [34]:
def compute_metrics(labels, predictions):
    """Score predicted labels against the ground-truth labels.

    Args:
        labels: ground-truth labels.
        predictions: predicted labels, passed to the metric functions
            together with ``labels``.

    Returns:
        Tuple ``(f1, f1_weighted, precision, recall, accuracy)``.
        ``f1_weighted`` is ``f1_score`` called with ``average='weighted'``;
        every other metric is called with its default arguments.
    """
    return (
        f1_score(labels, predictions),
        f1_score(labels, predictions, average='weighted'),
        precision_score(labels, predictions),
        recall_score(labels, predictions),
        accuracy_score(labels, predictions),
    )

In [35]:
# Re-score every temporal-evaluator run with a per-key majority vote ("mode")
# over its rounded per-window predictions, and collect one result row per run.
json_preds_files = glob.glob('../results/dvlog-baseline-model-size-ablation/temporal-evaluator*over-time:test.json')

json_preds = []

for file in json_preds_files:

    with open(file) as f:
        preds_over_time = json.load(f)

    preds = {}
    for key, entry in preds_over_time.items():
        # Keys containing 'pred' are skipped, as in the original analysis.
        if 'pred' in key:
            continue

        window_votes = np.round(entry['preds'])
        # scipy.stats.mode returns a length-1 array on older SciPy releases and
        # a plain scalar on SciPy >= 1.11; normalising through ravel() works on
        # both, whereas the previous `[0][0]` indexing fails on the scalar form.
        majority_vote = np.asarray(mode(window_votes)[0]).ravel()[0]

        preds[key] = {
            'mode_preds': int(majority_vote),
            'true_label': int(entry['true_label']),
        }

    # Sort the keys so labels and predictions are aligned in a stable order.
    sorted_keys = sorted(preds.keys())

    y_preds_mode = np.array([preds[key]['mode_preds'] for key in sorted_keys])
    true_labels = np.array([preds[key]['true_label'] for key in sorted_keys])

    # Metrics are computed once; the earlier duplicate "*_threshold" call used
    # the exact same inputs and its results were never read.
    f1, f1_weighted, precision, recall, accuracy = compute_metrics(true_labels, y_preds_mode)

    filename = file.split('/')[-1]
    # Third-from-last ':'-separated field holds the run configuration, expected
    # to look like 'pt-0.25-spw-4-nl-8-nh8-hd32-run-2'.
    config = file.split(':')[-3].split('-')

    results = {
        'seconds_per_window': [int(config[3])],
        'presence_threshold': [float(config[1])],
        'num_layers': [int(config[5])],
        # Strip the 'nh' / 'hd' prefixes so multi-digit values parse correctly.
        'num_heads': [int(config[6][len('nh'):])],
        'head_dim': [int(config[7][len('hd'):])],
        'run_id': [int(config[9])],
        'filename': [filename],
        'evaluator': [filename.split(':')[0]],
        'f1': [f1],
        'f1_weighted': [f1_weighted],
        'precision': [precision],
        'recall': [recall],
        'accuracy': [accuracy],
        'prediction_kind': ['mode'],
    }

    results = pd.DataFrame.from_dict(results)
    json_preds.append(results)

json_all_preds = pd.concat(json_preds)
json_all_preds

Unnamed: 0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_id,filename,evaluator,f1,f1_weighted,precision,recall,accuracy,prediction_kind
0,8,0.50,8,8,32,1,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.758294,0.729316,0.740741,0.776699,0.730159,mode
0,9,0.25,8,4,64,3,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.711712,0.656213,0.663866,0.766990,0.661376,mode
0,9,0.25,8,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.713615,0.675706,0.690909,0.737864,0.677249,mode
0,6,0.75,8,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.729064,0.709339,0.740000,0.718447,0.708995,mode
0,3,0.25,8,8,32,3,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.763636,0.721433,0.717949,0.815534,0.724868,mode
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2,0.75,8,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.757991,0.716436,0.715517,0.805825,0.719577,mode
0,2,0.25,8,8,32,1,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.751174,0.718236,0.727273,0.776699,0.719577,mode
0,9,0.50,8,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.776371,0.707090,0.686567,0.893204,0.719577,mode
0,3,0.50,8,8,32,1,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.773913,0.716914,0.700787,0.864078,0.724868,mode


In [36]:
# Show the individual runs for 8-second windows at a 0.50 presence threshold.
is_spw_8 = json_all_preds['seconds_per_window'] == 8
is_pt_050 = json_all_preds['presence_threshold'] == 0.50
json_all_preds[is_spw_8 & is_pt_050]

Unnamed: 0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_id,filename,evaluator,f1,f1_weighted,precision,recall,accuracy,prediction_kind
0,8,0.5,8,8,32,1,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.758294,0.729316,0.740741,0.776699,0.730159,mode
0,8,0.5,8,8,32,3,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.752475,0.735837,0.767677,0.737864,0.73545,mode
0,8,0.5,8,8,32,2,temporal-evaluator:dvlog-baseline-model-size-a...,temporal-evaluator,0.730964,0.720143,0.765957,0.699029,0.719577,mode


In [37]:
# Load the temporal-evaluator test CSVs into one table and display it
# ranked by F1, best first.
csv_preds_files = glob.glob('../results/temporal-dvlog-baseline-model-size-ablation/temporal-evaluator*test.csv')
csv_preds = pd.concat(map(pd.read_csv, csv_preds_files))
csv_preds.sort_values(by='f1', ascending=False)
# Debug helper (disabled): pprint(csv_preds['name'].values.tolist()) lists the run names.

['dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.25-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.50-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.50-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.50-spw-4-nl-8-nh8-hd32-run-2',
 'dvlog-baseline-model-size-ablation:pt-0.50-spw-4-

In [38]:
# Show every prediction kind for 4-second windows at a 0.50 presence threshold.
is_spw_4 = csv_preds['seconds_per_window'] == 4
is_pt_050 = csv_preds['presence_threshold'] == 0.50
csv_preds[is_spw_4 & is_pt_050]

Unnamed: 0,name,run_id,f1,recall,precision,auc,accuracy,f1_weighted,dataset,dataset_kind,model,seconds_per_window,presence_threshold,modalities,model_args.num_layers,model_args.self_attn_num_heads,model_args.self_attn_dim_head,prediction_kind
0,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.633484,0.679612,0.59322,0.548205,0.571429,0.5655,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,last
1,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.786325,0.893204,0.70229,0.797923,0.73545,0.725557,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mean
2,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.777778,0.883495,0.694656,0.797923,0.724868,0.71458,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode
3,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.7897,0.893204,0.707692,0.805261,0.740741,0.731624,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,threshold
4,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.787879,0.883495,0.710938,0.805261,0.740741,0.732724,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode_threshold
5,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.695652,0.854369,0.586667,0.597539,0.592593,0.553842,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,last_presence
6,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.774468,0.883495,0.689394,0.791149,0.719577,0.708445,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mean_presence
7,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.777778,0.883495,0.694656,0.791149,0.724868,0.71458,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode_presence
8,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.774468,0.883495,0.689394,0.800406,0.719577,0.708445,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,threshold_presence
9,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.791304,0.883495,0.716535,0.800406,0.746032,0.73869,d-vlog,test,baseline,4,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode_threshold_presence


In [46]:
# Aggregate over runs per (window length, presence threshold, prediction kind),
# flatten the column header, keep only the 'mode' and 'mode_presence'
# prediction kinds, and rank by mean F1, best first.
grouped = (
    csv_preds
    .groupby(['seconds_per_window', 'presence_threshold', 'prediction_kind'])
    .agg({
        'run_id': 'count',
        **{metric: ['mean', 'std'] for metric in ('f1', 'precision', 'recall', 'accuracy')},
        'prediction_kind': 'first',
    })
    .reset_index()
)
grouped.columns = grouped.columns.map(lambda levels: ''.join(levels))
grouped = grouped[grouped['prediction_kindfirst'].isin(['mode', 'mode_presence'])]
grouped.sort_values(by='f1mean', ascending=False)

Unnamed: 0,seconds_per_window,presence_threshold,prediction_kind,run_idcount,f1mean,f1std,precisionmean,precisionstd,recallmean,recallstd,accuracymean,accuracystd,prediction_kindfirst
165,9,0.5,mode_presence,3,0.765645,0.013856,0.708246,0.030292,0.838188,0.071564,0.72134,0.003055,mode_presence
164,9,0.5,mode,3,0.761303,0.009793,0.701437,0.03703,0.838188,0.071564,0.714286,0.010582,mode
125,7,0.25,mode_presence,3,0.759893,0.01825,0.733125,0.017503,0.789644,0.039237,0.728395,0.017008,mode_presence
124,7,0.25,mode,3,0.758738,0.019996,0.730969,0.02042,0.789644,0.039237,0.726631,0.020031,mode
105,6,0.25,mode_presence,3,0.753546,0.023717,0.714175,0.008498,0.799353,0.059321,0.716049,0.016164,mode_presence
55,3,0.5,mode_presence,3,0.752622,0.018359,0.678205,0.024426,0.847896,0.053472,0.696649,0.021383,mode_presence
54,3,0.5,mode,3,0.751972,0.020605,0.679076,0.026182,0.84466,0.051374,0.696649,0.025004,mode
104,6,0.25,mode,3,0.75153,0.026795,0.713281,0.008824,0.796117,0.063664,0.714286,0.019077,mode
64,4,0.25,mode,3,0.751479,0.047803,0.702225,0.040569,0.809061,0.066086,0.708995,0.053698,mode
65,4,0.25,mode_presence,3,0.749083,0.041175,0.69839,0.032943,0.809061,0.066086,0.705467,0.044056,mode_presence


In [40]:
# Aggregate the majority-vote ("mode") scores over runs for every
# configuration, flatten the column header, and rank by mean F1, best first.
grouped_mode = (
    json_all_preds
    .groupby(['seconds_per_window', 'presence_threshold', 'num_layers', 'num_heads', 'head_dim'])
    .agg({
        'run_id': 'count',
        **{metric: ['mean', 'std'] for metric in ('f1', 'precision', 'recall', 'accuracy')},
    })
    .reset_index()
)
grouped_mode.columns = grouped_mode.columns.map(lambda levels: ''.join(levels))
grouped_mode.sort_values(by='f1mean', ascending=False)

Unnamed: 0,seconds_per_window,presence_threshold,num_layers,num_heads,head_dim,run_idcount,f1mean,f1std,precisionmean,precisionstd,recallmean,recallstd,accuracymean,accuracystd
27,9,0.5,8,8,32,3,0.76885,0.008508,0.71142,0.035742,0.841424,0.066086,0.724868,0.009164
28,9,0.75,8,8,32,3,0.764796,0.009805,0.750384,0.026674,0.781046,0.029951,0.739362,0.014073
19,7,0.25,8,8,32,3,0.758738,0.019996,0.730969,0.02042,0.789644,0.039237,0.726631,0.020031
11,4,0.75,8,8,32,3,0.757638,0.006551,0.721036,0.021672,0.799353,0.029661,0.72134,0.011014
24,8,0.75,8,8,32,3,0.754714,0.009553,0.713919,0.022892,0.802589,0.045882,0.716049,0.008082
5,2,0.75,8,8,32,3,0.754627,0.00589,0.680319,0.033731,0.851133,0.055206,0.698413,0.019077
7,3,0.5,8,8,32,3,0.751972,0.020605,0.679076,0.026182,0.84466,0.051374,0.696649,0.025004
18,6,0.75,8,8,32,3,0.751697,0.030229,0.739057,0.024314,0.770227,0.089685,0.724868,0.015873
16,6,0.25,8,8,32,3,0.75153,0.026795,0.713281,0.008824,0.796117,0.063664,0.714286,0.019077
9,4,0.25,8,8,32,3,0.751479,0.047803,0.702225,0.040569,0.809061,0.066086,0.708995,0.053698
