In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
import json
from scipy.stats import mode
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

%matplotlib inline

In [53]:
from collections import defaultdict

def manipulate_predictions(df, source_dir, min_threshold = 0.35, max_threshold = 0.75):
    new_df = defaultdict(list)
    
    for i, row in df.iterrows():
        name = f"temporal-evaluator:{row['name']}:over-time:test.json"
        with open(f'../results/{source_dir}/{name}', 'rt') as f:
            data = json.load(f)

        preds = []
        true_labels = []
        for key, value in data.items():
            if key == 'preds_mean':
                continue
                
            if key == 'preds_threshold_mean':
                continue

            the_preds = np.array(value['preds'])

            positive_predictions = np.ceil(the_preds[the_preds > max_threshold]).astype(np.int32)
            negative_predictions = np.floor(the_preds[the_preds < min_threshold]).astype(np.int32)

            tpreds = np.concatenate([positive_predictions, negative_predictions])

            if not len(tpreds):
                the_mode = the_preds.mean() > 0.5 # if no preds, use mean
            else:
                the_mode = mode(tpreds, keepdims = True)[0][0]

            # if the_mode != value['true_label']:
            #     print(key, the_mode, value['true_label'], the_preds.mean())

            # get mode of preds
            preds.append(the_mode)
            true_labels.append(value['true_label'])

        f1 = f1_score(true_labels, preds)
        precision = precision_score(true_labels, preds)
        recall = recall_score(true_labels, preds)
        accuracy = accuracy_score(true_labels, preds)

        new_df['f1'].append(f1)
        new_df['recall'].append(recall)
        new_df['precision'].append(precision)
        new_df['accuracy'].append(accuracy)
        new_df['seconds_per_window'].append(row['seconds_per_window'])
        if 'modality' in row:
            new_df['modality'].append(row['modality'])
        # new_df['num_layers'].append(row['model_args.num_layers'])
        # new_df['num_heads'].append(row['model_args.num_heads'])
        # new_df['head_dim'].append(row['model_args.head_dim'])
        new_df['run_id'].append(row['run_id'])

    new_df = pd.DataFrame(new_df)
    return new_df


In [19]:
result_files = glob.glob('../results/temporal-dvlog-baseline-model-size-ablation/*:test.csv')


dfs = []
for file in result_files:
    df = pd.read_csv(file)
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).reset_index(drop = True)

In [20]:
grouped = df.groupby(['seconds_per_window', 'presence_threshold', 'prediction_kind']).agg({'f1': ['mean', 'std'], 'precision': ['mean', 'std'], 'recall': ['mean', 'std'], 'accuracy': ['mean', 'std'], 'run_id': 'count'})

grouped.sort_values(by=[('f1', 'mean')], ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,f1,f1,precision,precision,recall,recall,accuracy,accuracy,run_id
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,std,mean,std,mean,std,mean,std,count
seconds_per_window,presence_threshold,prediction_kind,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
9,0.50,mode_presence,0.765645,0.013856,0.708246,0.030292,0.838188,0.071564,0.721340,0.003055,3
9,0.50,mode_threshold_presence,0.764425,0.018534,0.702856,0.032633,0.844660,0.086293,0.717813,0.003055,3
9,0.50,mean,0.764272,0.013509,0.700311,0.036950,0.847896,0.080841,0.716049,0.008082,3
9,0.50,mean_presence,0.762252,0.018065,0.699412,0.035554,0.844660,0.086293,0.714286,0.009164,3
9,0.50,threshold,0.761994,0.015237,0.696267,0.034280,0.847896,0.080841,0.712522,0.008082,3
...,...,...,...,...,...,...,...,...,...,...,...
4,0.25,last,0.605862,0.040389,0.561567,0.020364,0.660194,0.077670,0.534392,0.027997,3
2,0.50,last,0.599706,0.055829,0.581316,0.031268,0.624595,0.102900,0.550265,0.037037,3
5,0.25,last,0.597476,0.076000,0.616596,0.011916,0.592233,0.154121,0.576720,0.032184,3
8,0.25,last,0.597419,0.065762,0.568467,0.031603,0.653722,0.179634,0.534392,0.013999,3


In [21]:
df_new = df[(df['prediction_kind'] == 'mode') & (df['presence_threshold'] == 0.5) & (df['seconds_per_window'] == 9.0)]
df_new

Unnamed: 0,name,run_id,f1,recall,precision,auc,accuracy,f1_weighted,dataset,dataset_kind,model,seconds_per_window,presence_threshold,modalities,model_args.num_layers,model_args.self_attn_num_heads,model_args.self_attn_dim_head,prediction_kind
192,dvlog-baseline-model-size-ablation:pt-0.50-spw...,2,0.766667,0.893204,0.671533,0.802326,0.703704,0.688191,d-vlog,test,baseline,9,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode
202,dvlog-baseline-model-size-ablation:pt-0.50-spw...,3,0.767241,0.864078,0.689922,0.800858,0.714286,0.704855,d-vlog,test,baseline,9,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode
562,dvlog-baseline-model-size-ablation:pt-0.50-spw...,1,0.75,0.757282,0.742857,0.807519,0.724868,0.724572,d-vlog,test,baseline,9,0.5,"['audio_embeddings', 'face_embeddings']",8,8,32,mode


In [31]:
# for min_threshold in np.arange(0.1, 0.5, 0.05):
#     for max_threshold in np.arange(0.5, 0.9, 0.05):
#         manipulated_df = manipulate_predictions(df_new, 'temporal-dvlog-baseline-model-size-ablation', min_threshold, max_threshold)
#         print(min_threshold, max_threshold, manipulated_df['f1'].mean())

manipulated_df = manipulate_predictions(df_new, 'temporal-dvlog-baseline-model-size-ablation', min_threshold = 0.2, max_threshold = 0.8)

manipulated_df.mean(), manipulated_df.std()


(f1                    0.773266
 recall                0.857605
 precision             0.709112
 accuracy              0.726631
 seconds_per_window    9.000000
 run_id                2.000000
 dtype: float64,
 f1                    0.009237
 recall                0.078475
 precision             0.044025
 accuracy              0.013315
 seconds_per_window    0.000000
 run_id                1.000000
 dtype: float64)

In [45]:
result_files = glob.glob('../results/dvlog-baseline-ablation-modalities-final/*:test.csv')

dfs = []
for file in result_files:
    df = pd.read_csv(file)

    df['presence_threshold'] = 0.25
    df['seconds_per_window'] = 6
    modality = file.split('/')[-1].split(':')[2]

    if 'av-lm-eyes' in modality:
        df['modality'] = 'av-lm-eyes'
    elif 'av-lm' in modality:
        df['modality'] = 'av-lm'
    elif 'av-eyes' in modality:
        df['modality'] = 'av-eyes'
    else:
        print(file)
        continue

    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).reset_index(drop = True)

df_new = df[(df['prediction_kind'] == 'mean')]

../results/dvlog-baseline-ablation-modalities-final/majority-evaluator:dvlog-baseline-ablation-modalities-final:lm-eyes-run-2:test.csv
../results/dvlog-baseline-ablation-modalities-final/majority-evaluator:dvlog-baseline-ablation-modalities-final:lm-eyes-run-3:test.csv
../results/dvlog-baseline-ablation-modalities-final/temporal-evaluator:dvlog-baseline-ablation-modalities-final:lm-eyes-run-1:test.csv
../results/dvlog-baseline-ablation-modalities-final/majority-evaluator:dvlog-baseline-ablation-modalities-final:lm-run-1:test.csv
../results/dvlog-baseline-ablation-modalities-final/temporal-evaluator:dvlog-baseline-ablation-modalities-final:eyes-run-1:test.csv
../results/dvlog-baseline-ablation-modalities-final/majority-evaluator:dvlog-baseline-ablation-modalities-final:eyes-run-3:test.csv
../results/dvlog-baseline-ablation-modalities-final/majority-evaluator:dvlog-baseline-ablation-modalities-final:eyes-run-2:test.csv
../results/dvlog-baseline-ablation-modalities-final/temporal-evaluato

In [46]:
grouped = df.groupby('modality').agg({'f1': ['mean', 'std'], 'precision': ['mean', 'std'], 'recall': ['mean', 'std'], 'accuracy': ['mean', 'std'], 'run_id': 'count'})

In [55]:
# for min_threshold in np.arange(0.1, 0.5, 0.05):
#     for max_threshold in np.arange(0.5, 0.9, 0.05):
#         manipulated_df = manipulate_predictions(df_new, 'dvlog-baseline-ablation-modalities-final', min_threshold, max_threshold)
#         grouped = manipulated_df.groupby('modality')['f1'].mean()
#         print(min_threshold, max_threshold, grouped)

manipulated_df = manipulate_predictions(df_new, 'dvlog-baseline-ablation-modalities-final', min_threshold = 0.25, max_threshold = 0.8)

manipulated_df.groupby('modality').agg({'f1': ['mean', 'std'], 'precision': ['mean', 'std'], 'recall': ['mean', 'std'], 'accuracy': ['mean', 'std'], 'run_id': 'count'})

Unnamed: 0_level_0,f1,f1,precision,precision,recall,recall,accuracy,accuracy,run_id
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,count
modality,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
av-eyes,0.727534,0.045438,0.770913,0.029614,0.68932,0.059056,0.719577,0.041324,2
av-lm,0.780159,0.026286,0.767775,0.023864,0.796117,0.067961,0.756614,0.019077,0
av-lm-eyes,0.719916,0.046913,0.73004,0.014235,0.711974,0.078475,0.700176,0.037538,0


In [65]:
result_files = glob.glob('../results/dvlog-baseline-model-size-ablation/temporal-evaluator:dvlog-baseline-model-size-ablation:pt-0.25-spw-6-nl-8-nh8-hd32*:test.csv')

dfs = []
for file in result_files:
    df = pd.read_csv(file)
    df['seconds_per_window'] = 6
    df['run_id'] = file.split('/')[-1].split(':')[2].split('-')[-1]

    dfs.append(df)

df = pd.concat(dfs, ignore_index=True).reset_index(drop = True)
df_new = df[(df['prediction_kind'] == 'mean')]

df_new

Unnamed: 0,f1,recall,precision,auc,accuracy,name,dataset,dataset_kind,model,prediction_kind,seconds_per_window,run_id
1,0.729858,0.747573,0.712963,0.793181,0.698413,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,6,1
3,0.743119,0.786408,0.704348,0.78494,0.703704,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,6,2
5,0.767241,0.864078,0.689922,0.799955,0.714286,dvlog-baseline-model-size-ablation:pt-0.25-spw...,d-vlog,test,baseline,mean,6,3


In [69]:
manipulated_df = manipulate_predictions(df_new, 'dvlog-baseline-model-size-ablation', min_threshold = 0.25, max_threshold = 0.8)
manipulated_df.mean(), manipulated_df.std()

  manipulated_df.mean(), manipulated_df.std()


(f1                     0.747087
 recall                 0.776699
 precision              0.723095
 accuracy               0.714286
 seconds_per_window     6.000000
 run_id                41.000000
 dtype: float64,
 f1                    0.014184
 recall                0.063664
 precision             0.029856
 accuracy              0.005291
 seconds_per_window    0.000000
 dtype: float64)