In [1]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
import json
from scipy.stats import mode
import sklearn
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

%matplotlib inline

In [2]:
######################
## CHECK GROUP NAME ##
######################
# Experiment group to analyse. It is the sub-directory of ../results/ that
# get_results() globs for per-run CSV summaries, and it is also passed to
# manipulate_predictions() as the directory holding the per-run JSON files.
# Change this value (and re-run everything below) to analyse another group.
exps_group = 'perceiver-edaic-ablation-modalities-big-arch'

In [3]:
from collections import defaultdict

########################################################################
## COMPUTING PERFORMANCE METRICS USING DIFFERENT THRESHOLDS ##
########################################################################
def _vote_label(window_preds, min_threshold, max_threshold, simulate_no_thr):
    """Reduce the scores of one JSON entry to a single integer label.

    Args:
        window_preds: 1-D numpy array of scores for one entry
            (assumed to lie in [0, 1] -- TODO confirm against the evaluator).
        min_threshold: scores strictly below this vote for the lower label.
        max_threshold: scores strictly above this vote for the upper label.
        simulate_no_thr: if True, ignore both thresholds and take the mode of
            the rounded scores.

    Returns:
        The winning label as a plain Python int, so that every branch yields
        the same type.
    """
    if simulate_no_thr:
        # Plain majority vote over the rounded scores.
        return int(mode(window_preds.round(), keepdims = True)[0][0])

    # Only "confident" scores vote; anything between the two thresholds is
    # discarded. ceil/floor turn the surviving scores into integer votes.
    positive_votes = np.ceil(window_preds[window_preds > max_threshold]).astype(np.int32)
    negative_votes = np.floor(window_preds[window_preds < min_threshold]).astype(np.int32)
    votes = np.concatenate([positive_votes, negative_votes])

    if not len(votes):
        # No confident score at all: fall back to thresholding the mean score.
        return int(window_preds.mean() > 0.5)

    # On a tie scipy's mode returns the smallest value.
    return int(mode(votes, keepdims = True)[0][0])


def manipulate_predictions(df, source_dir, evaluate_dataset, min_threshold = 0.35, max_threshold = 0.75, simulate_no_thr = False, results_root = '../results'):
    """Recompute per-run classification metrics from the stored over-time predictions.

    For every row of ``df`` the file
    ``{results_root}/{source_dir}/temporal-evaluator:{name}:over-time:{evaluate_dataset}.json``
    is loaded. Every entry of that JSON except the aggregate keys
    ``preds_mean`` and ``preds_threshold_mean`` must provide ``preds`` (a list
    of scores) and ``true_label``; the scores are collapsed into one label via
    ``_vote_label`` and compared against ``true_label``.

    Args:
        df: DataFrame with the columns ``name``, ``seconds_per_window`` and
            ``run_id``; ``modality`` is copied through when present.
        source_dir: sub-directory of ``results_root`` holding the JSON files.
        evaluate_dataset: dataset split name used in the JSON file name.
        min_threshold: lower confidence threshold (see ``_vote_label``).
        max_threshold: upper confidence threshold (see ``_vote_label``).
        simulate_no_thr: if True, ignore the thresholds and use a plain
            majority vote of the rounded scores.
        results_root: directory containing ``source_dir``; the default keeps
            the original hard-coded location.

    Returns:
        DataFrame with one row per row of ``df`` and the columns ``f1``,
        ``f1_weighted``, ``precision``, ``precision_weighted``, ``recall``,
        ``recall_weighted``, ``accuracy``, ``seconds_per_window``,
        ``modality`` (only if present in ``df``) and ``run_id``.
    """
    # JSON keys that hold aggregate values rather than per-entry predictions.
    aggregate_keys = ('preds_mean', 'preds_threshold_mean')
    records = []

    for _, row in df.iterrows():
        name = f"temporal-evaluator:{row['name']}:over-time:{evaluate_dataset}.json"
        with open(f'{results_root}/{source_dir}/{name}', 'rt') as f:
            data = json.load(f)

        preds = []
        true_labels = []
        for key, value in data.items():
            if key in aggregate_keys:
                continue

            preds.append(_vote_label(np.array(value['preds']), min_threshold, max_threshold, simulate_no_thr))
            true_labels.append(value['true_label'])

        # Insertion order below defines the column order of the result.
        record = {
            'f1': f1_score(true_labels, preds),
            'f1_weighted': f1_score(true_labels, preds, average = "weighted"),
            'precision': precision_score(true_labels, preds),
            'precision_weighted': precision_score(true_labels, preds, average = "weighted"),
            'recall': recall_score(true_labels, preds),
            'recall_weighted': recall_score(true_labels, preds, average = "weighted"),
            'accuracy': accuracy_score(true_labels, preds),
            'seconds_per_window': row['seconds_per_window'],
        }
        if 'modality' in row:
            record['modality'] = row['modality']
        record['run_id'] = row['run_id']
        records.append(record)

    return pd.DataFrame(records)

##################################
## COLLECTING RESULTS FROM CSVs ##
##################################
def get_results(evaluate_dataset, evaluation_type = 'mode', source_dir = None, results_root = '../results'):
    """Collect the per-run CSV summaries of one experiment group.

    Every file matching ``{results_root}/{source_dir}/*:{evaluate_dataset}.csv``
    is read. The third ``:``-separated field of the file name is taken as the
    modality tag and mapped to a sortable label stored in a new ``modality``
    column. Files whose name has no such field, or whose tag matches no known
    modality, are printed and skipped.

    Args:
        evaluate_dataset: dataset split name used as file-name suffix.
        evaluation_type: only rows whose ``prediction_kind`` equals this value
            are returned.
        source_dir: experiment group directory; ``None`` falls back to the
            notebook-level ``exps_group``.
        results_root: directory containing ``source_dir``; the default keeps
            the original hard-coded location.

    Returns:
        DataFrame with the rows of all recognised files whose
        ``prediction_kind`` equals ``evaluation_type``.

    Raises:
        ValueError: if no recognised result file is found.
    """
    if source_dir is None:
        # Backward-compatible default: use the group selected in the config cell.
        source_dir = exps_group

    # Checked in order: 'audiovisual' must come before 'audio', which it contains.
    modality_labels = (
        ('audiovisual', '3. audiovisual'),
        ('video', '2. video'),
        ('audio', '1. audio'),
    )

    pattern = f'{results_root}/{source_dir}/*:{evaluate_dataset}.csv'
    # Sorted so that the row order does not depend on the file-system listing order.
    result_files = sorted(glob.glob(pattern))

    dfs = []
    for file in result_files:
        name_parts = file.split('/')[-1].split(':')
        # A name with fewer than three fields has no modality tag; treat it as unknown.
        modality = name_parts[2] if len(name_parts) > 2 else ''

        label = next((lbl for tag, lbl in modality_labels if tag in modality), None)
        if label is None:
            print(file)
            continue

        df = pd.read_csv(file)
        df['modality'] = label
        dfs.append(df)

    if not dfs:
        raise ValueError(f'No usable result files found for pattern {pattern!r}')

    all_results = pd.concat(dfs, ignore_index=True)
    results = all_results[(all_results['prediction_kind'] == evaluation_type)]

    return results


In [4]:
###############################################
## SHOWING RESULTS WITHOUT THRESHOLD OPTIMUM ##
###############################################
import warnings

# Silence sklearn's UndefinedMetricWarning for the rest of the session so the
# per-modality tables below are not interleaved with warning text.
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)

# Per-modality aggregation: mean/std of the weighted metrics plus the run count.
no_thr_summary = {
    'f1_weighted': ['mean', 'std'],
    'precision_weighted': ['mean', 'std'],
    'recall_weighted': ['mean', 'std'],
    'run_id': 'count',
}

for evaluate_dataset in ('validation', 'test'):
    dataset_results = get_results(evaluate_dataset)
    # simulate_no_thr = True -> plain majority vote, thresholds are ignored.
    manipulated_df = manipulate_predictions(
        dataset_results,
        exps_group,
        evaluate_dataset,
        simulate_no_thr = True,
    )
    to_print = manipulated_df.groupby('modality').agg(no_thr_summary)
    print('\n\n', evaluate_dataset.upper())
    print(to_print)



 VALIDATION
               f1_weighted           precision_weighted            \
                      mean       std               mean       std   
modality                                                            
1. audio          0.428197  0.254553           0.596522  0.313629   
2. video          0.475312  0.302737           0.760079  0.067436   
3. audiovisual    0.662101  0.051132           0.720157  0.024491   

               recall_weighted           run_id  
                          mean       std  count  
modality                                         
1. audio              0.453571  0.195044      5  
2. video              0.517857  0.249042      5  
3. audiovisual        0.650000  0.085267      5  


 TEST
               f1_weighted           precision_weighted            \
                      mean       std               mean       std   
modality                                                            
1. audio          0.362790  0.172949           0.660419 

In [6]:
import warnings
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
##########################################################
## EXPLORING DIFFERENT THRESHOLDS ACCORDING TO A METRIC ##
##########################################################
# Threshold grids in steps of 0.05: 0.10..0.45 and 0.50..0.85 (8 values each).
# Built with a fixed-count linspace and rounded, instead of np.arange with a
# float step, which accumulates error (e.g. 0.45000000000000007) and whose
# number of elements is not guaranteed.
min_threshold_grid = np.round(np.linspace(0.10, 0.45, 8), 2)
max_threshold_grid = np.round(np.linspace(0.50, 0.85, 8), 2)

for evaluate_dataset in ['validation', 'test']:
    # Best score so far: sum over modalities of the mean weighted F1.
    global_sum_f1s = 0.0
    best_grouped = None
    best_max_threshold = 0.0
    best_min_threshold = 1.0

    dataset_results = get_results(evaluate_dataset)
    for min_threshold in tqdm(min_threshold_grid):
        for max_threshold in tqdm(max_threshold_grid, leave = False):
            manipulated_df =  manipulate_predictions(dataset_results, exps_group, evaluate_dataset, min_threshold, max_threshold)
            grouped = manipulated_df.groupby('modality')
            local_sum_f1s = grouped['f1_weighted'].mean().sum()

            # Strict '>' keeps the first pair encountered when scores tie.
            if local_sum_f1s > global_sum_f1s:
                global_sum_f1s = local_sum_f1s
                best_grouped = grouped
                best_max_threshold = max_threshold
                best_min_threshold = min_threshold

    print(f'\n\nBEST based on {evaluate_dataset}:')
    print(best_min_threshold, best_max_threshold)

100%|██████████| 8/8 [02:04<00:00, 15.57s/it]




BEST based on validation:
0.45000000000000007 0.8000000000000003


100%|██████████| 8/8 [02:10<00:00, 16.32s/it]



BEST based on test:
0.45000000000000007 0.7500000000000002





In [7]:
###################################################
## SHOWING OBTAINED RESULTS FOR THE BEST SETTING ##
###################################################
# thresholds based on validation set --> 0.45 0.80
# thresholds based on test set       --> 0.45 0.75
# NOTE(review): the second pair evaluated below is (0.35, 0.85), which does not
# match the test-set optimum recorded above (0.45, 0.75) -- confirm which one
# is intended before reporting these numbers.
threshold_pairs = [(0.45, 0.80), (0.35, 0.85)]

# Per-modality aggregation: mean/std of the weighted metrics plus the run count.
best_setting_summary = {
    'precision_weighted': ['mean', 'std'],
    'recall_weighted': ['mean', 'std'],
    'f1_weighted': ['mean', 'std'],
    'run_id': 'count',
}

for min_thr, max_thr in threshold_pairs:
    print('\n\n', f'{min_thr} || {max_thr}')
    for evaluate_dataset in ('validation', 'test'):
        dataset_results = get_results(evaluate_dataset)
        manipulated_df = manipulate_predictions(
            dataset_results,
            exps_group,
            evaluate_dataset,
            min_threshold = min_thr,
            max_threshold = max_thr,
        )
        to_print = manipulated_df.groupby('modality').agg(best_setting_summary)
        print('\n', evaluate_dataset.upper())
        print(to_print)



 0.45 || 0.8

 VALIDATION
               precision_weighted           recall_weighted            \
                             mean       std            mean       std   
modality                                                                
1. audio                 0.681783  0.048566        0.660714  0.177676   
2. video                 0.704905  0.069459        0.492857  0.168748   
3. audiovisual           0.752808  0.057458        0.635714  0.203054   

               f1_weighted           run_id  
                      mean       std  count  
modality                                     
1. audio          0.629872  0.155519      5  
2. video          0.486705  0.153980      5  
3. audiovisual    0.605040  0.220418      5  

 TEST
               precision_weighted           recall_weighted            \
                             mean       std            mean       std   
modality                                                                
1. audio                 0.5771