In [2]:
import pandas as pd
import numpy as np
import glob
import matplotlib.pyplot as plt
from collections import defaultdict
import seaborn as sns
from pprint import pprint
import json
from scipy.stats import mode
import sklearn
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from tqdm import tqdm

%matplotlib inline

In [3]:
######################
## CHECK GROUP NAME ##
######################
# Name of the experiment group to analyse. It is interpolated into the
# `../results/{exps_group}/` glob pattern used by get_results() and is passed
# to manipulate_predictions() as `source_dir` by the analysis cells.
exps_group = 'baseline-original-dvlog-new-split'

In [13]:
from collections import defaultdict

########################################################################
## COMPUTING PERFORMANCE METRICS USING DIFFERENT THRESHOLDS ##
########################################################################
def _vote_label(the_preds, min_threshold, max_threshold, simulate_no_thr):
    """Collapse the window-level scores of one sample into a single 0/1 label.

    Args:
        the_preds: 1-D numpy array with the per-window scores of one sample
            (presumably probabilities in [0, 1] -- confirm against the
            evaluator that writes the JSON files).
        min_threshold: scores strictly below this value vote for class 0.
        max_threshold: scores strictly above this value vote for class 1.
        simulate_no_thr: when True, ignore both thresholds and take the mode
            of the rounded scores instead.

    Returns:
        The voted label as a plain Python int, so every branch yields the same
        type (the branches otherwise produce bool, int32 and float64 values).
    """
    if simulate_no_thr:
        return int(mode(the_preds.round(), keepdims = True)[0][0])

    # Only "confident" windows vote; scores between the two thresholds are
    # discarded. ceil/floor turn the surviving scores into hard labels.
    positive_votes = np.ceil(the_preds[the_preds > max_threshold]).astype(np.int32)
    negative_votes = np.floor(the_preds[the_preds < min_threshold]).astype(np.int32)
    confident_votes = np.concatenate([positive_votes, negative_votes])

    if not len(confident_votes):
        # No window was confident enough: fall back to thresholding the mean.
        return int(the_preds.mean() > 0.5)

    return int(mode(confident_votes, keepdims = True)[0][0])


def manipulate_predictions(df, source_dir, evaluate_dataset, min_threshold = 0.35, max_threshold = 0.75, simulate_no_thr = False, results_root = '../results'):
    """Recompute classification metrics from the stored over-time predictions.

    For every row of `df`, the file
    `{results_root}/{source_dir}/temporal-evaluator:{name}:over-time:{evaluate_dataset}.json`
    is loaded. Each entry of that JSON (except the summary keys 'preds_mean'
    and 'preds_threshold_mean') must hold a 'preds' list and a 'true_label'.
    The window-level predictions of each entry are reduced to one label (see
    `_vote_label`) and the metrics are computed over all entries of the file.

    Args:
        df: DataFrame whose rows provide 'name', 'seconds_per_window' and
            'run_id'; 'modality' is copied over when present.
        source_dir: experiment-group directory under `results_root`.
        evaluate_dataset: dataset split name used in the JSON filename.
        min_threshold: scores strictly below this value vote for class 0.
        max_threshold: scores strictly above this value vote for class 1.
        simulate_no_thr: when True, ignore the thresholds and use the mode of
            the rounded predictions.
        results_root: directory that contains the experiment-group folders.

    Returns:
        A DataFrame with one row per row of `df` and the columns f1,
        f1_weighted, precision, precision_weighted, recall, recall_weighted,
        accuracy, seconds_per_window, (modality,) run_id.
    """
    # Keys of the JSON that hold aggregate summaries rather than per-sample entries.
    summary_keys = ('preds_mean', 'preds_threshold_mean')
    # Each of these is reported both with sklearn's default averaging and weighted.
    metric_fns = (('f1', f1_score), ('precision', precision_score), ('recall', recall_score))

    new_df = defaultdict(list)

    for _, row in df.iterrows():
        name = f"temporal-evaluator:{row['name']}:over-time:{evaluate_dataset}.json"
        with open(f'{results_root}/{source_dir}/{name}', 'rt') as f:
            data = json.load(f)

        preds = []
        true_labels = []
        for key, value in data.items():
            if key in summary_keys:
                continue

            the_preds = np.array(value['preds'])
            preds.append(_vote_label(the_preds, min_threshold, max_threshold, simulate_no_thr))
            true_labels.append(value['true_label'])

        for metric_name, metric_fn in metric_fns:
            new_df[metric_name].append(metric_fn(true_labels, preds))
            new_df[f'{metric_name}_weighted'].append(metric_fn(true_labels, preds, average = "weighted"))
        new_df['accuracy'].append(accuracy_score(true_labels, preds))

        new_df['seconds_per_window'].append(row['seconds_per_window'])
        if 'modality' in row:
            new_df['modality'].append(row['modality'])
        new_df['run_id'].append(row['run_id'])

    new_df = pd.DataFrame(new_df)
    return new_df

##################################
## COLLECTING RESULTS FROM CSVs ##
##################################
def get_results(evaluate_dataset, evaluation_type = 'mode', source_dir = None, results_root = '../results'):
    """Collect the per-run result CSVs of one dataset split into one DataFrame.

    Every file matching `{results_root}/{source_dir}/*:{evaluate_dataset}.csv`
    is considered. The third ':'-separated field of the filename is inspected
    for a modality marker and the matching label is stored in a new 'modality'
    column. Files whose name has no such field or no known marker are printed
    and skipped.

    Args:
        evaluate_dataset: dataset split name used as the filename suffix.
        evaluation_type: value of the 'prediction_kind' column to keep.
        source_dir: experiment-group directory under `results_root`; defaults
            to the notebook-level `exps_group`.
        results_root: directory that contains the experiment-group folders.

    Returns:
        The rows of the concatenated CSVs whose 'prediction_kind' equals
        `evaluation_type`.

    Raises:
        ValueError: if no file with a recognised modality is found.
    """
    if source_dir is None:
        source_dir = exps_group  # fall back to the notebook-level experiment group

    # Checked in this order; the first marker contained in the field wins.
    modality_labels = (
        ('av+lm+eyes-', '4. av+lm+eyes'),
        ('av+lm-', '3. av+lm'),
        ('av+eyes-', '2. av+eyes'),
        ('av-', '1. av'),
    )

    pattern = f'{results_root}/{source_dir}/*:{evaluate_dataset}.csv'
    # Sorted so the row order does not depend on the filesystem's listing order.
    result_files = sorted(glob.glob(pattern))

    dfs = []
    for file in result_files:
        name_fields = file.split('/')[-1].split(':')
        # A filename with fewer than three fields carries no modality information.
        modality = name_fields[2] if len(name_fields) > 2 else ''

        modality_label = next(
            (label for marker, label in modality_labels if marker in modality),
            None,
        )
        if modality_label is None:
            print(file)
            continue

        # Only parse the CSV once we know the file is going to be used.
        df = pd.read_csv(file)
        df['modality'] = modality_label
        dfs.append(df)

    if not dfs:
        raise ValueError(f'No result file with a recognised modality matches {pattern!r}')

    all_results = pd.concat(dfs, ignore_index=True)
    results = all_results[(all_results['prediction_kind'] == evaluation_type)]

    return results


In [14]:
import warnings
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
###############################################
## SHOWING RESULTS WITHOUT THRESHOLD OPTIMUM ##
###############################################
# Per-modality mean/std of the unweighted metrics, plus the number of runs.
no_thr_aggregations = {metric: ['mean', 'std'] for metric in ('f1', 'precision', 'recall')}
no_thr_aggregations['run_id'] = 'count'

for evaluate_dataset in ('validation', 'test'):
    dataset_results = get_results(evaluate_dataset)
    # simulate_no_thr=True: plain mode over rounded window predictions.
    manipulated_df = manipulate_predictions(
        dataset_results,
        exps_group,
        evaluate_dataset,
        simulate_no_thr = True,
    )
    to_print = manipulated_df.groupby('modality').agg(no_thr_aggregations)
    print('\n\n', evaluate_dataset.upper())
    print(to_print)



 VALIDATION
                f1           precision              recall           run_id
              mean       std      mean       std      mean       std  count
modality                                                                   
1. av     0.726271  0.009511  0.602444  0.029321  0.918367  0.045634      5


 TEST
                f1           precision              recall           run_id
              mean       std      mean       std      mean       std  count
modality                                                                   
1. av     0.732286  0.011774  0.625137  0.011662  0.885437  0.045226      5


In [5]:
import warnings
warnings.filterwarnings("ignore", category=sklearn.exceptions.UndefinedMetricWarning)
##########################################################
## EXPLORING DIFFERENT THRESHOLDS ACCORDING TO A METRIC ##
##########################################################
# Candidate thresholds: negatives below `min`, positives above `max`.
min_threshold_grid = np.arange(0.1, 0.5, 0.05)
max_threshold_grid = np.arange(0.5, 0.9, 0.05)

for evaluate_dataset in ('validation', 'test'):
    # Best-so-far state of the grid search for this split.
    global_sum_f1s = 0.0
    best_grouped = None
    best_max_threshold = 0.0
    best_min_threshold = 1.0

    dataset_results = get_results(evaluate_dataset)
    for min_threshold in tqdm(min_threshold_grid):
        for max_threshold in tqdm(max_threshold_grid, leave = False):
            manipulated_df = manipulate_predictions(
                dataset_results,
                exps_group,
                evaluate_dataset,
                min_threshold,
                max_threshold,
            )
            grouped = manipulated_df.groupby('modality')
            # Score of a threshold pair: sum over modalities of the mean weighted F1.
            local_sum_f1s = grouped['f1_weighted'].mean().sum()

            if local_sum_f1s > global_sum_f1s:
                global_sum_f1s, best_grouped = local_sum_f1s, grouped
                best_min_threshold, best_max_threshold = min_threshold, max_threshold

    print(f'\n\nBEST based on {evaluate_dataset}:')
    print(best_min_threshold, best_max_threshold)

100%|██████████| 8/8 [00:21<00:00,  2.75s/it]




BEST based on validation:
0.45000000000000007 0.8500000000000003


100%|██████████| 8/8 [00:37<00:00,  4.66s/it]



BEST based on test:
0.40000000000000013 0.8500000000000003





In [7]:
###################################################
## SHOWING OBTAINED RESULTS FOR THE BEST SETTING ##
###################################################
# thresholds based on validation set --> 0.45 0.85
# thresholds based on test set       --> 0.40 0.85
best_threshold_pairs = [(0.45, 0.85), (0.40, 0.85)]

# Per-modality mean/std of the weighted metrics, plus the number of runs.
weighted_aggregations = {
    'precision_weighted': ['mean', 'std'],
    'recall_weighted': ['mean', 'std'],
    'f1_weighted': ['mean', 'std'],
    'run_id': 'count',
}

for min_thr, max_thr in best_threshold_pairs:
    print('\n\n', f'{min_thr} || {max_thr}')
    for evaluate_dataset in ('validation', 'test'):
        dataset_results = get_results(evaluate_dataset)
        manipulated_df = manipulate_predictions(
            dataset_results,
            exps_group,
            evaluate_dataset,
            min_threshold = min_thr,
            max_threshold = max_thr,
        )
        to_print = manipulated_df.groupby('modality').agg(weighted_aggregations)
        print('\n', evaluate_dataset.upper())
        print(to_print)



 0.45 || 0.85

 VALIDATION
              precision_weighted           recall_weighted            \
                            mean       std            mean       std   
modality                                                               
1. av                   0.683231  0.036397        0.668085  0.037912   
2. av+eyes              0.695799  0.024494        0.680851  0.027122   
3. av+lm                0.610922  0.195165        0.627660  0.100362   
4. av+lm+eyes           0.690083  0.028488        0.653191  0.082266   

              f1_weighted           run_id  
                     mean       std  count  
modality                                    
1. av            0.662674  0.043485      5  
2. av+eyes       0.672933  0.030869      5  
3. av+lm         0.575081  0.164954      5  
4. av+lm+eyes    0.627918  0.133737      5  

 TEST
              precision_weighted           recall_weighted            \
                            mean       std            mean       std   
