# AI-Benchmarking face recognition dataset - evaluation example
This is a Jupyter Notebook to compute classic metrics for the AI-Benchmarking face recognition dataset. 

In [1]:
import boto3
import json
import pandas as pd
import numpy as np

from collections import Counter
from datetime import timedelta

---
### Load annotations file from dataset S3 bucket 

In [4]:
# AWS S3 parameters
# Requires an AWS profile named 'mturk', with valid credentials, in your AWS CLI configuration file.
# Change profile_name below if your profile has a different name.
session = boto3.Session(profile_name='mturk')
# S3 client built from that profile; used below to download the annotations file
s3_client = session.client('s3')
# Bucket that is passed to get_object when loading the annotations file
s3_bucket = 'video-face-reco-dataset'

In [5]:
# Load the AI-Benchmarking dataset annotations file from the S3 bucket
# You can also load it from a local folder if you have already downloaded the dataset
# Object key of the annotations file inside the bucket
annotations_path = 'video-annotation-dataset/Graham Norton Show 12.json'

# Download the object, then parse its body as JSON into a Python dictionary
annotations_obj = s3_client.get_object(Bucket=s3_bucket,Key=annotations_path)
annotations_json = json.loads(annotations_obj['Body'].read())

In [6]:
print(annotations_json.keys())

dict_keys(['video_title', 'video_url', 'program_name', 'all_personalities', 'annotation'])


In [49]:
annotations_json

{'video_title': 'The.Graham.Norton Show S30E5',
 'video_url': 'https://www.dailymotion.com/video/x8f0vto',
 'program_name': 'Graham Norton Show',
 'all_personalities': ['Graham Norton',
  'Bono',
  'Taylor Swift',
  'Eddie Redmayne',
  'Alex Scott',
  'Lady Blackbird'],
 'annotation': {'0': {'time_interval': '[00:00:00.000,00:00:13.000,1.0]',
   'frame_interval': '[0,325,25]',
   'personalities': ['Taylor Swift', 'Graham Norton', 'Bono']},
  '1': {'time_interval': '[00:00:14.000,00:00:43.000,1.0]',
   'frame_interval': '[350,1075,25]',
   'personalities': ['Taylor Swift', 'Graham Norton', 'Bono']},
  '2': {'time_interval': '[00:00:44.000,00:01:13.000,1.0]',
   'frame_interval': '[1100,1825,25]',
   'personalities': ['Graham Norton']},
  '3': {'time_interval': '[00:01:14.000,00:01:43.000,1.0]',
   'frame_interval': '[1850,2575,25]',
   'personalities': ['Graham Norton']},
  '4': {'time_interval': '[00:01:44.000,00:02:03.000,1.0]',
   'frame_interval': '[2600,3075,25]',
   'personalities

---
### Examples of confusion matrices extraction from predictions and annotations files

We define for the personalities present in the metadata of the video that we want to evaluate:
- **True Positive (tp)**: a personality is detected in a video interval in which they are annotated as present.
- **False Positive (fp)**: a personality is detected in a video interval in which they are not annotated as present.
- **True Negative (tn)**: a personality is not detected in a video interval in which they are not annotated as present.
- **False Negative (fn)**: a personality is not detected in a video interval in which they are annotated as present.

Personalities that are not listed in the video metadata but are predicted as present in an interval can also be considered False Positives. We count them separately, because such a detection may come from incomplete metadata rather than from a prediction error, and mixing the two cases would distort the evaluation.

---

The preferred input format to compute the classic metrics is a pandas DataFrame indexed by the names of the personalities present in the metadata of the video we want to evaluate, with 4 columns (tp, fp, tn and fn) that follow the previous definitions.

You can find below an example that uses the AwsCelebrityRecognition service.

In [7]:
# Minimal example of the prediction format that can be used to compute classic metrics from the dataset.

# Each element of the list is one personality detection; the elements are sorted by time.
# Several elements can share the same Timestamp when multiple personalities are detected on the same frame.

predictions_celebs_format = [
    {'Timestamp': 0, # in milliseconds
     'Celebrity': {
        'Urls': ['www.wikidata.org/wiki/Q19154', 'www.imdb.com/name/nm0636218'], # optional; can be useful to build a celebrity ID
        'Name': 'Graham Norton'} # compared against the names of the video metadata (all_personalities field in the annotation file)
    },
    {'Timestamp': 1000, 
     'Celebrity': {
        'Urls': ['www.wikidata.org/wiki/Q26876', 'www.imdb.com/name/nm2357847'],
        'Name': 'Taylor Swift'}
    },
    {'Timestamp': 1480, 
     'Celebrity': {
        'Urls': ['www.wikidata.org/wiki/Q834621', 'www.imdb.com/name/nm0095104'],
        'Name': 'Bono'}},
]

In [8]:
# Load the complete prediction files from the AwsCelebrityRecognition service as an example
# The predictions for the "Graham Norton Show 12" video are divided into 5 files (numbered 1 to 5)
# The list is reset here, so re-running this cell does not duplicate the detections
predictions_celebs = []

# NOTE(review): aws_confusion_df scans this list in order and relies on ascending Timestamp values;
# presumably the 5 files are consecutive chunks with increasing timestamps -- confirm.
for i in range(1,6):
    predictions_path = f'../data/aws_data/Graham Norton Show 12_{i}.json'
    print(f"Load {predictions_path}")
    with open(predictions_path, "rb") as f:
        predictions_json = json.load(f)
    # keep only the list of detections stored under the 'Celebrities' key of each file
    predictions_celebs += predictions_json['Celebrities']
    
print(f"\nNumber of personalities detected : {len(predictions_celebs)}")

Load ../data/aws_data/Graham Norton Show 12_1.json
Load ../data/aws_data/Graham Norton Show 12_2.json
Load ../data/aws_data/Graham Norton Show 12_3.json
Load ../data/aws_data/Graham Norton Show 12_4.json
Load ../data/aws_data/Graham Norton Show 12_5.json

Number of personalities detected : 4584


In [9]:
predictions_celebs[0]

{'Timestamp': 0,
 'Celebrity': {'Urls': ['www.wikidata.org/wiki/Q19154',
   'www.imdb.com/name/nm0636218'],
  'Name': 'Graham Norton',
  'Id': '1u2oh9S',
  'Confidence': 91.83985900878906,
  'Face': {'BoundingBox': {'Width': 0.05159798264503479,
    'Height': 0.13792622089385986,
    'Left': 0.5035192370414734,
    'Top': 0.31078198552131653},
   'Landmarks': [{'Type': 'eyeLeft',
     'X': 0.5172826647758484,
     'Y': 0.3678494691848755},
    {'Type': 'eyeRight', 'X': 0.5417059659957886, 'Y': 0.36701586842536926},
    {'Type': 'mouthLeft', 'X': 0.5195242762565613, 'Y': 0.4120739996433258},
    {'Type': 'mouthRight', 'X': 0.5398578643798828, 'Y': 0.41139599680900574},
    {'Type': 'nose', 'X': 0.528906524181366, 'Y': 0.3884477913379669}],
   'Pose': {'Roll': -1.282999873161316,
    'Yaw': -0.22642219066619873,
    'Pitch': 15.497112274169922},
   'Quality': {'Brightness': 61.47097396850586,
    'Sharpness': 60.49041748046875},
   'Confidence': 99.99595642089844},
  'KnownGender': {'Typ

In [24]:
def aws_confusion_df(predictions_celebs, annotations_json):
    """
    Build the per-personality confusion counts (tp, fp, tn, fn) of one annotated video.

    Every annotated interval counts as one sample for each personality listed in the
    video's metadata. A personality is considered predicted in an interval when at least
    one detection carrying its name has a Timestamp inside the interval (bounds included).
    Detections that fall between two intervals are ignored.

    The scan position in ``predictions_celebs`` is carried over from one interval to the
    next, so the detections must be sorted by ascending Timestamp and the intervals must
    be listed in chronological order.

    :param predictions_celebs list:
        list of detections generated with the AwsCelebrityRecognition service. Each element
        is a dictionary with a 'Timestamp' (in milliseconds) and a 'Celebrity' dictionary
        holding the detected 'Name'.
    :param annotations_json dict:
        dictionary of annotations for a given video from the AI-Benchmarking dataset
        ('all_personalities' and 'annotation' keys are read).
    :return:
        a pandas DataFrame of the confusion counts of all personalities present in the
        video's metadata (one row per personality, columns tp, fp, tn, fn) and
        a dictionary {name: number of intervals} of personalities detected but not in the
        metadata of the video (specific fp)
    :rtype: (pandas.DataFrame, dict)
    """
    all_personalities = annotations_json['all_personalities']

    # initiate confusion df with all personalities present in the video's metadata
    confusion_df = pd.DataFrame(index=all_personalities)
    confusion_df = confusion_df.assign(tp=0, fp=0, tn=0, fn=0)
    # dictionary of predicted personalities not present in the video's metadata
    other_pred = {}

    # index (in predictions_celebs) of the first detection not yet consumed by a previous interval
    last_pred_id = 0

    for interval in annotations_json['annotation'].values():

        # start time and end time of the interval in milliseconds (because AWS Timestamp is in ms)
        # 'time_interval' looks like '[00:00:00.000,00:00:13.000,1.0]': strip the brackets, split on commas
        interval_split = interval['time_interval'][1:-1].split(',')
        interval_start = pd.Timedelta(interval_split[0]).total_seconds()*1000
        interval_end = pd.Timedelta(interval_split[1]).total_seconds()*1000

        # set of predicted personalities in the current interval
        pred_ps = set()
        annot_ps = interval['personalities']

        # Walk the detections by index instead of slicing, to avoid copying the list tail for every interval
        for pred_id in range(last_pred_id, len(predictions_celebs)):
            celebs = predictions_celebs[pred_id]
            timestamp = celebs['Timestamp']

            # detection inside the current interval: remember the name
            if interval_start <= timestamp <= interval_end:
                pred_ps.add(celebs['Celebrity']['Name'])

            # detection between the previous and the current interval: skip it.
            # So, we do not take into account the personalities that appear only between the intervals.
            elif timestamp < interval_start:
                pass

            # detection after the current interval: keep it for the next interval and stop scanning
            elif timestamp > interval_end:
                last_pred_id = pred_id
                break

        # For the personalities of the video's metadata, increment the matching tp, fp, tn or fn count.
        for all_p in all_personalities:
            predicted = all_p in pred_ps
            annotated = all_p in annot_ps
            if predicted and annotated:
                column = 'tp'
            elif predicted:
                column = 'fp'
            elif annotated:
                column = 'fn'
            else:
                column = 'tn'
            # Single-step .loc[row, column]: chained indexing (df.loc[row][column] += 1) writes to a
            # temporary object and is silently lost when pandas Copy-on-Write is active.
            confusion_df.loc[all_p, column] += 1

        # add predicted personalities not present in video's metadata in external dictionary
        for pred_p in pred_ps:
            if pred_p not in all_personalities:
                other_pred[pred_p] = other_pred.get(pred_p, 0) + 1

    return confusion_df, other_pred

In [25]:
confusion_df, other_pred = aws_confusion_df(predictions_celebs, annotations_json)

In [26]:
# Examples of confusion dataframes format
confusion_df

Unnamed: 0,tp,fp,tn,fn
Graham Norton,90,0,105,6
Bono,89,1,72,39
Taylor Swift,89,0,57,55
Eddie Redmayne,123,0,63,15
Alex Scott,31,1,86,83
Lady Blackbird,0,0,181,20


In [28]:
# Personalities detected by the AwsCelebrityRecognition service that were not in the video's metadata
other_pred

{'Rohini Hattangadi': 1,
 'Rishi Sunak': 1,
 'Frank Sinatra': 1,
 'Sir John Tenniel': 1,
 "Dylan O'Brien": 1,
 'Jessica Chastain': 3,
 'Boris Strugatsky': 1,
 'John Paul II': 1,
 'Dennis Marcellino': 1,
 'Betsey Johnson': 6,
 'Richard Ayoade': 1,
 'Michaela Coel': 1,
 'Winston Duke': 1,
 'BeBe Zahara Benet': 1}

In [29]:
# If you want to evaluate a bigger sample of the dataset instead of one video
# You can concatenate several confusion DataFrames and other predictions dictionaries  
#db_confusion_df = confusion_df.add(confusion_df_2, fill_value=0).astype(int)
#db_other_pred = dict(Counter(other_pred)+Counter(other_pred_2))

---
### Compute Metrics

We compute the accuracy, precision, recall and f1-score from the confusion matrices.

In [33]:
def compute_metrics(confusion_df):
    """
    Computes the following classic metrics from the confusion matrices:
        - accuracy / precision / recall / f1

    A 'total' row holding the column-wise sum of all personalities is appended before the
    metrics are computed, so the metrics of that row are computed on the pooled counts.
    A metric whose denominator is 0 (e.g. precision when tp + fp == 0) is NaN, and f1 is
    NaN whenever precision or recall is NaN or both are 0.

    :param confusion_df pandas.DataFrame:
        pandas DataFrame composed of all the confusion matrices
        of all the personalities present in the sample we want to evaluate
        (one row per personality, with at least the columns tp, fp, tn and fn).
    :return: a copy of confusion_df with the 'total' row and all the computed classical metrics,
        rounded to 2 decimals. The input DataFrame is not modified.
    :rtype: pandas.DataFrame
    """
    metrics_df = confusion_df.copy()

    metrics_df.loc['total'] = metrics_df.sum(axis=0)

    tp = metrics_df['tp']
    fp = metrics_df['fp']
    tn = metrics_df['tn']
    fn = metrics_df['fn']

    # Sum only the four confusion columns: summing the whole row would let any extra
    # column present in the input silently distort the accuracy.
    metrics_df['accuracy'] = (tp+tn)/(tp+fp+tn+fn)
    metrics_df['precision'] = tp/(fp+tp)
    metrics_df['recall'] = tp/(fn+tp)
    metrics_df['f1'] = 2*metrics_df['precision']*metrics_df['recall']/(metrics_df['precision']+metrics_df['recall'])

    #metrics_df.loc['macro_average'] = metrics_df.iloc[:-1].mean(axis=0)

    return metrics_df.round(2)

In [34]:
metrics_df = compute_metrics(confusion_df)

In [35]:
metrics_df

Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,f1
Graham Norton,90,0,105,6,0.97,1.0,0.94,0.97
Bono,89,1,72,39,0.8,0.99,0.7,0.82
Taylor Swift,89,0,57,55,0.73,1.0,0.62,0.76
Eddie Redmayne,123,0,63,15,0.93,1.0,0.89,0.94
Alex Scott,31,1,86,83,0.58,0.97,0.27,0.42
Lady Blackbird,0,0,181,20,0.9,,0.0,
total,422,2,564,218,0.82,1.0,0.66,0.79


## TEST JSONs
### This is an example to explain the format of the JSON files
- JSON file of the annotation 
- JSON file of the predictions, in the minimal format expected from the face recognition system

In [44]:
# Annotation JSON, in the format published by the EBU AI-Benchmarking Group
annotations_json_test = {
    # personalities annotated in the video
    'all_personalities': [
        'Name True Positive',
        'Name False Positive',
        'Name True Negative',
        'Name False Negative'],
    # key : interval index
    # value : annotations of this interval
    'annotation': {
        '0': {
            # time defining the interval in the video
            # 'time_interval': [start,end,sampling period in seconds]
            'time_interval': '[00:00:00.000,00:00:29.000,1.0]',
            # 'frame_interval':  [start,end,sampling period in frames]
            'frame_interval': '[0,725,25]',
            # 'personalities': names of the personalities annotated as present in this interval
            'personalities': ['Name True Positive', 'Name False Negative']
        }
    }
}

In [50]:
# Minimal JSON format of the predictions coming from the face recognition system
# Timestamp is in milliseconds; the three detections below all fall inside the test interval (0 s to 29 s)
predictions_celebs_test = [
    {'Timestamp': 25000, 'Celebrity': {'Name': 'Name True Positive'}},
    {'Timestamp': 26000, 'Celebrity': {'Name': 'Name False Positive'}},
    {'Timestamp': 27000, 'Celebrity': {'Name': 'Name Other Pred'}}
]

In [51]:
confusion_df_test, other_pred_test = aws_confusion_df(predictions_celebs_test, annotations_json_test)

In [52]:
confusion_df_test

Unnamed: 0,tp,fp,tn,fn
Name True Positive,1,0,0,0
Name False Positive,0,1,0,0
Name True Negative,0,0,1,0
Name False Negative,0,0,0,1


In [53]:
other_pred_test

{'Name Other Pred': 1}

In [54]:
compute_metrics(confusion_df_test)

Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,f1
Name True Positive,1,0,0,0,1.0,1.0,1.0,1.0
Name False Positive,0,1,0,0,0.0,0.0,,
Name True Negative,0,0,1,0,1.0,,,
Name False Negative,0,0,0,1,0.0,,0.0,
total,1,1,1,1,0.5,0.5,0.5,0.5
