In [None]:
import numpy as np
import torch
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# load data and predictions
# NOTE: the three right-hand sides below are intentionally blank placeholders;
# this cell will not parse until each one is filled in.
# Later cells index x as x[:, z] and pass y / y_hat / d to torch functions
# (torch.norm, torch.square), so y and y_hat are expected to be 2-D torch tensors
# of identical shape -- TODO confirm x is a 2-D tensor/array with one row per sample.
x = # input data
y = # true locations
y_hat = # predictions

# Unpacking into exactly two names requires y to be 2-D.
n, m = [*y.shape] # n = num_samples, m = 2*num_features

In [None]:
"""Set d to be either interocular distance or bounding box diagonal length. d should have size n
"""
# Placeholder: assign the per-image normalization distance here. nme_landmark calls
# torch.unsqueeze(d, 1), so d must be a 1-D torch tensor (one entry per sample).
d = 

In [None]:
"""
Z = list of lists, where each entry is [sensitive attribute column number, majority condition]
alpha = failure rate threshold   
calculate_landmark: if True, calculates certain bias metrics overall and per landmark. if false, only overall
"""
# Sensitive attributes to audit; each entry is [column number, majority value].
# Left empty, the bias loops further down have nothing to iterate over.
Z: list = []
# Failure-rate threshold applied to the per-image error.
alpha: float = 0.05
# When True, the error-based bias metric is also computed per landmark.
calculate_landmark: bool = False

In [None]:
""" Accuracy Metrics: Normalized Mean Error (NME) and CED/AUC/FR
y and y_hat both have dimensions n x m
d is the per-image normalization distance (interocular distance or bounding box diagonal length), with size n
"""

def nme_images(y_hat, y, m, d):
    """Per-image error aggregated over every landmark coordinate.

    For each row the result is ``||y_hat - y||_2 ** 2 / d / m``: the squared
    L2 norm of that row's residual, divided by the normalization distance and
    by the number of coordinate columns.

    NOTE(review): the residual norm is squared before normalizing; the usual
    NME definition averages unsquared per-landmark distances -- confirm this
    is the intended metric.

    Args:
        y_hat: predicted coordinates, size n x m.
        y: ground-truth coordinates, size n x m.
        m: number of coordinate columns.
        d: normalization distance per image, size n.

    Returns:
        Tensor of size n with one error value per image.
    """
    residual = y_hat - y
    row_norm = torch.norm(residual, dim=1)
    return torch.square(row_norm) / d / m


def nme_landmark(y_hat, y, m, d):
    """Per-image, per-landmark normalized error.

    The squared coordinate errors are divided by ``d`` and reshaped to
    ``(n, 2, m // 2)``, then averaged over the axis of length 2. Column ``j``
    is therefore paired with column ``j + m // 2``.
    NOTE(review): this pairing presumes the columns hold all first coordinates
    followed by all second coordinates (not interleaved) -- confirm against
    the data layout.

    Args:
        y_hat: predicted coordinates, size n x m.
        y: ground-truth coordinates, size n x m.
        m: number of coordinate columns; must be even.
        d: 1-D tensor of normalization distances, size n.

    Returns:
        Tensor of size n x m/2.

    Raises:
        ValueError: if ``m`` is odd.
    """
    if m % 2 != 0:
        raise ValueError(f"m must be even (two coordinates per landmark), got {m}")

    scaled_sq_err = torch.square(y_hat - y) / torch.unsqueeze(d, 1)
    # Take the row count from the data rather than a notebook global, and use
    # integer division: torch.reshape rejects a float dimension such as m/2.
    num_samples = scaled_sq_err.shape[0]
    per_coordinate = torch.reshape(scaled_sq_err, (num_samples, 2, m // 2))
    return torch.mean(per_coordinate, dim=1)


def total_nme(nme):
    """Collapse a tensor of error values into their overall mean.

    Args:
        nme: tensor of error values (e.g. one entry per image).

    Returns:
        0-dimensional tensor holding the mean of all entries.
    """
    overall_mean = torch.mean(nme)
    return overall_mean


def ced(nme):
    """Plot the Cumulative Error Distribution (CED) curve.

    The errors are sorted ascending and plotted against ``i / n`` for
    ``i = 0 .. n-1``, where ``n`` is the number of entries in ``nme``.

    Args:
        nme: 1-D tensor of per-image error values.

    Returns:
        None. The curve is drawn on the current matplotlib figure and shown.
    """
    # torch.sort returns a (values, indices) pair; only the values are plotted.
    sorted_nme = torch.sort(nme).values.detach().cpu().numpy()
    # Size the y-axis from the input itself so the function also works on
    # subsets and does not depend on a notebook-level variable.
    num_samples = sorted_nme.shape[0]
    cumulative = np.arange(num_samples) / float(max(num_samples, 1))

    plt.xlabel('Normalized Mean Error (NME)')
    # The plotted values lie in [0, 1), so they are fractions, not percentages.
    plt.ylabel('Fraction of Samples')
    plt.title('Plot of Cumulative Error Distribution (CED) Curve')
    plt.plot(sorted_nme, cumulative, marker='o')
    plt.show()


def auc(nme, alpha):
    """Fraction of entries in ``nme`` that are strictly below ``alpha``.

    Despite the name, this is the share of samples under the threshold (a
    value in [0, 1]), not an integrated area under the CED curve.

    Args:
        nme: tensor of error values.
        alpha: error threshold.

    Returns:
        0-dimensional float tensor; ``nan`` when ``nme`` is empty.
    """
    below_threshold = nme < alpha
    # Averaging the mask normalizes by the size of the input actually passed
    # in, so the result stays correct for majority/minority subsets.
    return below_threshold.float().mean()


def fr(nme, alpha):
    """Failure rate with respect to the threshold ``alpha``.

    Computed as ``1 - auc(nme, alpha)``, i.e. the complement of the share of
    samples that ``auc`` counts as being under the threshold.

    Args:
        nme: tensor of error values.
        alpha: failure threshold.

    Returns:
        The failure rate, as returned by ``1 - auc(...)``.
    """
    under_threshold_rate = auc(nme, alpha)
    return 1 - under_threshold_rate

In [None]:
# NOTE: this line rebinds the name `nme_images` from the function to its result
# (later cells index `nme_images` as a tensor). Re-run the metric-definition cell
# before re-running this one.
nme_images = nme_images(y_hat, y, m, d)
# The function is named `nme_landmark`; only the result is stored as `nme_landmarks`.
nme_landmarks = nme_landmark(y_hat, y, m, d)
# Keep the result under its own name so the `total_nme` function stays callable.
average_nme = total_nme(nme_images)

print('Average Normalized Mean Error:', average_nme)

In [None]:
# ced() only draws the plot and returns None; assigning its result back to `ced`
# would replace the function with None and break any re-run of this cell.
ced(nme_images)

In [None]:
"""Bias Metrics: Majority/Minority Difference in Error/Failure (MMD)"""
def mmd_err(majority, minority):
    """Majority/minority difference in mean error.

    Args:
        majority: error values for the majority group (samples along dim 0).
        minority: error values for the minority group (samples along dim 0).

    Returns:
        Mean over dim 0 of ``majority`` minus mean over dim 0 of ``minority``.
    """
    majority_mean = torch.mean(majority, dim=0)
    minority_mean = torch.mean(minority, dim=0)
    return majority_mean - minority_mean

def mmd_fr(majority, minority, alpha):
    """Majority/minority difference in failure rate.

    Args:
        majority: error values for the majority group.
        minority: error values for the minority group.
        alpha: failure threshold forwarded to ``fr``.

    Returns:
        ``fr(majority, alpha) - fr(minority, alpha)``.
    """
    # fr() takes only (nme, alpha); passing a `dim` keyword raises TypeError.
    majority_fr = fr(majority, alpha)
    minority_fr = fr(minority, alpha)
    return majority_fr - minority_fr

In [None]:
""" Within each loop:
z = sensitive attribute group (as column number)
maj = value of majority group
nme = num_samples dimension tensor
"""

mmd_err = {}
mmd_fr = {}


if calculate_landmark:
    mmd_error_landmarks = {}

# calculates MMD Error and MMD Failure averaged across all landmarks
for z, maj in Z:
    majority = nme_images[tensor.where(x[:, z] == maj)] # size # images in majority
    minority = nme_images[tensor.where(x[:, z] != maj)] # size # images in minority

    mmd_err[z] = mmd_err(majority, minority)
    mmd_fr[z] = mmd_fr(majority, minority, alpha)

# calculates MMD Error for each landmark    
if calculate_landmark:
    for z, maj in Z:
        majority = nme_landmarks[tensor.where(x[:, z] == maj)] # size # images in majority x m/2
        minority = nme_landmarks[tensor.where(x[:, z] != maj)] # size # images in minority x m/2
        mmd_error_landmarks[z] = mmd_error_landmarks(majority, minority)

In [None]:
print('MMD in Error:', mmd_error)
print('MMD in Failure Rate:', mmd_fr)

if calculate_landmark:
    print('MMD in Error per landmark')
    for z, error in mmd_error_landmarks.items():
        print(f'Landmark {z}: {error}')