In [1]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
import datasets

In [15]:
# Load validation data and test on this
valid_ds = datasets.load_dataset('HuggingFaceM4/FairFace', '1.25', split="validation", verification_mode="no_checks")
valid_ds = valid_ds.shuffle(seed=42)
# valid_ds = valid_ds.shuffle(seed=42)
y_true = np.array(valid_ds["age"])

In [16]:
def evaluate(y_preds, y_true, header=''):
    precision, recall, f_score_weighted, _ = precision_recall_fscore_support(y_true, y_preds, average='weighted')
    _, _, f_score_macro, _ = precision_recall_fscore_support(y_true, y_preds, average='macro')
    _, _, f_score_micro, _ = precision_recall_fscore_support(y_true, y_preds, average='micro')
    class_rep = classification_report( y_true, y_preds, output_dict=True)
    print(f"{header} \n" + "="*40)
    print(f"Accuracy: {class_rep['accuracy']:.4f} Precision: {precision:.4f}, Recall: {recall:.4f}, F-Score(Weighted): {f_score_weighted:.4f}, F-Score(Micro): {f_score_micro:.4f}, F-Score(Macro): {f_score_macro:.4f}")

In [17]:
def map_over_under_50(y_preds, y_true):
    # Up to 50 is 0, over 50 is 1 
    y_ou_preds = np.array(y_preds) >= 6 # Classes 0 to 5 are up to 50, 6 and above are over 50 
    y_ou_true = np.array(y_true) >= 6 # Classes 0 to 5 are up to 50, 6 and above are over 50 
    print(classification_report( y_ou_true, y_ou_preds, labels=[0,1],target_names=["Up to 50", "Over 50"]))


In [18]:
def map_regression_to_class(y_preds_reg):
    # Define the boundaries
    bins = [-np.inf, 2, 9, 19, 29, 39, 49, 59, 69, np.inf]
    # Map each prediction to its bin
    y_preds = np.digitize(y_preds_reg, bins) - 1  # np.digitize returns indices starting from 1
    return y_preds
# y_preds_reg = np.array([1, 4, 15, 22, 35, 46, 55, 62, 77])
# y_preds = map_regression_to_class(y_preds_reg)

# print(y_preds)
# Output: [0 1 2 3 4 5 6 7 8]


In [19]:
# CLIP ZS evaluation
y_preds = np.load("clip_zs_age_preds_val_42.npy")
evaluate(y_preds, y_true, header='Validation set metrics - Age (CLIP ZS) ')
map_over_under_50(y_preds, y_true)


Validation set metrics - Age (CLIP ZS)  
Accuracy: 0.3984 Precision: 0.4782, Recall: 0.3984, F-Score(Weighted): 0.4110, F-Score(Micro): 0.3984, F-Score(Macro): 0.3714
              precision    recall  f1-score   support

    Up to 50       0.98      0.88      0.93      9719
     Over 50       0.47      0.88      0.61      1235

    accuracy                           0.88     10954
   macro avg       0.73      0.88      0.77     10954
weighted avg       0.93      0.88      0.89     10954



In [20]:
# CLIP LR evaluation
y_preds = np.load("clip_lr_age_preds_val_42.npy")
evaluate(y_preds, y_true, header='Validation set metrics - Age (CLIP + LR) ')
map_over_under_50(y_preds, y_true)

Validation set metrics - Age (CLIP + LR)  
Accuracy: 0.6013 Precision: 0.5953, Recall: 0.6013, F-Score(Weighted): 0.5958, F-Score(Micro): 0.6013, F-Score(Macro): 0.5667
              precision    recall  f1-score   support

    Up to 50       0.96      0.97      0.97      9719
     Over 50       0.75      0.70      0.73      1235

    accuracy                           0.94     10954
   macro avg       0.86      0.84      0.85     10954
weighted avg       0.94      0.94      0.94     10954



In [21]:
# Deepface 
classes = ["0-2", "3-9", "10-19", "20-29", "30-39", "40-49", "50-59", "60-69", "more than 70"]
y_preds_reg = np.load("deepace_age_preds_val_42.npy")
y_preds = map_regression_to_class(y_preds_reg)
evaluate(y_preds, y_true, header='Validation set metrics - Age (Deepface) ')
map_over_under_50(y_preds, y_true)
print(classification_report(y_true, y_preds))


Validation set metrics - Age (Deepface)  
Accuracy: 0.2663 Precision: 0.2495, Recall: 0.2663, F-Score(Weighted): 0.2191, F-Score(Micro): 0.2663, F-Score(Macro): 0.1545
              precision    recall  f1-score   support

    Up to 50       0.92      0.98      0.95      9719
     Over 50       0.70      0.29      0.41      1235

    accuracy                           0.91     10954
   macro avg       0.81      0.64      0.68     10954
weighted avg       0.89      0.91      0.89     10954

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       199
           1       0.00      0.00      0.00      1356
           2       0.20      0.01      0.01      1181
           3       0.37      0.22      0.28      3300
           4       0.24      0.59      0.34      2330
           5       0.24      0.45      0.31      1353
           6       0.37      0.17      0.24       796
           7       0.37      0.15      0.21       321
           8      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Validate GENDER for FAIRFACE


In [9]:
y_true = np.array(valid_ds["gender"])

In [None]:
y_preds = np.load("clip_zs_gender_preds_val_42.npy")
evaluate(y_preds, y_true, header='Validation set metrics - Gender (ZS) ')
print(classification_report(y_true, y_preds, labels=[0,1], target_names=["Male", "Female"]))

Validation set metrics - Age (ZS)  
Accuracy: 0.9506 Precision: 0.9512, Recall: 0.9506, F-Score(Weighted): 0.9506, F-Score(Micro): 0.9506, F-Score(Macro): 0.9505
              precision    recall  f1-score   support

        Male       0.97      0.94      0.95      5792
      Female       0.93      0.96      0.95      5162

    accuracy                           0.95     10954
   macro avg       0.95      0.95      0.95     10954
weighted avg       0.95      0.95      0.95     10954



In [None]:
y_preds = np.load("clip_lr_gender_preds_val_42.npy")
evaluate(y_preds, y_true, header='Validation set metrics - Gender (CLIP + LR) ')
print(classification_report(y_true, y_preds, labels=[0,1], target_names=["Male", "Female"]))

Validation set metrics - Age (CLIP + LR)  
Accuracy: 0.9616 Precision: 0.9616, Recall: 0.9616, F-Score(Weighted): 0.9616, F-Score(Micro): 0.9616, F-Score(Macro): 0.9615
              precision    recall  f1-score   support

        Male       0.97      0.96      0.96      5792
      Female       0.96      0.96      0.96      5162

    accuracy                           0.96     10954
   macro avg       0.96      0.96      0.96     10954
weighted avg       0.96      0.96      0.96     10954



In [12]:
y_preds = np.load("deepace_gender_preds_val_42.npy")
evaluate(y_preds, y_true, header='Validation set metrics - Gender (DeepFace) ')
print(classification_report(y_true, y_preds, labels=[0,1], target_names=["Male", "Female"]))

Validation set metrics - Gender (DeepFace)  
Accuracy: 0.7793 Precision: 0.7865, Recall: 0.7793, F-Score(Weighted): 0.7763, F-Score(Micro): 0.7793, F-Score(Macro): 0.7743
              precision    recall  f1-score   support

        Male       0.75      0.88      0.81      5792
      Female       0.83      0.67      0.74      5162

    accuracy                           0.78     10954
   macro avg       0.79      0.77      0.77     10954
weighted avg       0.79      0.78      0.78     10954



In [13]:
def accuracy_per_class(y_true, y_pred):
    classes = np.unique(y_true)
    accuracies = {}
    
    for cls in classes:
        idx = np.where(y_true == cls)[0]  # indices where the true label is class cls
        correct = np.sum(y_pred[idx] == y_true[idx])
        accuracies[cls] = correct / len(idx) if len(idx) > 0 else 0.0
        
    return accuracies

# Example usage:
y_true = np.array([0, 1, 2, 1, 0, 2, 2])
y_pred = np.array([0, 2, 1, 1, 0, 2, 1])

print(accuracy_per_class(y_true, y_pred))

{0: 1.0, 1: 0.5, 2: 0.3333333333333333}
