In [1]:
from io import BytesIO
from PIL import Image
import datasets
from transformers import CLIPProcessor, CLIPModel
from sklearn.linear_model import LogisticRegression
import torch
import numpy as np
# from tqdm.auto import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Load model and pre-processor
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model.to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [4]:
def get_embedding_and_zs(sample):
    # import pdb; pdb.set_trace()
    
    # Gender prediction
    inputs = processor(text=["the face of a woman", "the face of a man"], images=sample["image"], return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    gender_pred = logits_per_image.argmax(dim=1) # we can take the argmax
    
    sample["zs_gender_clip"] = [int(gp) for gp in gender_pred]
    # Store embeddings
    sample["embeddings"] = outputs.image_embeds
    
    # Race prediction
    inputs = processor(text=["East Asian", "Indian", "Black", "White", "Middle Eastern", "Latino_Hispanic", "Southeast Asian"], images=sample["image"], return_tensors="pt", padding=True).to(device)
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image # this is the image-text similarity score
    race_pred = logits_per_image.argmax(dim=1) # we can take the argmax
    
    sample["zs_race_clip"] = [int(rp) for rp in race_pred]
    return sample
    

In [5]:
# Load training data
train_ds = datasets.load_dataset('HuggingFaceM4/FairFace', '1.25', split='train', verification_mode="no_checks")
train_ds = train_ds.shuffle(seed=42).select([i for i in range(10_000)]) # Take only first 20_000 images
train_ds = train_ds.map(get_embedding_and_zs, batched = True, batch_size=16)

In [6]:
train_ds

Dataset({
    features: ['image', 'age', 'gender', 'race', 'service_test', 'zs_gender_clip', 'embeddings', 'zs_race_clip'],
    num_rows: 10000
})

In [7]:
# Load validation data as first 50% of val split
valid_ds = datasets.load_dataset('HuggingFaceM4/FairFace', '1.25', split="validation[:50%]", verification_mode="no_checks")
valid_ds = valid_ds.shuffle(seed=42).select([i for i in range(3_000)]) # Take only first 3_000 images
valid_ds = valid_ds.map(get_embedding_and_zs, batched = True, batch_size=16)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [8]:
# Load test data as second 50% of val split
test_ds = datasets.load_dataset('HuggingFaceM4/FairFace', '1.25', split="validation[50%:]", verification_mode="no_checks")
test_ds = test_ds.shuffle(seed=42).select([i for i in range(3_000)]) # Take only first 3_000 images
test_ds = test_ds.map(get_embedding_and_zs, batched = True, batch_size=16)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [9]:
train_ds

Dataset({
    features: ['image', 'age', 'gender', 'race', 'service_test', 'zs_gender_clip', 'embeddings', 'zs_race_clip'],
    num_rows: 10000
})

In [10]:
np.array(train_ds[:]["embeddings"]).shape

(10000, 512)

In [11]:
X_train = np.array(train_ds[:]["embeddings"])
y_train_gender = np.array(train_ds[:]["gender"])
y_train_race = np.array(train_ds[:]["race"])

In [12]:
lr_clf_gender = LogisticRegression(random_state=42)
lr_clf_gender.fit(X_train, y_train_gender)

In [13]:
lr_clf_race = LogisticRegression(random_state=42)
lr_clf_race.fit(X_train, y_train_race)

In [14]:
X_val = np.array(valid_ds[:]["embeddings"])
y_val_gender = np.array(valid_ds[:]["gender"])
y_val_race = np.array(valid_ds[:]["race"])

In [15]:
train_acc = lr_clf_gender.score(X_train, y_train_gender)
val_acc = lr_clf_gender.score(X_val, y_val_gender)
print(f"Training accuracy for gender: {train_acc:.4f}%")
print(f"Validation accuracy for gender: {val_acc:.4f}%")

Training accuracy for gender: 0.9601%
Validation accuracy for gender: 0.9633%


In [16]:
train_acc = lr_clf_race.score(X_train, y_train_race)
val_acc = lr_clf_race.score(X_val, y_val_race)
print(f"Training accuracy for race: {train_acc:.4f}%")
print(f"Validation accuracy for race: {val_acc:.4f}%")

Training accuracy for race: 0.7482%
Validation accuracy for race: 0.7180%


In [17]:
# Zero shot metrics
train_acc = np.sum(np.array(train_ds[:]["zs_gender_clip"]) == np.array(train_ds[:]["gender"])) / len(train_ds)
val_acc = np.sum(np.array(valid_ds[:]["zs_gender_clip"]) == np.array(valid_ds[:]["gender"])) / len(valid_ds)
print(f"Training accuracy for gender: {train_acc:.4f}%")
print(f"Validation accuracy for gender: {val_acc:.4f}%")

Training accuracy for gender: 0.0516%
Validation accuracy for gender: 0.0480%


In [18]:
# Zero shot metrics
train_acc = np.sum(np.array(train_ds[:]["zs_race_clip"]) == np.array(train_ds[:]["race"])) / len(train_ds)
val_acc = np.sum(np.array(valid_ds[:]["zs_race_clip"]) == np.array(valid_ds[:]["race"])) / len(valid_ds)
print(f"Training accuracy for race: {train_acc:.4f}%")
print(f"Validation accuracy for race: {val_acc:.4f}%")

Training accuracy for race: 0.4367%
Validation accuracy for race: 0.4290%
