In [1]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.io import read_image
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
from torchvision.transforms.functional import to_pil_image
from transformers import CLIPProcessor, CLIPModel
from tqdm.auto import tqdm
# from PIL import Image
import numpy as np
import os
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
import joblib
import random

In [2]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cpu") if not torch.cuda.is_available() else torch.device("cuda")

In [3]:
# Seed the three RNGs this notebook imports (NumPy, PyTorch, stdlib `random`)
# with one shared value.
seed = 10
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)

In [4]:
# Load the pre-processor and the model from the same pretrained checkpoint,
# moving the model onto `device` as part of the same expression.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)




In [5]:
# Every split lives under one AffectNet root, located two directories above
# the current working directory.
_affectnet_root = os.path.join(os.getcwd(), "..", "..", "AffectNet")
_train_root = os.path.join(_affectnet_root, "train_set", "train_set")
_val_root = os.path.join(_affectnet_root, "val_set", "val_set")

TRAIN_IMG_PATH = os.path.join(_train_root, "images")
TRAIN_LABELS_PATH = os.path.join(_train_root, "annotations")
VAL_IMG_PATH = os.path.join(_val_root, "images")
VAL_LABELS_PATH = os.path.join(_val_root, "annotations")

In [6]:
# os.listdir(TRAIN_LABELS_PATH)

In [7]:
class CustomAffectNetDataset(Dataset):
    """Map-style dataset that pairs each image file with its expression label.

    Every entry of ``img_dir`` is treated as an image. The label for an image
    named ``<stem><ext>`` is loaded from ``<stem>_exp.npy`` in
    ``annotations_dir`` and converted to ``int``.

    Args:
        annotations_dir: Directory containing the ``*_exp.npy`` label files.
        img_dir: Directory containing the image files.
        transform: Optional callable applied to the decoded image.
        target_transform: Optional callable applied to the integer label.
    """

    def __init__(self, annotations_dir, img_dir, transform=None, target_transform=None):
        self.img_dir = img_dir
        self.label_dir = annotations_dir
        # store the image files in sorted order so index -> file is deterministic
        self.img_files = sorted(os.listdir(img_dir))
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of entries found in ``img_dir``."""
        return len(self.img_files)

    def _label_path(self, idx):
        """Return the path of the ``_exp.npy`` label file for image ``idx``."""
        # splitext strips only the final extension, so a name such as
        # "face.v2.jpg" maps to "face.v2_exp.npy" rather than "face_exp.npy".
        stem, _ext = os.path.splitext(self.img_files[idx])
        return os.path.join(self.label_dir, f"{stem}_exp.npy")

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``.

        The image is whatever ``read_image`` returns for the file (optionally
        passed through ``transform``); the label is the ``int`` value of the
        loaded ``.npy`` file (optionally passed through ``target_transform``).
        """
        img_path = os.path.join(self.img_dir, self.img_files[idx])
        image = read_image(img_path)
        label = int(np.load(self._label_path(idx)))
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label

In [8]:
# Training split: images and their annotation files, no transforms.
training_data = CustomAffectNetDataset(annotations_dir=TRAIN_LABELS_PATH, img_dir=TRAIN_IMG_PATH)

In [9]:
# Iterate the training split in shuffled batches of 16.
train_dataloader = DataLoader(dataset=training_data, shuffle=True, batch_size=16)

In [10]:
# import gc; gc.collect()

In [11]:
# Run CLIP once over the training set and collect, for every kept sample:
#   zs_preds   - zero-shot prediction (argmax over the image-text similarity)
#   labels     - ground-truth label
#   embeddings - CLIP image embedding
zs_preds = []
labels = []
deepface_labels = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
# Prompt at position i is scored against label id i by the metric cells below.
emotion_prompts = ["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]
# Ignore contempt since deepface ignores it
label_to_ignore = 7
# Per-batch CPU tensors; concatenated once after the loop instead of
# re-copying the whole accumulated tensor on every batch.
embedding_chunks = []

# Inference only: no_grad stops autograd from recording a graph per batch.
with torch.no_grad():
    for img, label in tqdm(train_dataloader):
        relevance_mask = label != label_to_ignore
        if not relevance_mask.any():
            # Whole batch was the ignored class; there is nothing to encode.
            continue
        img = img[relevance_mask]
        label = label[relevance_mask]
        pil_images = [to_pil_image(i) for i in img]
        inputs = processor(text=emotion_prompts, images=pil_images, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        emotion_pred = logits_per_image.argmax(dim=1)  # we can take the argmax
        zs_preds += emotion_pred.tolist()
        labels += label.tolist()
        embedding_chunks.append(outputs.image_embeds.detach().cpu())

# Same result type as before: an empty tensor when no batch was processed.
embeddings = torch.cat(embedding_chunks) if embedding_chunks else torch.Tensor([])

  0%|          | 0/17979 [00:00<?, ?it/s]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [12]:
# Write the collected predictions, labels and embeddings to .npy files in the
# working directory.
for out_name, payload in (
    ("tmp_train_zs_preds.npy", zs_preds),
    ("tmp_train_labels.npy", labels),
    ("tmp_train_embeddings.npy", embeddings),
):
    with open(out_name, "wb") as f:
        np.save(f, np.array(payload))

In [13]:
# img_files[:15]

In [14]:
# Image.open(os.path.join(TRAIN_IMG_PATH, img_files[0]))

In [15]:
# label_files = sorted([file for file in os.listdir(TRAIN_LABELS_PATH) if 'exp' in file])

In [16]:
# label_files[:15]

In [17]:
# Reload the three training arrays from disk (as NumPy arrays).
embeddings, labels, zs_preds = map(
    np.load,
    ("tmp_train_embeddings.npy", "tmp_train_labels.npy", "tmp_train_zs_preds.npy"),
)

In [18]:
# Logistic-regression classifier fitted on the embeddings against the labels.
# The fit call stays as the last expression so the cell still shows the estimator.
lr_clf_emotion = LogisticRegression(max_iter=400, random_state=42)
lr_clf_emotion.fit(X=embeddings, y=labels)

In [19]:
# Evaluate the classifier on the same embeddings it was fitted on.
y_preds = lr_clf_emotion.predict(embeddings)
train_acc = lr_clf_emotion.score(embeddings, labels)
# Weighted average supplies precision/recall/F; macro and micro are only used for their F-score.
precision, recall, f_score_weighted, _ = precision_recall_fscore_support(labels, y_preds, average='weighted')
f_score_macro = precision_recall_fscore_support(labels, y_preds, average='macro')[2]
f_score_micro = precision_recall_fscore_support(labels, y_preds, average='micro')[2]
print("Training set metrics - Emotion (LR + CLIP) \n" + "=" * 40)
print(
    f"Accuracy: {train_acc:.4f} Precision: {precision:.4f}, Recall: {recall:.4f}, "
    f"F-Score(Weighted): {f_score_weighted:.4f}, F-Score(Micro): {f_score_micro:.4f}, "
    f"F-Score(Macro): {f_score_macro:.4f}"
)

Training set metrics - Emotion (LR + CLIP) 
Accuracy: 0.7623 Precision: 0.7563, Recall: 0.7623, F-Score(Weighted): 0.7522, F-Score(Micro): 0.7623, F-Score(Macro): 0.5429


In [20]:
# Per-class accuracy: of the samples whose true label is `idx`, the percentage
# that `y_preds` also assigns to `idx`.
true_arr = np.array(labels)
pred_arr = np.array(y_preds)
emotion_names = ["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]
for idx in range(len(emotion_names)):
    emotion = emotion_names[idx]
    emotion_mask = true_arr == idx
    y_true = true_arr[emotion_mask]
    y_rel_preds = pred_arr[emotion_mask]
    n_correct = (y_true == y_rel_preds).sum()
    emotion_acc = n_correct / len(y_true) * 100
    print(f"LR+CLIP accuracy for {emotion}(class-{idx}): {emotion_acc:.2f}%")

LR+CLIP accuracy for Neutral(class-0): 79.37%
LR+CLIP accuracy for Happy(class-1): 91.97%
LR+CLIP accuracy for Sad(class-2): 50.92%
LR+CLIP accuracy for Surprise(class-3): 37.27%
LR+CLIP accuracy for Fear(class-4): 32.82%
LR+CLIP accuracy for Disgust(class-5): 11.18%
LR+CLIP accuracy for Anger(class-6): 50.84%


In [21]:
# Same report as for the classifier, but scoring the zero-shot predictions.
y_preds = zs_preds
train_acc = np.sum(np.array(labels) == y_preds) / len(y_preds)
precision, recall, f_score_weighted, _ = precision_recall_fscore_support(labels, y_preds, average='weighted')
f_score_macro = precision_recall_fscore_support(labels, y_preds, average='macro')[2]
f_score_micro = precision_recall_fscore_support(labels, y_preds, average='micro')[2]
print("Training set metrics - Emotion (ZS CLIP) \n" + "=" * 40)
print(
    f"Accuracy: {train_acc:.4f} Precision: {precision:.4f}, Recall: {recall:.4f}, "
    f"F-Score(Weighted): {f_score_weighted:.4f}, F-Score(Micro): {f_score_micro:.4f}, "
    f"F-Score(Macro): {f_score_macro:.4f}"
)

Training set metrics - Emotion (ZS CLIP) 
Accuracy: 0.4440 Precision: 0.5904, Recall: 0.4440, F-Score(Weighted): 0.4948, F-Score(Micro): 0.4440, F-Score(Macro): 0.2821


In [22]:
# Per-class accuracy of the zero-shot predictions: `y_preds` was rebound to
# `zs_preds` in the preceding metrics cell, so the printed label says "ZS CLIP"
# (it previously said "LR+CLIP", mislabelling these numbers).
for idx, emotion in enumerate(["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]):
    # Select the samples whose true label is this class
    emotion_mask = np.array(labels) == idx
    y_true = np.array(labels)[emotion_mask]
    y_rel_preds = np.array(y_preds)[emotion_mask]
    # Share of those samples predicted as this class, as a percentage
    emotion_acc = np.sum(y_true == y_rel_preds) / len(y_true) * 100
    print(f"ZS CLIP accuracy for {emotion}(class-{idx}): {emotion_acc:.2f}%")

LR+CLIP accuracy for Neutral(class-0): 21.32%
LR+CLIP accuracy for Happy(class-1): 64.82%
LR+CLIP accuracy for Sad(class-2): 24.91%
LR+CLIP accuracy for Surprise(class-3): 23.85%
LR+CLIP accuracy for Fear(class-4): 8.40%
LR+CLIP accuracy for Disgust(class-5): 45.25%
LR+CLIP accuracy for Anger(class-6): 44.19%


In [23]:
# Validation split: images and their annotation files, no transforms.
validation_data = CustomAffectNetDataset(annotations_dir=VAL_LABELS_PATH, img_dir=VAL_IMG_PATH)

In [24]:
# Iterate the validation split in shuffled batches of 16.
val_dataloader = DataLoader(dataset=validation_data, shuffle=True, batch_size=16)

In [25]:
# Run CLIP once over the validation set and collect, for every kept sample:
#   zs_preds   - zero-shot prediction (argmax over the image-text similarity)
#   labels     - ground-truth label
#   embeddings - CLIP image embedding
zs_preds = []
labels = []
deepface_labels = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
# Prompt at position i is scored against label id i by the metric cells below.
emotion_prompts = ["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]
# Ignore contempt since deepface ignores it
label_to_ignore = 7
# Per-batch CPU tensors; concatenated once after the loop instead of
# re-copying the whole accumulated tensor on every batch.
embedding_chunks = []

# Inference only: no_grad stops autograd from recording a graph per batch.
with torch.no_grad():
    for img, label in tqdm(val_dataloader):
        relevance_mask = label != label_to_ignore
        if not relevance_mask.any():
            # Whole batch was the ignored class; there is nothing to encode.
            continue
        img = img[relevance_mask]
        label = label[relevance_mask]
        pil_images = [to_pil_image(i) for i in img]
        inputs = processor(text=emotion_prompts, images=pil_images, return_tensors="pt", padding=True).to(device)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        emotion_pred = logits_per_image.argmax(dim=1)  # we can take the argmax
        zs_preds += emotion_pred.tolist()
        labels += label.tolist()
        embedding_chunks.append(outputs.image_embeds.detach().cpu())

# Same result type as before: an empty tensor when no batch was processed.
embeddings = torch.cat(embedding_chunks) if embedding_chunks else torch.Tensor([])

  0%|          | 0/250 [00:00<?, ?it/s]

In [26]:
# Write the collected validation predictions, labels and embeddings to .npy
# files in the working directory.
for out_name, payload in (
    ("tmp_val_zs_preds.npy", zs_preds),
    ("tmp_val_labels.npy", labels),
    ("tmp_val_embeddings.npy", embeddings),
):
    with open(out_name, "wb") as f:
        np.save(f, np.array(payload))

In [27]:
# Reload the three validation arrays from disk (as NumPy arrays).
embeddings, labels, zs_preds = map(
    np.load,
    ("tmp_val_embeddings.npy", "tmp_val_labels.npy", "tmp_val_zs_preds.npy"),
)

In [28]:
# Evaluate the fitted classifier on the validation embeddings.
# `train_acc` is reused as the variable name but holds validation accuracy here.
y_preds = lr_clf_emotion.predict(embeddings)
train_acc = lr_clf_emotion.score(embeddings, labels)
# Weighted average supplies precision/recall/F; macro and micro are only used for their F-score.
precision, recall, f_score_weighted, _ = precision_recall_fscore_support(labels, y_preds, average='weighted')
f_score_macro = precision_recall_fscore_support(labels, y_preds, average='macro')[2]
f_score_micro = precision_recall_fscore_support(labels, y_preds, average='micro')[2]
print("Validation set metrics - Emotion (LR + CLIP) \n" + "=" * 40)
print(
    f"Accuracy: {train_acc:.4f} Precision: {precision:.4f}, Recall: {recall:.4f}, "
    f"F-Score(Weighted): {f_score_weighted:.4f}, F-Score(Micro): {f_score_micro:.4f}, "
    f"F-Score(Macro): {f_score_macro:.4f}"
)

Validation set metrics - Emotion (LR + CLIP) 
Accuracy: 0.4869 Precision: 0.5890, Recall: 0.4869, F-Score(Weighted): 0.4534, F-Score(Micro): 0.4869, F-Score(Macro): 0.4534


In [29]:
# Per-class accuracy on the validation data: of the samples whose true label
# is `idx`, the percentage that `y_preds` also assigns to `idx`.
true_arr = np.array(labels)
pred_arr = np.array(y_preds)
emotion_names = ["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]
for idx in range(len(emotion_names)):
    emotion = emotion_names[idx]
    emotion_mask = true_arr == idx
    y_true = true_arr[emotion_mask]
    y_rel_preds = pred_arr[emotion_mask]
    n_correct = (y_true == y_rel_preds).sum()
    emotion_acc = n_correct / len(y_true) * 100
    print(f"LR+CLIP accuracy for {emotion}(class-{idx}): {emotion_acc:.2f}%")

LR+CLIP accuracy for Neutral(class-0): 78.80%
LR+CLIP accuracy for Happy(class-1): 93.20%
LR+CLIP accuracy for Sad(class-2): 49.80%
LR+CLIP accuracy for Surprise(class-3): 29.20%
LR+CLIP accuracy for Fear(class-4): 31.80%
LR+CLIP accuracy for Disgust(class-5): 8.20%
LR+CLIP accuracy for Anger(class-6): 49.80%


In [30]:
# Same report as for the classifier, but scoring the zero-shot predictions.
# `train_acc` is reused as the variable name but holds validation accuracy here.
y_preds = zs_preds
train_acc = np.sum(np.array(labels) == y_preds) / len(y_preds)
precision, recall, f_score_weighted, _ = precision_recall_fscore_support(labels, y_preds, average='weighted')
f_score_macro = precision_recall_fscore_support(labels, y_preds, average='macro')[2]
f_score_micro = precision_recall_fscore_support(labels, y_preds, average='micro')[2]
print("Validation set metrics - Emotion (ZS CLIP) \n" + "=" * 40)
print(
    f"Accuracy: {train_acc:.4f} Precision: {precision:.4f}, Recall: {recall:.4f}, "
    f"F-Score(Weighted): {f_score_weighted:.4f}, F-Score(Micro): {f_score_micro:.4f}, "
    f"F-Score(Macro): {f_score_macro:.4f}"
)

Validation set metrics - Emotion (ZS CLIP) 
Accuracy: 0.3477 Precision: 0.3928, Recall: 0.3477, F-Score(Weighted): 0.3363, F-Score(Micro): 0.3477, F-Score(Macro): 0.3363


In [31]:
# Per-class accuracy of the zero-shot predictions on the validation data:
# `y_preds` was rebound to `zs_preds` in the preceding metrics cell, so the
# printed label says "ZS CLIP" (it previously said "LR+CLIP", mislabelling
# these numbers).
for idx, emotion in enumerate(["Neutral", "Happy", "Sad", "Surprise", "Fear", "Disgust", "Anger"]):
    # Select the samples whose true label is this class
    emotion_mask = np.array(labels) == idx
    y_true = np.array(labels)[emotion_mask]
    y_rel_preds = np.array(y_preds)[emotion_mask]
    # Share of those samples predicted as this class, as a percentage
    emotion_acc = np.sum(y_true == y_rel_preds) / len(y_true) * 100
    print(f"ZS CLIP accuracy for {emotion}(class-{idx}): {emotion_acc:.2f}%")

LR+CLIP accuracy for Neutral(class-0): 24.60%
LR+CLIP accuracy for Happy(class-1): 72.60%
LR+CLIP accuracy for Sad(class-2): 24.60%
LR+CLIP accuracy for Surprise(class-3): 22.00%
LR+CLIP accuracy for Fear(class-4): 9.60%
LR+CLIP accuracy for Disgust(class-5): 46.00%
LR+CLIP accuracy for Anger(class-6): 44.00%


In [32]:
# Save Emotion model
model_path = '../models/lr_clf_emotion.joblib'
# Create the target directory first so the dump does not fail when it is missing
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(lr_clf_emotion, model_path)
print("Model saved successfully!")
# To load the model from the file later
clf_emotion_loaded = joblib.load(model_path)
print("Model loaded successfully!")

Model saved successfully!
Model loaded successfully!


In [33]:
# Bare last expression: the notebook shows the classifier's repr as this cell's output.
lr_clf_emotion