In [2]:
import clip
import torch
import pandas as pd
import numpy as np
from PIL import Image

In [3]:
%matplotlib inline

print('\nLoading model...')
available_models = ['RN50', 'RN101', 'RN50x4', 'RN50x16']
layers = ['layer4', 'layer3', 'layer2', 'layer1']

clip_model = available_models[0]
saliency_layer = layers[0]

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load(clip_model, device=device, jit=False)
print(f"Done! Model loaded to {device} device")


Loading model...
Done! Model loaded to cuda device


In [4]:
path = "/home/lazye/Documents/ufrgs/mcs/datasets/FairFace/"
fface_df = pd.read_csv(f"{path}/train/fairface_label_train.csv")

In [5]:
fface_df.head()

Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


In [6]:
import glob
man_list = glob.glob('man_embeddings/*.npy')
wom_list = glob.glob('woman_embeddings/*.npy')

In [7]:
man_idx = [file_path.split('/')[-1].split('.')[0] for file_path in man_list]

In [8]:
embd = np.load(man_list[0])

In [11]:
def get_scores(img_embd, classes):
    """Softmax of pairs of 'img_embd' and each class of 'classes'"""
    image_features = torch.from_numpy(img_embd).to(device)

    text_inputs = torch.cat(
        [clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)
    
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    
    text_features /= text_features.norm(dim=-1, keepdim=True)
    similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    values, indices = similarity[0].topk(len(similarity[0]))
    scores = list()
    for value, index in zip(values, indices):
        scores.append((classes[index], round(100 * value.item(), 2)))
    return scores


In [12]:
labels = ['man', 'woman']
print(get_scores(embd, labels))

[('man', 92.09), ('woman', 7.92)]


In [13]:
def eval_embds(emb_list, target, classes):
    results = dict()

    text_inputs = torch.cat(
        [clip.tokenize(f"a photo of a {c}") for c in classes]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    for emb in emb_list:
        path = f'{target}_embeddings/{emb}.npy'
        image_features = np.load(path)
        image_features = torch.from_numpy(image_features).to(device)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

        values, indices = similarity[0].topk(len(similarity[0]))
        scores = list()
        for value, index in zip(values, indices):
            scores.append((classes[index], round(100 * value.item(), 2)))
        results[emb] = scores
    
    return results

In [19]:
man_results = eval_embds(man_idx, 'man', labels)

In [20]:
man_results

{'400': [('man', 92.09), ('woman', 7.92)],
 '1546': [('man', 92.09), ('woman', 7.92)],
 '1352': [('man', 97.75), ('woman', 2.26)],
 '600': [('man', 95.9), ('woman', 4.11)],
 '1184': [('woman', 51.17), ('man', 48.83)],
 '577': [('man', 98.29), ('woman', 1.73)],
 '1149': [('man', 51.17), ('woman', 48.83)],
 '706': [('man', 97.9), ('woman', 2.1)],
 '613': [('man', 96.83), ('woman', 3.16)],
 '465': [('man', 91.26), ('woman', 8.76)],
 '6': [('woman', 63.33), ('man', 36.67)],
 '349': [('man', 96.68), ('woman', 3.31)],
 '1403': [('man', 96.29), ('woman', 3.7)],
 '70': [('man', 88.72), ('woman', 11.28)],
 '184': [('man', 96.29), ('woman', 3.7)],
 '352': [('man', 85.21), ('woman', 14.81)],
 '185': [('man', 59.08), ('woman', 40.92)],
 '1842': [('man', 95.65), ('woman', 4.37)],
 '1428': [('woman', 82.23), ('man', 17.79)],
 '1187': [('man', 88.09), ('woman', 11.92)],
 '1275': [('man', 93.16), ('woman', 6.85)],
 '375': [('man', 87.89), ('woman', 12.08)],
 '1460': [('man', 97.46), ('woman', 2.54)],
