# Evaluate Gender Synonyms

## Imports

In [1]:
import torch
import clip
import numpy as np
import pandas as pd
import json

## Auxiliary Functions

In [19]:
def model_setup(model):
    """Load a CLIP model, falling back to a default when the name is unknown.

    Args:
        model: Name of the CLIP checkpoint to load (e.g. ``'ViT-B/16'``).
            It is checked against ``clip.available_models()``.

    Returns:
        The two-element result of ``clip.load``, returned as
        ``(model, pps)``.
    """

    available_models = clip.available_models()

    if model in available_models:
        print(f'Loading model: {model}')
        chosen_model = model
    else:
        # Fall back to the first advertised checkpoint and report the name
        # that is really loaded; the message used to name a fixed model
        # that did not match this choice.
        chosen_model = available_models[0]
        print(f'{model} unavailable! Using default model: {chosen_model}')

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # Use a separate local name so the `model` argument (a string) is not
    # silently rebound to the loaded network.
    loaded_model, pps = clip.load(chosen_model, device=device, jit=False)

    print(f'Done! Model loaded to {device} device')
    return loaded_model, pps


def get_similarities(img, txts):
    """Compute scaled similarity scores between image and text embeddings.

    Args:
        img: NumPy array with the image embedding(s); presumably shaped
            ``(1, dim)`` -- TODO confirm against the stored embeddings.
        txts: Torch tensor of text embeddings, one row per prompt.

    Returns:
        The tensor ``100.0 * img @ txts.T``, located on the same device
        as ``txts``.
    """
    # Follow the text embeddings' device and dtype instead of assuming CUDA,
    # so the product also works on CPU-only machines and never mixes devices.
    image_features = torch.from_numpy(img).to(
        device=txts.device, dtype=txts.dtype)
    similarity = 100.0 * image_features @ txts.T

    return similarity


def get_sims_dict(img_embs, txt_prompts, txt_embs):
    """Map every image file to its similarity score for each text prompt.

    Args:
        img_embs: DataFrame with a ``'file'`` column and an ``'embeddings'``
            column; each embedding is handed to ``get_similarities``.
        txt_prompts: Prompt labels, paired positionally with the scores.
        txt_embs: Text embeddings forwarded to ``get_similarities``.

    Returns:
        ``{file: {prompt: score}}`` where every score is a plain Python
        number.
    """
    scores_by_file = {}
    for _, row in img_embs.iterrows():
        file_name = row['file']
        sims = get_similarities(row['embeddings'], txt_embs)
        # Only the first row of the similarity result is paired with the
        # prompts.
        scores_by_file[file_name] = {
            prompt: sim.cpu().numpy().item()
            for prompt, sim in zip(txt_prompts, sims[0])
        }
    return scores_by_file


def get_top_synm(final_dict):
    """Pick the highest-scoring prompt for every file.

    Args:
        final_dict: ``{file: {prompt: score}}`` mapping, as built by
            ``get_sims_dict``.

    Returns:
        ``{'file': [...], 'winner': [...]}`` with two parallel lists. On a
        tie the prompt that appears first in the inner dict wins.

    Raises:
        ValueError: If a file has an empty score dict.
    """
    # Snapshot the keys into a list so the result does not stay tied to
    # (and change with) the input dictionary.
    files = list(final_dict)
    # max() keeps the first maximal item, i.e. the same tie rule as taking
    # the first index where the score equals the maximum.
    wins = [max(scores, key=scores.get) for scores in final_dict.values()]

    top_synm_dict = {'file': files, 'winner': wins}
    return top_synm_dict


def get_sum_synms(final_dict, man_prompts):
    """Predict gender by comparing summed similarities of the two groups.

    Every prompt found in ``man_prompts`` adds to the male total; every
    other prompt adds to the female total. Raw sums are compared, which
    matches comparing averages only when both groups hold the same number
    of prompts.

    Args:
        final_dict: ``{file: {prompt: score}}`` mapping, as built by
            ``get_sims_dict``.
        man_prompts: Collection of the prompts counted as male synonyms.

    Returns:
        ``{'file': [...], 'preds': [...]}`` with two parallel lists; each
        prediction is ``'Male'`` or ``'Female'``. A tie yields
        ``'Female'``.
    """
    # Snapshot the keys into a list so the result does not stay tied to
    # (and change with) the input dictionary.
    files = list(final_dict)
    # Build the lookup once instead of scanning man_prompts per score.
    man_lookup = set(man_prompts)
    preds = []

    for scores in final_dict.values():
        man_score = 0
        woman_score = 0
        for prompt, score in scores.items():
            if prompt in man_lookup:
                man_score += score
            else:
                woman_score += score
        preds.append('Male' if man_score > woman_score else 'Female')

    sum_dict = {'file': files, 'preds': preds}
    return sum_dict


def synm_to_gender(synm, man_prompts):
    """Translate a synonym prompt into a binary gender label.

    Returns ``'Male'`` when ``synm`` is contained in ``man_prompts`` and
    ``'Female'`` for anything else.
    """
    return 'Male' if synm in man_prompts else 'Female'


def generate_final_df(fface_df, score_df):
    """Attach the score columns to the original frame, matched on 'file'.

    Both frames are re-indexed by their ``'file'`` column and combined
    with ``DataFrame.join``; the inputs themselves are left untouched.
    """
    labels_by_file = fface_df.set_index('file')
    scores_by_file = score_df.set_index('file')
    return labels_by_file.join(scores_by_file)


def map_synm_to_gender(df, man_prompts):
    """Return a copy of ``df`` whose 'winner' column holds gender labels.

    Each winning synonym is passed through ``synm_to_gender`` together
    with ``man_prompts``; the input frame is not modified.
    """
    def to_gender(synm):
        return synm_to_gender(synm, man_prompts)

    mapped_df = df.copy()
    mapped_df['winner'] = df['winner'].map(to_gender)
    return mapped_df


## Load Variables

In [3]:
# NOTE: machine-specific absolute path; point ROOT at the local checkout
# before running.
ROOT = "/home/lazye/Documents/ufrgs/mcs/clip/clip-bias-explore/\
fair-face-classification"
LABELS_PATH = ROOT + "/data/labels"
EMBS_PATH = ROOT + "/data/embeddings"
# Device for the stored text embeddings; same rule model_setup applies.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of leading prompts in the JSON file that are treated as male
# synonyms; the remaining prompts are treated as female synonyms.
# NOTE(review): assumes the JSON lists the male prompts first -- confirm.
N_MAN_PROMPTS = 10

fface_df = pd.read_csv(ROOT+"/data/fface_val.csv")
# NOTE: model_setup returns a two-element tuple, so vit_model holds both
# the loaded model and its companion object.
vit_model = model_setup('ViT-B/16')
img_embs = pd.read_pickle(EMBS_PATH+"/fface_val_img_embs.pkl")
# map_location keeps the file loadable on machines that lack the device
# the embeddings were saved from.
txt_embs = torch.load(EMBS_PATH+"/synms_labels.pt", map_location=DEVICE)

# Explicit encoding so the prompts are read identically on every platform.
with open(LABELS_PATH+"/bots_synms_prompts.json", encoding="utf-8") as f:
    data = json.load(f)

prompt_list = list(data.values())
man_prompts = prompt_list[:N_MAN_PROMPTS]
woman_prompts = prompt_list[N_MAN_PROMPTS:]

Loading model: ViT-B/16
Done! Model loaded to cuda device


In [None]:
# Score every image embedding against every prompt embedding once; the
# evaluation cells below reuse this dictionary.
sims_dict = get_sims_dict(
    img_embs=img_embs,
    txt_prompts=prompt_list,
    txt_embs=txt_embs,
)

In [32]:
# Ensemble prediction: compare the summed male and female similarities.
sum_dict = get_sum_synms(sims_dict, man_prompts)
sum_df = pd.DataFrame(data=sum_dict)
print("Prompt sum predictions")
print(sum_df.head())

# Join the predictions onto the labels, then tidy the columns in one chain
# instead of mutating the frame in place.
final_sum_df = (
    generate_final_df(fface_df, sum_df)
    .rename(columns={'preds': 'gender_preds'})
    .drop(columns=['service_test'])
)
print("\nFinal prompt sum df")
print(final_sum_df.head())
final_sum_df.to_csv("avg_sum_synms.csv")

Prompt sum predictions
        file   preds
0  val/1.jpg    Male
1  val/2.jpg  Female
2  val/3.jpg    Male
3  val/4.jpg  Female
4  val/5.jpg    Male

Final prompt sum df
             age  gender             race gender_preds
file                                                  
val/1.jpg    3-9    Male       East Asian         Male
val/2.jpg  50-59  Female       East Asian       Female
val/3.jpg  30-39    Male            White         Male
val/4.jpg  20-29  Female  Latino_Hispanic       Female
val/5.jpg  20-29    Male  Southeast Asian         Male


In [29]:
# Top-synonym prediction: keep the single best-scoring prompt per image.
top_dict = get_top_synm(final_dict=sims_dict)
top_df = pd.DataFrame(data=top_dict)
print("Top[k] predictions")
print(top_df.head())

# Collapse the winning synonym to a gender label, join it onto the labels,
# then tidy the columns in one chain instead of mutating the frame in place.
bin_top_df = map_synm_to_gender(top_df, man_prompts)
final_top_df = (
    generate_final_df(fface_df, bin_top_df)
    .rename(columns={'winner': 'gender_preds'})
    .drop(columns=['service_test'])
)
print("\n Top[k] final df")
print(final_top_df.head())
final_top_df.to_csv('top_k_synms.csv')


Top[k] predictions
        file                        winner
0  val/1.jpg              a photo of a boy
1  val/2.jpg       a photo of an old woman
2  val/3.jpg  a photo of a middle-aged man
3  val/4.jpg    a photo of a female person
4  val/5.jpg      a photo of a male person

 Top[k] final df
             age  gender             race gender_preds
file                                                  
val/1.jpg    3-9    Male       East Asian         Male
val/2.jpg  50-59  Female       East Asian       Female
val/3.jpg  30-39    Male            White         Male
val/4.jpg  20-29  Female  Latino_Hispanic       Female
val/5.jpg  20-29    Male  Southeast Asian         Male
