In [128]:
import os
import shutil
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm
import json

In [123]:
def setup_experiment():
    # Remove clip demography folder
    if os.path.exists("data/clip_demography"):
        shutil.rmtree("data/clip_demography")
    os.mkdir("data/clip_demography")

In [130]:
def generate_demography_data_with_clip():
    # Class refs
    class_gender_refs = [
        "man",
        "woman"
    ]
    class_race_refs=["East Asian",
        "Indian",
        "Black",
        "White",
        "Middle Eastern",
        "Latino Hispanic",
        "Southeast Asian"
    ]
    class_emotion_refs = [
        "angry", 
        "fear", 
        "neutral", 
        "sad", 
        "disgust", 
        "happy", 
        "surprise"
    ]
        
        
    # Load model and pre-processor
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    # Get face file names
    face_files = os.listdir("data/faces")
    f_names = [f"data/faces/{file}" for file in face_files]
    
    # Create batches of faces (8 at a time), generate their file
    batch_size = 8
    for idx in tqdm(range(0,len(f_names), batch_size)):
        rel_f_names = f_names[idx:idx+batch_size]
        demo_data = []
        rel_images = [Image.open(rel_f_name) for rel_f_name in rel_f_names] # Load relevant images in batch
        # Get gender and confidences
        inputs = processor(text=["the face of a man", "the face of a woman"], images=rel_images, return_tensors="pt", padding=True)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image # this is the image-text similarity score
        gender_probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
        # Get race and confidences
        inputs = processor(text=["East Asian", "Indian", "Black", "White", "Middle Eastern", "Latino Hispanic", "Southeast Asian"], images=rel_images, return_tensors="pt", padding=True)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image # this is the image-text similarity score
        race_probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
        # Get emotion and confidences
        inputs = processor(text=["angry emotion", "emotion of fear", "neutral emotion", "sad emotion", "emotion of disgust", "happy emotion", "emotion of surprise"], images=rel_images, return_tensors="pt", padding=True)
        outputs = model(**inputs)
        logits_per_image = outputs.logits_per_image # this is the image-text similarity score
        emotion_probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
        for jdx in range(len(gender_probs)):
            g_probs, r_probs, e_probs = gender_probs[jdx], race_probs[jdx], emotion_probs[jdx] 
            tmp_demo={}
            
            tmp_demo["dominant_emotion"] = class_emotion_refs[int(torch.argmax(e_probs))]
            tmp_demo["emotion"] = {
                k: float(v) for k,v in zip(class_emotion_refs, e_probs)
            }
            
            tmp_demo["dominant_race"] = class_race_refs[int(torch.argmax(r_probs))]
            tmp_demo["race"] = {
                k: float(v) for k,v in zip(class_race_refs, r_probs)
            }
            
            tmp_demo["dominant_gender"] = class_gender_refs[int(torch.argmax(g_probs))]
            tmp_demo["gender"] = {
                k: float(v) for k,v in zip(class_gender_refs, g_probs)
            }
            json_name = rel_f_names[jdx].split("/")[-1].split(".")[0]
            demo_data.append((json_name, tmp_demo))
    import pdb; pdb.set_trace()
    print("Saving:")
    for f_name, demography in tqdm(demo_data):
        with open(f"data/clip_demography/{f_name}.json", "w") as f:
            json.dump(demography, f)
        
            
        
        
        
        
        

In [131]:
setup_experiment()
generate_demography_data_with_clip()

100%|████████████████████████████████████████████████████████████████████████████████| 141/141 [08:50<00:00,  3.76s/it]


> [1;32mc:\users\debadyuti\appdata\local\temp\ipykernel_35272\2255756544.py[0m(75)[0;36mgenerate_demography_data_with_clip[1;34m()[0m



ipdb>  c


Saving:


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 75.27it/s]


[('f97f_9870_0',
  {'dominant_emotion': 'disgust',
   'emotion': {'angry': 0.18374894559383392,
    'fear': 0.0740252286195755,
    'neutral': 0.10968844592571259,
    'sad': 0.0838051363825798,
    'disgust': 0.2827147841453552,
    'happy': 0.05140778422355652,
    'surprise': 0.21460969746112823},
   'dominant_race': 'Indian',
   'race': {'East Asian': 0.06731825321912766,
    'Indian': 0.47527313232421875,
    'Black': 0.030623245984315872,
    'White': 0.07833714038133621,
    'Middle Eastern': 0.26388126611709595,
    'Latino Hispanic': 0.06679756194353104,
    'Southeast Asian': 0.017769306898117065},
   'dominant_gender': 'man',
   'gender': {'man': 0.927132248878479, 'woman': 0.07286779582500458}}),
 ('f97f_9885_0',
  {'dominant_emotion': 'disgust',
   'emotion': {'angry': 0.1445969194173813,
    'fear': 0.05276205390691757,
    'neutral': 0.05592034384608269,
    'sad': 0.08061965554952621,
    'disgust': 0.45087555050849915,
    'happy': 0.020155970007181168,
    'surprise':

In [133]:
!conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia

^C


In [132]:
torch.cuda.is_available()

False