# Preamble: Install and Import Packages

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Resize
from torchvision.io import read_image, ImageReadMode
from multilingual_clip import Config_MCLIP
import open_clip
import json
import pandas as pd
import random
from pathlib import Path
import numpy as np
import transformers as hf
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from PIL import Image
import os
import time
import math

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Record the installed transformers version alongside the results of this run.
print(hf.__version__)
# NOTE(review): anomaly detection is an autograd debugging aid. The inference loop in this
# notebook runs under torch.no_grad(), so this flag is presumably a leftover from training
# and could be disabled here — confirm before removing.
torch.autograd.set_detect_anomaly(True)

4.40.1


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x2965ca3a410>

# Initialise the Configuration and Random Seeds

In [3]:
# Sub-config overrides; CFG passes these to the multimodal AutoConfig as
# `text_config` and `vision_config` respectively.
_text_model_config = {}

_image_model_config = {
    "attention_probs_dropout_prob": 0.0,
    "encoder_stride": 16,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 768,
    "image_size": 224,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-12,
    "num_attention_heads": 12,
    "num_channels": 3,
    "num_hidden_layers": 0,
    "patch_size": 16,
    "qkv_bias": True,
}

# Model identifiers. Exactly one group per family should be active; the commented-out
# lines are the alternative checkpoints for the other architectures.

# Dual encoder/Concat
tokeniser_model_id = 'xlm-roberta-base'
text_model_id = 'xlm-roberta-base'
image_model_id = 'google/vit-base-patch16-224-in21k'

# CLIP
multimodal_model_id = 'openai/clip-vit-base-patch32'

# M-CLIP
# tokeniser_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# text_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# image_model_id = 'ViT-B-16-plus-240'
# Pretrained tag handed to open_clip.create_model_and_transforms in the M-CLIP branch.
image_training_id = 'laion400m_e32'

# ViLT
# multimodal_model_id = 'dandelin/vilt-b32-mlm'


class CFG:
    """Notebook-wide configuration namespace (read as class attributes, never instantiated).

    The model ids are copied from the module-level variables of the same name.
    Defining the class downloads/loads the Hugging Face configs referenced below.
    """
    # --- architecture switches -------------------------------------------------
    use_multimodal = True
    use_dualencoder = False
    split_lang = False
    save_models = False
    use_lstm = False
    use_attn = False
    use_mask_split = False
    use_modal_attn = False
    is_mclip = False
    init_weights = False

    # --- model identifiers and configs ------------------------------------------
    tokeniser_model_id = tokeniser_model_id
    text_model_id = text_model_id
    image_model_id = image_model_id
    multimodal_model_id = multimodal_model_id
    image_training_id = image_training_id
    # Both configs are keyed on the *text* model id: when it is an M-CLIP checkpoint no
    # AutoConfig is loaded for either modality.
    text_model_config = hf.AutoConfig.from_pretrained(text_model_id) if 'M-CLIP' not in text_model_id else None
    image_model_config = hf.AutoConfig.from_pretrained(image_model_id) if 'M-CLIP' not in text_model_id else None
    multimodal_model_config = hf.AutoConfig.from_pretrained(multimodal_model_id, text_config=_text_model_config, vision_config=_image_model_config)

    # --- data locations -----------------------------------------------------------
    images_base_path = Path('EXIST 2024 Lab/EXIST 2024 Memes Dataset/training/memes')
    images_base_path_test = Path('EXIST 2024 Lab/EXIST 2024 Memes Dataset/test/memes')

    # --- runtime / optimisation settings -----------------------------------------
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    debug = True
    print_freq = 300
    apex = True  # for faster training
    epochs = 10
    learning_rate = 2e-4  # for adam optimizer
    eps = 1e-6
    betas = (0.9, 0.999)  # for adam optimizer
    batch_size = 64
    max_len = 512
    weight_decay = 0.01  # for adam optimizer regularization parameter
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    seed = 42
    train = True
    num_class = 2  # Number of class in your dataset (overwritten later from the label list)

    # --- classification head (MLP) settings ----------------------------------------
    mlp_hidden_size = 256
    mlp_hidden_layers = 0
    mlp_dropout = 0.1
    mlp_grad_clip = 1.0
    mlp_init_range = 0.2
    mlp_attn_dim = 256

In [4]:
def seed_everything(seed=42):
    """Seed every random number generator used by the notebook.

    Covers Python's `random`, the hash seed environment variable, NumPy and PyTorch
    (CPU and all CUDA devices), and switches cuDNN to deterministic algorithm selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every other visible GPU; a no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels between runs, so disable it as well.
    torch.backends.cudnn.benchmark = False
    
# Seed all generators once, before any model or data split is created.
seed_everything(CFG.seed)

In [5]:
class MultilingualCLIP(hf.PreTrainedModel):
    """Text encoder made of a pretrained transformer followed by a linear projection."""
    config_class = Config_MCLIP.MCLIPConfig

    def __init__(self, config, *args, **kwargs):
        super().__init__(config, *args, **kwargs)
        cache_dir = kwargs.get("cache_dir")
        self.transformer = hf.AutoModel.from_pretrained(config.modelBase, cache_dir=cache_dir)
        self.LinearTransformation = torch.nn.Linear(
            in_features=config.transformerDimensions,
            out_features=config.numDims,
        )

    def forward(self, tokens, mask):
        """Mask-weighted mean of the transformer's token states, then the linear projection."""
        token_states = self.transformer(tokens, attention_mask=mask)[0]
        # Zero out masked positions before summing over the sequence axis.
        masked_total = (token_states * mask.unsqueeze(2)).sum(dim=1)
        kept_per_row = mask.sum(dim=1)[:, None]
        pooled = masked_total / kept_per_row
        return self.LinearTransformation(pooled)

    @classmethod
    def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True):
        """Load `state_dict` directly into `model` and return it with three empty lists."""
        model.load_state_dict(state_dict)
        return model, [], [], []

# Preprocess the Dataset

In [6]:
# Load the training annotations (a JSON object keyed by sample id).
with open('EXIST 2024 Lab/EXIST 2024 Memes Dataset/training/EXIST2024_training.json', 'r', encoding='utf-8') as fp:
    annotations = json.load(fp)
# Transpose so that each sample becomes a row and each annotation field a column.
df = pd.DataFrame.from_dict(annotations).T
print(df.shape)
df.head()

(4044, 16)


Unnamed: 0,id_EXIST,lang,text,meme,path_memes,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task4,labels_task5,labels_task6,split
110001,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,110001.jpeg,memes/110001.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",TRAIN-MEME_ES
110002,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,110002.jpeg,memes/110002.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, JUDGE...","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",TRAIN-MEME_ES
110003,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,110003.jpeg,memes/110003.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, NO, NO, NO, NO]","[DIRECT, DIRECT, -, -, -, -]","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION, MIS...",TRAIN-MEME_ES
110004,110004,es,"Paises que ""apoyan"" los derechos de la mujer A...",110004.jpeg,memes/110004.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, NO, NO, YES, NO]","[JUDGEMENTAL, JUDGEMENTAL, -, -, JUDGEMENTAL, -]","[[IDEOLOGICAL-INEQUALITY], [IDEOLOGICAL-INEQUA...",TRAIN-MEME_ES
110005,110005,es,Ya verás como este 8 de marzo hay uno que te s...,110005.jpeg,memes/110005.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[NO, YES, NO, NO, YES, NO]","[-, JUDGEMENTAL, -, -, DIRECT, -]","[[-], [IDEOLOGICAL-INEQUALITY], [-], [-], [IDE...",TRAIN-MEME_ES


In [7]:
# Keep only the columns needed for modelling and discard the id-based index.
mini_df = df[['id_EXIST', 'meme', 'text', 'lang']].reset_index(drop=True)
# Convert ids to numbers so they can be joined against the gold files' `id` column.
mini_df['id_EXIST'] = pd.to_numeric(mini_df['id_EXIST'])
mini_df.head()

Unnamed: 0,id_EXIST,meme,text,lang
0,110001,110001.jpeg,2+2=5 MITO Albert Einstein tenía bajo rendimie...,es
1,110002,110002.jpeg,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,es
2,110003,110003.jpeg,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,es
3,110004,110004.jpeg,"Paises que ""apoyan"" los derechos de la mujer A...",es
4,110005,110005.jpeg,Ya verás como este 8 de marzo hay uno que te s...,es


In [8]:
# Hard-label gold files for the three meme tasks; only the task 6 file is read in this cell.
task4_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task4_gold_hard.json')
task5_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task5_gold_hard.json')
task6_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task6_gold_hard.json')
task6_gold = pd.read_json(task6_gold_path)
task6_gold

Unnamed: 0,test_case,id,value
0,EXIST2024,110001,[IDEOLOGICAL-INEQUALITY]
1,EXIST2024,110002,"[IDEOLOGICAL-INEQUALITY, MISOGYNY-NON-SEXUAL-V..."
2,EXIST2024,110003,[NO]
3,EXIST2024,110005,[NO]
4,EXIST2024,110006,"[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINANC..."
...,...,...,...
3402,EXIST2024,212006,"[MISOGYNY-NON-SEXUAL-VIOLENCE, SEXUAL-VIOLENCE]"
3403,EXIST2024,212007,"[OBJECTIFICATION, MISOGYNY-NON-SEXUAL-VIOLENCE]"
3404,EXIST2024,212008,"[STEREOTYPING-DOMINANCE, OBJECTIFICATION]"
3405,EXIST2024,212009,[NO]


In [9]:
# Left-join the task 6 gold labels onto the samples; the gold `value` column becomes `label_task6`.
# NOTE(review): this cell rebinds `mini_df`, so running it twice merges the labels in again —
# re-run the cell that first builds `mini_df` before re-running this one.
mini_df = pd.merge(mini_df, task6_gold, left_on='id_EXIST', right_on='id', how='left').drop(columns=['id', 'test_case']).rename(columns={'value': 'label_task6'})
# Samples without a gold entry come out of the left join as non-list values; give them an empty label list.
mini_df['label_task6'] = mini_df['label_task6'].apply(lambda x: x if type(x) is list else [])
# Fixed label order: the same order is used later to map model outputs back to label names.
unique_labels = ['OBJECTIFICATION', 'NO', 'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE']
print(unique_labels)
# The classifier head size follows the number of labels (overrides the default set in CFG).
CFG.num_class = len(unique_labels)
# One 0/1 indicator column per label.
for label in unique_labels:
    mini_df[label] = mini_df['label_task6'].apply(lambda x: int(label in x))
mini_df

['OBJECTIFICATION', 'NO', 'SEXUAL-VIOLENCE', 'MISOGYNY-NON-SEXUAL-VIOLENCE', 'IDEOLOGICAL-INEQUALITY', 'STEREOTYPING-DOMINANCE']


Unnamed: 0,id_EXIST,meme,text,lang,label_task6,OBJECTIFICATION,NO,SEXUAL-VIOLENCE,MISOGYNY-NON-SEXUAL-VIOLENCE,IDEOLOGICAL-INEQUALITY,STEREOTYPING-DOMINANCE
0,110001,110001.jpeg,2+2=5 MITO Albert Einstein tenía bajo rendimie...,es,[IDEOLOGICAL-INEQUALITY],0,0,0,0,1,0
1,110002,110002.jpeg,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,es,"[IDEOLOGICAL-INEQUALITY, MISOGYNY-NON-SEXUAL-V...",0,0,0,1,1,0
2,110003,110003.jpeg,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,es,[NO],0,1,0,0,0,0
3,110004,110004.jpeg,"Paises que ""apoyan"" los derechos de la mujer A...",es,[],0,0,0,0,0,0
4,110005,110005.jpeg,Ya verás como este 8 de marzo hay uno que te s...,es,[NO],0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4039,212006,212006.jpeg,u gon act like a bitch u gon die like a bitch,en,"[MISOGYNY-NON-SEXUAL-VIOLENCE, SEXUAL-VIOLENCE]",0,0,1,1,0,0
4040,212007,212007.jpeg,SHE LOOKS LIKE EVERY OTHER BITCH LIKE makeamem...,en,"[OBJECTIFICATION, MISOGYNY-NON-SEXUAL-VIOLENCE]",1,0,0,1,0,0
4041,212008,212008.jpeg,YOURE A BASIC BITCH CASE DISMISSED,en,"[STEREOTYPING-DOMINANCE, OBJECTIFICATION]",1,0,0,0,0,1
4042,212009,212009.jpeg,WHEN YOU'RE AUNT HAS THIS WEIRD ASS MAN AND SH...,en,[NO],0,1,0,0,0,0


# Initialise the Processors/Tokenisers/Models

In [10]:
# Create the tokenizer/processor and pretrained encoder(s) for the architecture selected in CFG.
# The branches are tested in priority order: M-CLIP, joint multimodal model, dual encoder, and
# finally separate text and image encoders. Only the names created by the chosen branch exist afterwards.
if CFG.is_mclip:
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = MultilingualCLIP.from_pretrained(CFG.text_model_id).to(CFG.device)
    # open_clip returns three values; the middle one is discarded and the last is used as the image preprocessor.
    image_model, _, image_processor = open_clip.create_model_and_transforms(CFG.image_model_id, pretrained=CFG.image_training_id)
    image_model = image_model.to(CFG.device)
elif CFG.use_multimodal:
    mm_processor = hf.AutoProcessor.from_pretrained(CFG.multimodal_model_id)
    mm_model = hf.AutoModel.from_pretrained(CFG.multimodal_model_id).to(CFG.device)
elif CFG.use_dualencoder:
    # NOTE(review): `padding`/`truncation` look like call-time tokenizer options; confirm they
    # have any effect when passed to from_pretrained.
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id, padding=True, truncation=True)
    processor = hf.AutoImageProcessor.from_pretrained(CFG.image_model_id)
    de_processor = hf.VisionTextDualEncoderProcessor(image_processor=processor, tokenizer=tokenizer)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)
    # The dual encoder wraps the two encoders that were already moved to CFG.device above.
    de_model = hf.VisionTextDualEncoderModel(vision_model=image_model, text_model=text_model)
else:
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    # Adding a config to the image_model gets rid of lots of pretrained weights
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)

  return self.fget.__get__(instance, owner)()


# Train/Val Split

In [11]:
def dataframe_train_test_split(df, target_label, seed=CFG.seed, test_size=0.2, split_labels=True):
    """Randomly partition `df` into a train part and a held-out part.

    A fraction `1 - test_size` of the rows is sampled (seeded by `seed`) as the train part;
    the remaining rows form the held-out part. Both parts get a fresh 0..n-1 index.

    Returns:
        `(X_train, X_test, y_train, y_test)` when `split_labels` is true, where the X frames
        have `target_label` dropped and the y objects are that column; otherwise
        `(train, test)` as complete frames.
    """
    sampled = df.sample(frac=(1.0 - test_size), random_state=seed)
    # The held-out rows are selected by original index, so this must happen before re-indexing.
    held_out = df.drop(sampled.index).reset_index(drop=True)
    sampled = sampled.reset_index(drop=True)

    if not split_labels:
        return sampled, held_out

    return (
        sampled.drop(columns=target_label),
        held_out.drop(columns=target_label),
        sampled[target_label],
        held_out[target_label],
    )

# 80/20 split seeded from CFG; the features keep id, image file name and text, the target is the label list.
X_train, X_val, y_train, y_val = dataframe_train_test_split(mini_df[['id_EXIST', 'meme', 'text', 'label_task6']], 'label_task6', test_size=0.2, seed=CFG.seed)
X_val.head()

Unnamed: 0,id_EXIST,meme,text
0,110002,110002.jpeg,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS
1,110005,110005.jpeg,Ya verás como este 8 de marzo hay uno que te s...
2,110012,110012.jpeg,A LOS QUE NO ME SALUDAN POR EL DIA DE LA MUJER...
3,110017,110017.jpeg,"SE ACERCA EL DIA DE LA MUJER, SE ACEPTAN POSTR..."
4,110020,110020.jpeg,Día de la Mujer Expectativa Realidad


# Custom Dataset Definition

In [12]:
class ExistDataset(Dataset):
    """Meme dataset yielding one preprocessed (image, text) sample per row of `features`.

    Items are `(image, input_ids, attention_mask, label)` when `test` is false and
    `(id_EXIST, image, input_ids, attention_mask)` when `test` is true.

    Preprocessing depends on the module-level `CFG` flags and on the module-level
    `tokenizer`, `image_processor`, `mm_processor` and `de_processor` objects created
    for the selected architecture.
    """
    def __init__(self, features, img_dir, labels=None, test=False, img_transform=None, caption_transform=None, target_transform=None):
        """Store the inputs.

        Args:
            features: Frame with at least `meme` (image file name) and `text` columns;
                `id_EXIST` is read in test mode and `lang` when `CFG.split_lang` is set.
            img_dir: Path-like base directory (must support `joinpath`) holding the image files.
            labels: Positionally indexed labels aligned with `features`; only read when `test` is false.
            test: When true, items carry the sample id instead of a label.
            img_transform: Optional callable applied to the loaded image.
            caption_transform: Optional callable applied to the raw caption.
            target_transform: Optional callable applied to the label (train mode only).
        """
        self.features = features
        self.labels = labels
        self.img_dir = img_dir
        self.test = test
        self.img_transform = img_transform
        self.caption_transform = caption_transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of samples (rows of `features`)."""
        return len(self.features)

    def __getitem__(self, idx):
        """Load, transform and tokenise the sample at position `idx`."""
        img_path = str(self.img_dir.joinpath(self.features['meme'].iloc[idx]))
        if CFG.is_mclip:
            # The M-CLIP branch hands a PIL image to `image_processor` further down.
            image = Image.open(img_path)
        else:
            # Decoded as an RGB tensor and moved to the configured device right away.
            image = read_image(img_path, mode=ImageReadMode.RGB).to(device=CFG.device)
        caption = self.features['text'].iloc[idx]

        if not self.test:
            label = self.labels.iloc[idx]
        else:
            identity = self.features['id_EXIST'].iloc[idx]

        if self.img_transform:
            image = self.img_transform(image)
        if self.caption_transform:
            caption = self.caption_transform(caption)
        if not self.test and self.target_transform:
            label = self.target_transform(label)

        # Optionally prefix the caption with its language code (requires a `lang` column).
        if CFG.split_lang:
            caption = f'Language: {self.features["lang"].iloc[idx]} - {caption}'

        if CFG.is_mclip:
            processed = tokenizer(caption, padding=True, return_tensors='pt')
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = image_processor(image)
        elif CFG.use_multimodal:
            processed = mm_processor(text=caption, images=image, return_tensors="pt", padding=True, truncation=True)
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = processed['pixel_values']
        elif CFG.use_dualencoder:
            processed = de_processor(text=caption, images=image, return_tensors="pt")
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = processed['pixel_values']
        else:
            # Separate-encoder fallback: only the text is tokenised here; the image is passed
            # on as produced by `img_transform`.
            processed = tokenizer.encode_plus(
                caption,
                padding='longest',
                truncation=True,
                return_tensors='pt'
            )
            seq = processed['input_ids']
            mask = processed['attention_mask']

        if not self.test:
            # NOTE(review): assumes `label` is numeric (or a numeric sequence) at this point — confirm
            # that the labels passed in are already encoded.
            label = torch.tensor([label]).float()
            return image, seq, mask, label

        return identity, image, seq, mask

In [13]:
class Collator(object):
    """Batch-collation callable for samples produced one at a time by the dataset.

    Token ids and attention masks are right-padded with zeros to the longest sequence in
    the batch; images are stacked. In train mode the batch is
    `(images, seqs, masks, labels)`, in test mode `(ids, images, seqs, masks)`.
    """
    def __init__(self, test=False):
        self.test = test

    def __call__(self, batch):
        # Transpose the list of per-sample tuples into per-field tuples.
        fields = list(zip(*batch))
        if self.test:
            ids, images, seqs, masks = fields
        else:
            images, seqs, masks, labels = fields
            labels = torch.stack(labels)

        # Each per-sample tensor carries a leading singleton axis; drop it before batching.
        flat_seqs = [s.squeeze(dim=0) for s in seqs]
        flat_masks = [m.squeeze(dim=0) for m in masks]
        flat_images = [img.squeeze(dim=0) for img in images]

        padded_seqs = nn.utils.rnn.pad_sequence(flat_seqs, batch_first=True)
        padded_masks = nn.utils.rnn.pad_sequence(flat_masks, batch_first=True)
        stacked_images = torch.stack(flat_images)

        if self.test:
            return ids, stacked_images, padded_seqs, padded_masks

        return stacked_images, padded_seqs, padded_masks, labels

In [14]:
# Shared transform that resizes images to 224x224 with antialiasing enabled.
resizer = Resize((224, 224), antialias=True)

def resize_images(img_tensor):
    """Return `img_tensor` resized by the module-level `resizer` (224x224)."""
    return resizer(img_tensor)

# Dataset Initialisation

In [15]:
# The validation split is wrapped in test mode, so every item carries its id_EXIST instead of
# a label; `labels=y_val` is stored on the dataset but not read while `test=True`.
val_dataset = ExistDataset(X_val, CFG.images_base_path, labels=y_val, img_transform=resize_images, test=True)
len(val_dataset)

809

# Model Architecture

In [16]:
class ConcatArch(nn.Module):
    """MLP classification head on top of pretrained text/image (or joint) encoders.

    The encoder(s) are taken from the module-level objects created for the selected
    architecture (`text_model`/`image_model`, `mm_model` or `de_model`). Their outputs are
    combined into one feature vector, passed through `fc1`, `hidden_layers` extra linear
    layers (each followed by ReLU and dropout) and finally `fc2`, which yields one raw
    score (logit) per class.

    Args:
        hidden_size: Width of the MLP layers.
        hidden_layers: Number of additional hidden linear layers between `fc1` and `fc2`.
        dropout: Dropout probability used after every activation.
        num_classes: Number of output scores.
        use_multimodal: Use the joint multimodal model (`mm_model`).
        use_dualencoder: Use the vision-text dual encoder (`de_model`).
        is_mclip: Use the M-CLIP text encoder with the open_clip image encoder.
    """
    def __init__(self, hidden_size, hidden_layers, dropout, num_classes, use_multimodal=False, use_dualencoder=False, is_mclip=False):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.use_multimodal = use_multimodal
        self.use_dualencoder = use_dualencoder
        self.is_mclip = is_mclip
        # A config may leave `architectures` unset (None); treat that as "not ViLT" instead of
        # failing on the membership test.
        self.is_vilt = 'ViltForMaskedLM' in (CFG.multimodal_model_config.architectures or [])
        
        if self.is_mclip:
            self.text_model = text_model
            self.image_model = image_model
        elif self.use_multimodal:
            self.mm_model = mm_model
        elif self.use_dualencoder:
            self.de_model = de_model
        else:
            self.text_model = text_model
            self.image_model = image_model
        
        # `fc1` input width must match the feature vector built in `forward` for the same flags.
        if self.is_mclip:
            self.fc1 = nn.Linear(1280, self.hidden_size)
        elif self.use_multimodal:
            if self.is_vilt and CFG.use_lstm:
                out_channels = CFG.mlp_hidden_size + CFG.multimodal_model_config.hidden_size
                self.lstm = nn.LSTM(CFG.multimodal_model_config.hidden_size, CFG.mlp_hidden_size, batch_first=True)
            elif self.is_vilt and CFG.use_mask_split:
                out_channels = CFG.multimodal_model_config.hidden_size * 3
            elif self.is_vilt and CFG.use_attn:
                self.attn = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
                # Fix: this branch previously never set `out_channels`, so building `fc1` below
                # raised UnboundLocalError. `forward` produces an attention-pooled hidden state
                # plus the CLS state, i.e. a vector of width `hidden_size`.
                out_channels = CFG.multimodal_model_config.hidden_size
            elif self.is_vilt and CFG.use_modal_attn:
                self.attn1 = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
                self.attn2 = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
                out_channels = CFG.multimodal_model_config.hidden_size * 2
            elif self.is_vilt:
                out_channels = CFG.multimodal_model_config.hidden_size
            else:
                # Non-ViLT multimodal model: text and image embeddings are concatenated.
                out_channels = 2 * CFG.multimodal_model_config.projection_dim
            self.fc1 = nn.Linear(out_channels, self.hidden_size)
        elif self.use_dualencoder:
            self.fc1 = nn.Linear(2 * 512, self.hidden_size)
        else:
            self.fc1 = nn.Linear(CFG.text_model_config.hidden_size + CFG.image_model_config.hidden_size, self.hidden_size)
        self.hiddens = nn.ModuleList([nn.Linear(self.hidden_size, self.hidden_size) for _ in range(self.hidden_layers)])
        self.fc2 = nn.Linear(self.hidden_size, num_classes)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
        if CFG.init_weights:
            self._init_weights(self.fc1)
            for hidden in self.hiddens:
                self._init_weights(hidden)
            self._init_weights(self.fc2)

    def forward(self, tokens, mask, image):
        """Encode the inputs, build the joint feature vector and return per-class logits.

        Args:
            tokens: Batched token ids.
            mask: Batched attention mask matching `tokens`.
            image: Batched image input for the selected image/multimodal encoder.

        Returns:
            Float tensor with one row per sample and `num_classes` raw scores per row.
        """
        if self.is_mclip:
            emb_text = self.text_model.forward(tokens, mask)
            emb_img = self.image_model.encode_image(image)
            x = torch.cat([emb_text, emb_img], dim=1)
        elif self.use_multimodal:
            mm_output = self.mm_model(input_ids=tokens, attention_mask=mask, pixel_values=image, output_hidden_states=True)
            cats = [mm_output.pooler_output] if self.is_vilt else [mm_output.text_embeds, mm_output.image_embeds]
            
            # NOTE(review): the constructor picks exactly one ViLT variant (elif chain) while
            # `use_lstm` and `use_mask_split` are independent checks here; enabling both at once
            # would make the feature width disagree with `fc1`.
            if self.is_vilt and CFG.use_lstm:
                # First hidden state is apparently the embedding output
                # https://discuss.huggingface.co/t/hidden-states-embedding-tensors/3549/
                layerwise_cls = torch.stack([h[:, 0, :] for h in mm_output.hidden_states[1:]], dim=1)
                _, (h, _) = self.lstm(layerwise_cls)
                h = h.squeeze(dim=0)
                cats.append(h)

            if self.is_vilt and CFG.use_mask_split:
                last_h = mm_output.last_hidden_state
                # Positions up to the text mask length are pooled as text, the rest as image.
                mask_len = mask.shape[1]
                mean_pooled_text = torch.mean(last_h[:, :mask_len, :], dim=1)
                mean_pooled_img = torch.mean(last_h[:, mask_len:, :], dim=1)
                cats += [mean_pooled_text, mean_pooled_img]

            if self.is_vilt and CFG.use_attn:
                last_h = mm_output.last_hidden_state
                attentions = self.attn(last_h)
                x = torch.sum(attentions * last_h, dim=1)

                # Residual-style addition of the first-position (CLS) state.
                cls = last_h[:, 0, :]
                x += cls
            elif self.is_vilt and CFG.use_modal_attn:
                last_h = mm_output.last_hidden_state
                mask_len = mask.shape[1]
                text_split = last_h[:, :mask_len, :]
                img_split = last_h[:, mask_len:, :]
                text_attentions = self.attn1(text_split)
                img_attentions = self.attn2(img_split)
                x1 = torch.sum(text_attentions * text_split, dim=1)
                x2 = torch.sum(img_attentions * img_split, dim=1)

                x = torch.cat([x1, x2], dim=1)

                # The CLS state is duplicated so it can be added to both pooled halves.
                cls = last_h[:, 0, :]
                cls = torch.cat([cls, cls], dim=1)
                x += cls
            else:
                x = torch.cat(cats, dim=1)
        elif self.use_dualencoder:
            de_output = self.de_model(input_ids=tokens, attention_mask=mask, pixel_values=image)
            x = torch.cat([de_output.text_embeds, de_output.image_embeds], dim=1)
        else:
            cls_text = self.text_model(tokens, attention_mask=mask).last_hidden_state[:, 0, :]
            cls_img = self.image_model(image).last_hidden_state[:, 0, :]
            x = torch.cat([cls_text, cls_img], dim=1)

        x = self.fc1(x)
        x = self.activation(x)
        x = self.dropout(x)
        for hidden in self.hiddens:
            x = hidden(x)
            x = self.activation(x)
            x = self.dropout(x)
        x = self.fc2(x)
        
        output = x
        return output.float()
    
    def _init_weights(self, module):
        """Re-initialise `module` in place (normal weights with std `CFG.mlp_init_range`, zero biases)."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=CFG.mlp_init_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=CFG.mlp_init_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

# Utility Functions

In [17]:
class AverageMeter(object):
    """Tracks the latest value together with a running weighted sum, count and mean."""
    def __init__(self):
        self.reset()

    def reset(self):
        """Return every statistic to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as observed `n` times and refresh the running mean."""
        contribution = val * n
        self.val = val
        self.sum += contribution
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Report elapsed time since `since` and the projected remaining time.

    `percent` is the completed fraction of the work; the total duration is extrapolated
    as elapsed / percent.
    """
    elapsed = time.time() - since
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [18]:
def get_score(y_trues, y_preds):
    """Return the macro-averaged F1 score of `y_preds` against `y_trues`."""
    return f1_score(y_trues, y_preds, average='macro')

In [19]:
def test_loop(model, test_dataloader):
    """Run `model` over `test_dataloader` without gradients and collect its predictions.

    Each batch is expected as `(ids, images, token_ids, attention_masks)`.

    Returns:
        `(all_ids, all_hard, all_soft)`: the list of sample ids, the rounded sigmoid
        outputs and the sigmoid outputs, the latter two concatenated over all batches.
    """
    collected_ids, soft_batches, hard_batches = [], [], []

    model.eval()

    for identity, image, seq, mask in tqdm(test_dataloader):
        batch_image = image.to(device=CFG.device)
        batch_seq = seq.to(device=CFG.device)
        batch_mask = mask.to(device=CFG.device)

        with torch.no_grad():
            logits = model(batch_seq, batch_mask, batch_image)

        # Per-class probabilities; rounding them gives the 0/1 decision per class.
        soft = torch.sigmoid(logits)
        hard = soft.round()

        collected_ids.extend(identity)
        soft_batches.append(soft)
        hard_batches.append(hard)

    all_soft = torch.cat(soft_batches, dim=0)
    all_hard = torch.cat(hard_batches, dim=0)

    return collected_ids, all_hard, all_soft

# Inference From Checkpoint

In [20]:
# Validation batches are collated in test mode, i.e. as (ids, images, seqs, masks).
collate = Collator(test=True)
valid_dataloader = DataLoader(val_dataset, batch_size=CFG.batch_size, collate_fn=collate)
# NOTE(review): `loss_fn` is not used by the inference cells visible in this notebook.
loss_fn = nn.CrossEntropyLoss()

# Checkpoint file stem; also reused as the prefix of the prediction/result files written below.
inf_model_name = 'openai-clip-vit-base-patch32_score_0.7942'
inf_model = ConcatArch(
    hidden_size=CFG.mlp_hidden_size,
    hidden_layers=CFG.mlp_hidden_layers,
    dropout=CFG.mlp_dropout,
    num_classes=CFG.num_class,
    use_multimodal=CFG.use_multimodal,
    use_dualencoder=CFG.use_dualencoder,
    is_mclip=CFG.is_mclip
).to(CFG.device)
# The checkpoint is a dict whose 'model' entry holds the state dict.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
inf_model.load_state_dict(torch.load('Task 6/' + inf_model_name + '.pth', map_location=torch.device(CFG.device))['model'])
inf_model

ConcatArch(
  (mm_model): CLIPModel(
    (text_model): CLIPTextTransformer(
      (embeddings): CLIPTextEmbeddings(
        (token_embedding): Embedding(49408, 512)
        (position_embedding): Embedding(77, 512)
      )
      (encoder): CLIPEncoder(
        (layers): ModuleList(
          (0-11): 12 x CLIPEncoderLayer(
            (self_attn): CLIPAttention(
              (k_proj): Linear(in_features=512, out_features=512, bias=True)
              (v_proj): Linear(in_features=512, out_features=512, bias=True)
              (q_proj): Linear(in_features=512, out_features=512, bias=True)
              (out_proj): Linear(in_features=512, out_features=512, bias=True)
            )
            (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
            (mlp): CLIPMLP(
              (activation_fn): QuickGELUActivation()
              (fc1): Linear(in_features=512, out_features=2048, bias=True)
              (fc2): Linear(in_features=2048, out_features=512, bias=True)
 

In [21]:
# Predict on the validation split and convert the tensors to plain nested lists so they
# can be zipped with the ids and written as JSON.
ids, hards, softs = test_loop(inf_model, valid_dataloader)
hards = hards.tolist()
softs = softs.tolist()
# NOTE(review): these print the full id/prediction lists; consider printing a short slice instead.
print(ids)
print(hards)
print(softs)

100%|██████████| 13/13 [00:09<00:00,  1.30it/s]

[110002, 110005, 110012, 110017, 110020, 110022, 110035, 110036, 110037, 110039, 110047, 110051, 110065, 110076, 110090, 110091, 110096, 110099, 110104, 110115, 110117, 110120, 110123, 110128, 110131, 110144, 110147, 110154, 110155, 110161, 110162, 110173, 110181, 110187, 110190, 110191, 110198, 110201, 110202, 110203, 110207, 110217, 110226, 110235, 110242, 110246, 110254, 110263, 110265, 110269, 110285, 110294, 110302, 110318, 110331, 110336, 110338, 110357, 110370, 110379, 110380, 110392, 110393, 110396, 110397, 110398, 110400, 110401, 110404, 110413, 110418, 110419, 110453, 110456, 110460, 110467, 110470, 110475, 110488, 110489, 110493, 110497, 110500, 110503, 110511, 110513, 110516, 110525, 110538, 110541, 110547, 110551, 110560, 110563, 110564, 110570, 110575, 110581, 110587, 110596, 110601, 110607, 110609, 110617, 110624, 110626, 110628, 110634, 110636, 110640, 110641, 110642, 110647, 110649, 110657, 110659, 110660, 110664, 110676, 110682, 110684, 110685, 110699, 110700, 110703,




In [22]:
def mapper(preds, labels=None):
    """Translate one row of hard predictions into the list of predicted label names.

    Args:
        preds: Per-class values aligned with `labels`; a class counts as predicted when its
            value is greater than 0.
        labels: Label names in model output order. Defaults to the notebook-level
            `unique_labels`, which keeps existing `mapper(preds)` calls unchanged.

    Returns:
        The names of all predicted classes, in `labels` order (possibly empty).
    """
    if labels is None:
        labels = unique_labels
    return [label for label, pred in zip(labels, preds) if pred > 0]

In [23]:
# File stems (without extension) for the hard and soft validation prediction files.
hard_path = inf_model_name + '_val_hard'
soft_path = inf_model_name + '_val_soft'

In [24]:
# Build one {test_case, id, value} record per sample for the hard predictions.
hard_dicts = []
for identity, hard in zip(ids, hards):
    p = mapper(hard)
    # Samples for which no label is predicted are left out of the prediction file.
    if len(p) > 0:
        hard_dicts.append({
            'test_case': 'EXIST2024',
            'id': str(identity),
            'value': p
        })
with open(f'{hard_path}.json', 'w') as fp:
    json.dump(hard_dicts, fp)

In [25]:
def soft_mapper(preds, labels=None):
    """Pair one row of soft predictions with the label names.

    Args:
        preds: Per-class values aligned with `labels`.
        labels: Label names in model output order. Defaults to the notebook-level
            `unique_labels`, which keeps existing `soft_mapper(preds)` calls unchanged.

    Returns:
        Dict mapping each label name to its predicted value.
    """
    if labels is None:
        labels = unique_labels
    return dict(zip(labels, preds))

# Build one {test_case, id, value} record per sample for the soft predictions; unlike the
# hard file, every sample is written.
soft_dicts = []
for identity, soft in zip(ids, softs):
    soft_dicts.append({
        'test_case': 'EXIST2024',
        'id': str(identity),
        'value': soft_mapper(soft)
    })
with open(f'{soft_path}.json', 'w') as fp:
    json.dump(soft_dicts, fp)

In [26]:
# Number of validation samples that received at least one predicted label.
len(hard_dicts)

544

# PyEvALL Test

In [27]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, floats and arrays."""
    def default(self, obj):
        # Arrays become (nested) lists; NumPy scalars become the matching Python number.
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        for numpy_kind, to_python in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, numpy_kind):
                return to_python(obj)
        # Anything else is left to the base class, which raises TypeError.
        return super().default(obj)

In [28]:
from pyevall.evaluation import PyEvALLEvaluation
from pyevall.utils.utils import PyEvALLUtils
from pyevall.metrics.metricfactory import MetricFactory

# Full-training-set gold files shipped with the lab.
gold_hard = 'EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task6_gold_hard.json'
gold_soft = 'EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task6_gold_soft.json'

# Gold files restricted to the validation split; written below.
gold_val_hard = 'task6_val_hard_gold.json'
gold_val_soft = 'task6_val_soft_gold.json'

# Keep only the gold entries of validation samples. `.copy()` makes the filtered frame
# independent, so the `id` column assignment below does not act on a slice of the original
# (the pattern pandas reports as SettingWithCopyWarning).
gold_hard_df = pd.read_json(gold_hard)
gold_hard_df = gold_hard_df[gold_hard_df['id'].isin(X_val['id_EXIST'])].copy()
# Ids are written as strings to match the prediction files.
gold_hard_df['id'] = gold_hard_df['id'].astype(str)
gold_hard_df.to_json(gold_val_hard, index=False, orient='records')

gold_soft_df = pd.read_json(gold_soft)
gold_soft_df = gold_soft_df[gold_soft_df['id'].isin(X_val['id_EXIST'])].copy()
gold_soft_df['id'] = gold_soft_df['id'].astype(str)
gold_soft_df.to_json(gold_val_soft, index=False, orient='records')

predictions_hard = f'{hard_path}.json'
predictions_soft = f'{soft_path}.json'

# The evaluator and `params` are reused by the soft evaluation cell.
test = PyEvALLEvaluation()
metrics_hard=[MetricFactory.ICM.value, MetricFactory.ICMNorm.value, MetricFactory.FMeasure.value, MetricFactory.Accuracy.value]
metrics_soft=[MetricFactory.ICMSoft.value, MetricFactory.ICMSoftNorm.value, MetricFactory.CrossEntropy.value]

params = {
    PyEvALLUtils.PARAM_FORMAT: PyEvALLUtils.PARAM_OPTION_FORMAT_JSON,
    PyEvALLUtils.PARAM_REPORT: PyEvALLUtils.PARAM_OPTION_REPORT_DATAFRAME
}

# Evaluate the hard predictions, persist the raw report and display it.
report_hard = test.evaluate(predictions_hard, gold_val_hard, metrics_hard, **params)
with open(f'{hard_path}_results.json', 'w') as fp:
    json.dump(report_hard.report, fp, indent=4)
report_hard.print_report()


2024-05-31 20:51:37,577 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICM', 'ICMNorm', 'FMeasure', 'Accuracy']
2024-05-31 20:51:37,665 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM evaluation method
2024-05-31 20:51:37,666 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Normalized evaluation method
2024-05-31 20:51:37,667 - pyevall.metrics.metrics - INFO -             evaluate() - Executing fmeasure evaluation method
2024-05-31 20:51:37,855 - pyevall.metrics.metrics - INFO -             evaluate() - Executing accuracy evaluation method
cargado 24
+----+---------------------------------------------------------+-------+------------+----------+-------+
|    | files                                                   | ICM   | ICM-Norm   |       F1 | Acc   |
|----+---------------------------------------------------------+-------+------------+----------+-------|
|  0 | openai-clip-vit-base-patch3

In [29]:
# Evaluate the soft predictions with the evaluator, metrics and params defined in the
# hard evaluation cell, persist the raw report and display it.
report_soft = test.evaluate(predictions_soft, gold_val_soft, metrics_soft, **params)
with open(f'{soft_path}_results.json', 'w') as fp:
    json.dump(report_soft.report, fp, indent=4)
report_soft.print_report()

2024-05-31 20:51:37,866 - pyevall.evaluation - INFO -             evaluate() - Evaluating the following metrics ['ICMSoft', 'ICMSoftNorm', 'CrossEntropy']
2024-05-31 20:51:38,078 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2024-05-31 20:51:38,775 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM-Soft Normalized evaluation method
2024-05-31 20:51:38,775 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2024-05-31 20:51:39,404 - pyevall.metrics.metrics - INFO -             evaluate() - Executing ICM Soft evaluation method
2024-05-31 20:51:39,975 - pyevall.metrics.metrics - INFO -             evaluate() - Executing Cross Entropy evaluation method
+----+---------------------------------------------------------+------------+-----------------+---------+
|    | files                                                   |   ICM-Soft |   ICM-Soft-Norm |      CE |
|----+------