# Preamble: Install and Import Packages

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Resize
from torchvision.io import read_image, ImageReadMode
from multilingual_clip import Config_MCLIP
import open_clip
import json
import pandas as pd
import random
from pathlib import Path
import cv2
import numpy as np
import transformers as hf
from tqdm.auto import tqdm
from sklearn.metrics import f1_score, classification_report
from PIL import Image
import os
import gc
import time
import math
from schedulefree import AdamWScheduleFree

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(hf.__version__)
torch.autograd.set_detect_anomaly(True)

4.40.1


<torch.autograd.anomaly_mode.set_detect_anomaly at 0x13d554205d0>

# Initialise the Configuration and Random Seeds

In [22]:
_text_model_config = {}

_image_model_config = {
    "attention_probs_dropout_prob": 0.0,
    "encoder_stride": 16,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 768,
    "image_size": 224,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-12,
    "num_attention_heads": 12,
    "num_channels": 3,
    "num_hidden_layers": 0,
    "patch_size": 16,
    "qkv_bias": True,
}

# Dual encoder/Concat
tokeniser_model_id = 'xlm-roberta-base'
text_model_id = 'xlm-roberta-base'
image_model_id = 'google/vit-base-patch16-224-in21k'

# CLIP
# multimodal_model_id = 'openai/clip-vit-base-patch32'

# M-CLIP
# tokeniser_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# text_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# image_model_id = 'ViT-B-16-plus-240'
image_training_id = 'laion400m_e32'

# ViLT
multimodal_model_id = 'dandelin/vilt-b32-mlm'


class CFG:
    use_multimodal = True
    use_dualencoder = False
    split_lang = False
    save_models = False
    use_lstm = False
    use_attn = False
    use_mask_split = False
    use_modal_attn = False
    is_mclip = False
    init_weights = False
    tokeniser_model_id = tokeniser_model_id
    text_model_id = text_model_id
    image_model_id = image_model_id
    multimodal_model_id = multimodal_model_id
    image_training_id = image_training_id
    text_model_config = hf.AutoConfig.from_pretrained(text_model_id) if not 'M-CLIP' in text_model_id else None
    image_model_config = hf.AutoConfig.from_pretrained(image_model_id) if not 'M-CLIP' in text_model_id else None
    multimodal_model_config = hf.AutoConfig.from_pretrained(multimodal_model_id, text_config=_text_model_config, vision_config=_image_model_config)
    images_base_path = Path(f'EXIST 2024 Lab/EXIST 2024 Memes Dataset/training/memes')
    images_base_path_test = Path('EXIST 2024 Lab/EXIST 2024 Memes Dataset/test/memes')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    debug = True
    print_freq = 300
    apex = True # for faster training
    epochs = 10
    learning_rate = 2e-4  # for adam optimizer
    eps = 1e-6
    betas = (0.9, 0.999)  # for adam optimizer
    batch_size = 32
    max_len = 512
    weight_decay = 0.01  # for adam optimizer regulaization parameter
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    seed = 42
    train = True
    num_class = 3
    mlp_hidden_size = 256
    mlp_hidden_layers = 0
    mlp_dropout = 0.1
    mlp_grad_clip = 1.0
    mlp_init_range = 0.2
    mlp_attn_dim = 256

In [23]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(CFG.seed)

In [24]:
class MultilingualCLIP(hf.PreTrainedModel):
    config_class = Config_MCLIP.MCLIPConfig

    def __init__(self, config, *args, **kwargs):
        super().__init__(config, *args, **kwargs)
        self.transformer = hf.AutoModel.from_pretrained(config.modelBase, cache_dir=kwargs.get("cache_dir"))
        self.LinearTransformation = torch.nn.Linear(in_features=config.transformerDimensions,
                                                    out_features=config.numDims)

    def forward(self, tokens, mask):
        embs = self.transformer(tokens, attention_mask=mask)[0]
        embs = (embs * mask.unsqueeze(2)).sum(dim=1) / mask.sum(dim=1)[:, None]
        return self.LinearTransformation(embs)

    @classmethod
    def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True):
        model.load_state_dict(state_dict)
        return model, [], [], []

# Preprocess the Dataset

In [25]:
with open('EXIST 2024 Lab/EXIST 2024 Memes Dataset/training/EXIST2024_training.json', 'r', encoding='utf-8') as fp:
    annotations = json.load(fp)
df = pd.DataFrame.from_dict(annotations).T
print(df.shape)
df.head()

(4044, 16)


Unnamed: 0,id_EXIST,lang,text,meme,path_memes,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,labels_task4,labels_task5,labels_task6,split
110001,110001,es,2+2=5 MITO Albert Einstein tenía bajo rendimie...,110001.jpeg,memes/110001.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, DIRECT]","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",TRAIN-MEME_ES
110002,110002,es,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,110002.jpeg,memes/110002.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, YES, YES, YES, YES]","[DIRECT, DIRECT, DIRECT, DIRECT, DIRECT, JUDGE...","[[IDEOLOGICAL-INEQUALITY, STEREOTYPING-DOMINAN...",TRAIN-MEME_ES
110003,110003,es,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,110003.jpeg,memes/110003.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, NO, NO, NO, NO]","[DIRECT, DIRECT, -, -, -, -]","[[STEREOTYPING-DOMINANCE, OBJECTIFICATION, MIS...",TRAIN-MEME_ES
110004,110004,es,"Paises que ""apoyan"" los derechos de la mujer A...",110004.jpeg,memes/110004.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[YES, YES, NO, NO, YES, NO]","[JUDGEMENTAL, JUDGEMENTAL, -, -, JUDGEMENTAL, -]","[[IDEOLOGICAL-INEQUALITY], [IDEOLOGICAL-INEQUA...",TRAIN-MEME_ES
110005,110005,es,Ya verás como este 8 de marzo hay uno que te s...,110005.jpeg,memes/110005.jpeg,6,"[Annotator_1, Annotator_2, Annotator_3, Annota...","[F, F, F, M, M, M]","[18-22, 23-45, 46+, 46+, 18-22, 23-45]","[Hispano or Latino, Hispano or Latino, Hispano...","[High school degree or equivalent, Master’s de...","[Mexico, Spain, Argentina, Spain, Mexico, Mexico]","[NO, YES, NO, NO, YES, NO]","[-, JUDGEMENTAL, -, -, DIRECT, -]","[[-], [IDEOLOGICAL-INEQUALITY], [-], [-], [IDE...",TRAIN-MEME_ES


In [26]:
mini_df = df[['id_EXIST', 'meme', 'text', 'lang']].reset_index(drop=True)
mini_df['id_EXIST'] = pd.to_numeric(mini_df['id_EXIST'])
mini_df.head()

Unnamed: 0,id_EXIST,meme,text,lang
0,110001,110001.jpeg,2+2=5 MITO Albert Einstein tenía bajo rendimie...,es
1,110002,110002.jpeg,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,es
2,110003,110003.jpeg,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,es
3,110004,110004.jpeg,"Paises que ""apoyan"" los derechos de la mujer A...",es
4,110005,110005.jpeg,Ya verás como este 8 de marzo hay uno que te s...,es


In [27]:
task4_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task4_gold_hard.json')
task5_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task5_gold_hard.json')
task6_gold_path = Path('EXIST 2024 Lab/evaluation/golds/EXIST2024_training_task6_gold_hard.json')
task5_gold = pd.read_json(task5_gold_path)

choices = ['DIRECT', 'JUDGEMENTAL', 'NO']
mini_df = pd.merge(mini_df, task5_gold, left_on='id_EXIST', right_on='id', how='left').drop(columns=['id', 'test_case']).rename(columns={'value': 'label_task5'})
mini_df['label_task5'] = mini_df['label_task5'].apply(lambda x: np.random.choice(choices) if pd.isna(x) else x)
mini_df['label_task5'] = pd.to_numeric(mini_df['label_task5'].map({'DIRECT': 1, 'JUDGEMENTAL': 2, 'NO': 0}))
print(len(mini_df))
mini_df.head()

4044


Unnamed: 0,id_EXIST,meme,text,lang,label_task5
0,110001,110001.jpeg,2+2=5 MITO Albert Einstein tenía bajo rendimie...,es,1
1,110002,110002.jpeg,CUANDO UNA MUJER VA A LUCHAR POR SUS DERECHOS,es,1
2,110003,110003.jpeg,ІЯ ЕГЕЯ Е MOA ¿El Partido Republicano busca pe...,es,0
3,110004,110004.jpeg,"Paises que ""apoyan"" los derechos de la mujer A...",es,0
4,110005,110005.jpeg,Ya verás como este 8 de marzo hay uno que te s...,es,0


# Initialise the Processors/Tokenisers/Models

In [28]:
if CFG.is_mclip:
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = MultilingualCLIP.from_pretrained(CFG.text_model_id).to(CFG.device)
    image_model, _, image_processor = open_clip.create_model_and_transforms(CFG.image_model_id, pretrained=CFG.image_training_id)
    image_model = image_model.to(CFG.device)
elif CFG.use_multimodal:
    mm_processor = hf.AutoProcessor.from_pretrained(CFG.multimodal_model_id)
    mm_model = hf.AutoModel.from_pretrained(CFG.multimodal_model_id).to(CFG.device)
elif CFG.use_dualencoder:
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id, padding=True, truncation=True)
    processor = hf.AutoImageProcessor.from_pretrained(CFG.image_model_id)
    de_processor = hf.VisionTextDualEncoderProcessor(image_processor=processor, tokenizer=tokenizer)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)
    de_model = hf.VisionTextDualEncoderModel(vision_model=image_model, text_model=text_model)
else:
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    # Adding a config to the image_model gets rid of lots of pretrained weights
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)

# Train/Val Split

In [29]:
def dataframe_train_test_split(df, target_label, seed=CFG.seed, test_size=0.2, split_labels=True):
    train = df.sample(frac=(1.0 - test_size), random_state=seed).reset_index(drop=True)
    test = df.drop(train.index).sample(frac=1.0, random_state=seed).reset_index(drop=True)

    if split_labels:
        return train.drop(columns=target_label), test.drop(columns=target_label), train[target_label], test[target_label]
    else:
        return train, test

X_train, X_val, y_train, y_val = dataframe_train_test_split(mini_df[['meme', 'text', 'label_task5']], 'label_task5', test_size=0.2, seed=CFG.seed)
X_train.head()

Unnamed: 0,meme,text
0,211600.jpeg,Dating after 25 Learn how to be a step dad 557
1,110150.jpeg,"ESTAMOS ENAMORADOS, ASÉ-QUE-NO-DEDORIA TENER S..."
2,112026.jpeg,BUUU ZORRA!!
3,210472.jpeg,Claims to be Feminist FOX STUDT MEN IN 45 HAPP...
4,211170.jpeg,"NOT ALL MEN NOT ALL MEN, BUT A STATISTICALLY S..."


# Custom Dataset Definition

In [30]:
class ExistDataset(Dataset):
    def __init__(self, features, img_dir, labels=None, test=False, img_transform=None, caption_transform=None, target_transform=None):
        self.features = features
        self.labels = labels
        self.img_dir = img_dir
        self.test = test
        self.img_transform = img_transform
        self.caption_transform = caption_transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        img_path = str(self.img_dir.joinpath(self.features['meme'].iloc[idx]))
        if CFG.is_mclip:
            image = Image.open(img_path)
        else:
            image = read_image(img_path, mode=ImageReadMode.RGB).to(device=CFG.device)
        caption = self.features['text'].iloc[idx]
        
        if not self.test:
            label = self.labels.iloc[idx]
        else:
            identity = self.features['id_EXIST'].iloc[idx]
        
        if self.img_transform:
            image = self.img_transform(image)
        if self.caption_transform:
            caption = self.caption_transform(caption)
        if not self.test and self.target_transform:
            label = self.target_transform(label)
            
        if CFG.split_lang:
            caption = f'Language: {self.features["lang"].iloc[idx]} - {caption}'
            
        if CFG.is_mclip:
            processed = tokenizer(caption, padding=True, return_tensors='pt')
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = image_processor(image)
        elif CFG.use_multimodal:
            processed = mm_processor(text=caption, images=image, return_tensors="pt", padding=True, truncation=True)
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = processed['pixel_values']
        elif CFG.use_dualencoder:
            processed = de_processor(text=caption, images=image, return_tensors="pt")
            seq = processed['input_ids']
            mask = processed['attention_mask']
            image = processed['pixel_values']
        else:
            processed = tokenizer.encode_plus(
                caption,
                padding='longest',
                truncation=True,
                return_tensors='pt'
            )
            seq = processed['input_ids']
            mask = processed['attention_mask']
        
        if not self.test:
            label = torch.tensor([label]).long()
            return image, seq, mask, label
        
        return identity, image, seq, mask

In [31]:
class Collator(object):
    def __init__(self, test=False):
        self.test = test
    def __call__(self, batch):
        if not self.test:
            images, seqs, masks, labels = zip(*batch)
            labels = torch.stack(labels)
        else:
            ids, images, seqs, masks = zip(*batch)

        seqs = [seq.squeeze(dim=0) for seq in seqs]
        masks = [mask.squeeze(dim=0) for mask in masks]
        images = [image.squeeze(dim=0) for image in images]

        seqs = nn.utils.rnn.pad_sequence(seqs, batch_first=True)
        masks = nn.utils.rnn.pad_sequence(masks, batch_first=True)

        images = torch.stack(images)
        
        if not self.test:
            return images, seqs, masks, labels
        
        return ids, images, seqs, masks

In [32]:
resizer = Resize((224, 224), antialias=True)

def resize_images(img_tensor):
    return resizer(img_tensor)

# Dataset Initialisation

In [33]:
train_dataset = ExistDataset(X_train, CFG.images_base_path, labels=y_train, img_transform=resize_images)
len(train_dataset)

3235

In [34]:
val_dataset = ExistDataset(X_val, CFG.images_base_path, labels=y_val, img_transform=resize_images)
len(val_dataset)

809

# Model Architecture

In [35]:
class ConcatArch(nn.Module):
    def __init__(self, hidden_size, hidden_layers, dropout, num_classes, use_multimodal=False, use_dualencoder=False, is_mclip=False):
        super().__init__()
        
        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.use_multimodal = use_multimodal
        self.use_dualencoder = use_dualencoder
        self.is_mclip = is_mclip
        self.is_vilt = 'ViltForMaskedLM' in CFG.multimodal_model_config.architectures
        
        if self.is_mclip:
            self.text_model = text_model
            self.image_model = image_model
        elif self.use_multimodal:
            self.mm_model = mm_model
        elif self.use_dualencoder:
            self.de_model = de_model
        else:
            self.text_model = text_model
            self.image_model = image_model
        
        if self.is_mclip:
            self.fc1 = nn.Linear(1280, self.hidden_size)
        elif self.use_multimodal:
            if self.is_vilt and CFG.use_lstm:
                out_channels = CFG.mlp_hidden_size + CFG.multimodal_model_config.hidden_size
                self.lstm = nn.LSTM(CFG.multimodal_model_config.hidden_size, CFG.mlp_hidden_size, batch_first=True)
            elif self.is_vilt and CFG.use_mask_split:
                out_channels = CFG.multimodal_model_config.hidden_size * 3
            elif self.is_vilt and CFG.use_attn:
                self.attn = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
            elif self.is_vilt and CFG.use_modal_attn:
                self.attn1 = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
                self.attn2 = nn.Sequential(
                    nn.Linear(CFG.multimodal_model_config.hidden_size, CFG.mlp_attn_dim),
                    nn.Tanh(),
                    nn.Linear(CFG.mlp_attn_dim, 1),
                    nn.Softmax(dim=1)
                )
                out_channels = CFG.multimodal_model_config.hidden_size * 2
            elif self.is_vilt:
                out_channels = CFG.multimodal_model_config.hidden_size
            else:
                out_channels = 2 * CFG.multimodal_model_config.projection_dim
            self.fc1 = nn.Linear(out_channels, self.hidden_size)
        elif self.use_dualencoder:
            self.fc1 = nn.Linear(2 * 512, self.hidden_size)
        else:
            self.fc1 = nn.Linear(CFG.text_model_config.hidden_size + CFG.image_model_config.hidden_size, self.hidden_size)
        self.hiddens = nn.ModuleList([nn.Linear(self.hidden_size, self.hidden_size) for _ in range(self.hidden_layers)])
        self.fc2 = nn.Linear(self.hidden_size, num_classes)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        
        if CFG.init_weights:
            self._init_weights(self.fc1)
            for hidden in self.hiddens:
                self._init_weights(hidden)
            self._init_weights(self.fc2)

    def forward(self, tokens, mask, image):
        if self.is_mclip:
            emb_text = self.text_model.forward(tokens, mask)
            emb_img = self.image_model.encode_image(image)
            x = torch.cat([emb_text, emb_img], dim=1)
        elif self.use_multimodal:
            mm_output = self.mm_model(input_ids=tokens, attention_mask=mask, pixel_values=image, output_hidden_states=True)
            cats = [mm_output.pooler_output] if self.is_vilt else [mm_output.text_embeds, mm_output.image_embeds]
            
            if self.is_vilt and CFG.use_lstm:
                # First hidden state is apparently the embedding output
                # https://discuss.huggingface.co/t/hidden-states-embedding-tensors/3549/
                layerwise_cls = torch.stack([h[:, 0, :] for h in mm_output.hidden_states[1:]], dim=1)
                _, (h, _) = self.lstm(layerwise_cls)
                h = h.squeeze(dim=0)
                cats.append(h)

            if self.is_vilt and CFG.use_mask_split:
                last_h = mm_output.last_hidden_state
                mask_len = mask.shape[1]
                mean_pooled_text = torch.mean(last_h[:, :mask_len, :], dim=1)
                mean_pooled_img = torch.mean(last_h[:, mask_len:, :], dim=1)
                cats += [mean_pooled_text, mean_pooled_img]

            if self.is_vilt and CFG.use_attn:
                last_h = mm_output.last_hidden_state
                attentions = self.attn(last_h)
                x = torch.sum(attentions * last_h, dim=1)

                cls = last_h[:, 0, :]
                x += cls
            elif self.is_vilt and CFG.use_modal_attn:
                last_h = mm_output.last_hidden_state
                mask_len = mask.shape[1]
                text_split = last_h[:, :mask_len, :]
                img_split = last_h[:, mask_len:, :]
                text_attentions = self.attn1(text_split)
                img_attentions = self.attn2(img_split)
                x1 = torch.sum(text_attentions * text_split, dim=1)
                x2 = torch.sum(img_attentions * img_split, dim=1)

                x = torch.cat([x1, x2], dim=1)

                cls = last_h[:, 0, :]
                cls = torch.cat([cls, cls], dim=1)
                x += cls
            else:
                x = torch.cat(cats, dim=1)
        elif self.use_dualencoder:
            de_output = self.de_model(input_ids=tokens, attention_mask=mask, pixel_values=image)
            x = torch.cat([de_output.text_embeds, de_output.image_embeds], dim=1)
        else:
            cls_text = self.text_model(tokens, attention_mask=mask).last_hidden_state[:, 0, :]
            cls_img = self.image_model(image).last_hidden_state[:, 0, :]
            x = torch.cat([cls_text, cls_img], dim=1)

        x = self.fc1(x)
        x = self.activation(x)
        x = self.dropout(x)
        for hidden in self.hiddens:
            x = hidden(x)
            x = self.activation(x)
            x = self.dropout(x)
        x = self.fc2(x)
        
        output = x
        return output.float()
    
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=CFG.mlp_init_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=CFG.mlp_init_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

# Utility Functions

In [36]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

In [37]:
def get_score(y_trues, y_preds):
    macro_f1 = f1_score(y_trues, y_preds, average='macro')
    return macro_f1

# Train/Val/Test Loops

In [38]:
def train_loop(model, optimizer, train_dataloader, epoch, loss_fn):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    train_losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    
    for step, (image, seq, mask, label) in enumerate(tqdm(train_dataloader)):
        optimizer.zero_grad()
        
        train_image = image.to(device=CFG.device)
        train_seq = seq.to(device=CFG.device)
        train_mask = mask.to(device=CFG.device)
        
        batch_size = train_image.shape[0]
        
        label = label.squeeze(dim=1).to(device=CFG.device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(train_seq, train_mask, train_image)
            
        loss = loss_fn(output, label)
        
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
            
        loss.backward()
        # scaler.scale(loss).backward()
        grad_norm = nn.utils.clip_grad_norm_(model.parameters(), CFG.mlp_grad_clip)
        optimizer.step()
        
        train_losses.update(loss.item(), batch_size)
        
        # if (step + 1) % CFG.gradient_accumulation_steps == 0:
        #     scaler.step(optimizer)
        #     scaler.update()
        #     optimizer.zero_grad()
        #     global_step += 1
            
        end = time.time()

        if step % CFG.print_freq == 0 or step == (len(train_dataloader) - 1):
            print(f'Epoch: [{epoch + 1}][{step}/{len(train_dataloader)}] '
                    f'Elapsed {timeSince(start, float(step + 1) / len(train_dataloader)):s} '
                    f'Loss: {train_losses.val:.4f} ({train_losses.avg:.4f}) '
                    f'Grad: {grad_norm:.4f}')
        
        torch.cuda.empty_cache()
        gc.collect()
        
        
    return train_losses.avg

In [39]:
def valid_loop(model, valid_dataloader, loss_fn):
    all_preds = []
    all_labels = []
    
    model.eval()
    valid_losses = AverageMeter()
    start = end = time.time()
    
    for step, (image, seq, mask, label) in enumerate(tqdm(valid_dataloader)):
        valid_image = image.to(device=CFG.device)
        valid_seq = seq.to(device=CFG.device)
        valid_mask = mask.to(device=CFG.device)

        batch_size = valid_image.shape[0]

        label = label.squeeze(dim=1).to(device=CFG.device)

        with torch.no_grad():
            output = model(valid_seq, valid_mask, valid_image)

        loss = loss_fn(output, label)

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps

        valid_losses.update(loss.item(), batch_size)
        predicted = output.argmax(dim=1)

        all_labels.append(label)
        all_preds.append(predicted)
        
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_dataloader) - 1):
            print(f'Validation: [{step}/{len(valid_dataloader)}] '
                    f'Elapsed {timeSince(start, float(step + 1) / len(valid_dataloader)):s} '
                    f'Loss: {valid_losses.val:.4f} ({valid_losses.avg:.4f})')
            
    all_preds = torch.cat(all_preds, dim=0)
    all_labels = torch.cat(all_labels, dim=0)
    
    all_preds_np = all_preds.cpu().numpy().astype(int)
    all_labels_np = all_labels.cpu().numpy().astype(int)
    
    return valid_losses.avg, all_preds_np, all_labels_np 

In [40]:
def test_loop(model, test_dataloader):
    all_soft = []
    all_hard = []
    all_ids = []
    
    model.eval()
    
    for identity, image, seq, mask in tqdm(test_dataloader):
        test_image = image.to(device=CFG.device)
        test_seq = seq.to(device=CFG.device)
        test_mask = mask.to(device=CFG.device)

        with torch.no_grad():
            output = model(test_seq, test_mask, test_image)
        
        soft = nn.functional.softmax(output, dim=1)
        hard = output.argmax(dim=1)
        
        all_ids += list(identity)
        all_soft.append(soft)
        all_hard.append(hard)
        
    all_soft = torch.cat(all_soft, dim=0)
    all_hard = torch.cat(all_hard, dim=0)
    
    return all_ids, all_hard, all_soft

# Training and Validation

In [41]:
collate = Collator()
train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, collate_fn=collate)
valid_dataloader = DataLoader(val_dataset, batch_size=CFG.batch_size, collate_fn=collate)

model = ConcatArch(
    hidden_size=CFG.mlp_hidden_size,
    hidden_layers=CFG.mlp_hidden_layers,
    dropout=CFG.mlp_dropout,
    num_classes=CFG.num_class,
    use_multimodal=CFG.use_multimodal,
    use_dualencoder=CFG.use_dualencoder,
    is_mclip=CFG.is_mclip
).to(CFG.device)

optim = AdamWScheduleFree(model.parameters(), lr=CFG.learning_rate, eps=CFG.eps, betas=CFG.betas)
loss_fn = nn.CrossEntropyLoss()

In [24]:
best_score = 0

for epoch in range(CFG.epochs):
    
    train_loss = train_loop(model, optim, train_dataloader, epoch, loss_fn)

    valid_loss, all_preds_np, all_labels_np = valid_loop(model, valid_dataloader, loss_fn)
    
    if CFG.debug:
        print(all_labels_np)
        print(all_preds_np)
    
    score = get_score(all_labels_np, all_preds_np)
    
    report = classification_report(all_labels_np, all_preds_np, digits=4)
    print(report)
    
    if CFG.save_models and score > best_score:
        model_name = CFG.multimodal_model_id if CFG.use_multimodal else '-'.join([CFG.text_model_id, CFG.image_model_id])
        if CFG.use_lstm:
            model_name += '-lstm'
        if CFG.use_attn:
            model_name += '-attn'
        elif CFG.use_modal_attn:
            model_name += '-mattn'
        if CFG.use_mask_split:
            model_name += '-msplit'
        model_name = model_name.replace('/', '-') + f'_score_{score:.4f}'
        torch.save({'model': model.state_dict()}, f'{model_name}.pth')
        print(f'Saved model: {model_name}')
        with open(f'{model_name}_results.txt', 'w', encoding='utf-8') as fp:
            fp.write(report)
        best_score = score
    
    print(f'Epoch {epoch + 1}/{CFG.epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {valid_loss:.4f}')


  2%|▏         | 1/51 [00:03<02:32,  3.04s/it]

Epoch: [1][0/51] Elapsed 0m 2s (remain 2m 24s) Loss: 0.6685 (0.6685) Grad: 11.1624


100%|██████████| 51/51 [02:11<00:00,  2.57s/it]


Epoch: [1][50/51] Elapsed 2m 11s (remain 0m 0s) Loss: 0.4588 (0.5325) Grad: 14.6177


  8%|▊         | 1/13 [00:00<00:10,  1.17it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7452 (0.7452)


100%|██████████| 13/13 [00:10<00:00,  1.28it/s]

Validation: [12/13] Elapsed 0m 10s (remain 0m 0s) Loss: 0.4587 (0.6973)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1




Saved model: openai-clip-vit-base-patch32_score_0.7157
Epoch 1/10, Train Loss: 0.5325, Validation Loss: 0.6973


  2%|▏         | 1/51 [00:02<02:22,  2.84s/it]

Epoch: [2][0/51] Elapsed 0m 2s (remain 2m 15s) Loss: 0.4542 (0.4542) Grad: 58.8693


100%|██████████| 51/51 [02:11<00:00,  2.57s/it]


Epoch: [2][50/51] Elapsed 2m 10s (remain 0m 0s) Loss: 0.3868 (0.4520) Grad: 3.3542


  8%|▊         | 1/13 [00:00<00:10,  1.20it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 10s) Loss: 0.7895 (0.7895)


100%|██████████| 13/13 [00:10<00:00,  1.30it/s]

Validation: [12/13] Elapsed 0m 10s (remain 0m 0s) Loss: 0.3489 (0.6761)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1




Saved model: openai-clip-vit-base-patch32_score_0.7548
Epoch 2/10, Train Loss: 0.4520, Validation Loss: 0.6761


  2%|▏         | 1/51 [00:03<02:30,  3.00s/it]

Epoch: [3][0/51] Elapsed 0m 2s (remain 2m 23s) Loss: 0.3163 (0.3163) Grad: 2.5648


100%|██████████| 51/51 [02:11<00:00,  2.57s/it]


Epoch: [3][50/51] Elapsed 2m 11s (remain 0m 0s) Loss: 0.2614 (0.3896) Grad: 9.5256


  8%|▊         | 1/13 [00:00<00:09,  1.30it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.7910 (0.7910)


100%|██████████| 13/13 [00:09<00:00,  1.31it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2983 (0.6438)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.7686
Epoch 3/10, Train Loss: 0.3896, Validation Loss: 0.6438


  2%|▏         | 1/51 [00:02<02:28,  2.96s/it]

Epoch: [4][0/51] Elapsed 0m 2s (remain 2m 21s) Loss: 0.3979 (0.3979) Grad: 3.1550


100%|██████████| 51/51 [02:11<00:00,  2.57s/it]


Epoch: [4][50/51] Elapsed 2m 11s (remain 0m 0s) Loss: 0.3827 (0.3313) Grad: 17.1887


  8%|▊         | 1/13 [00:00<00:09,  1.28it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.7175 (0.7175)


100%|██████████| 13/13 [00:09<00:00,  1.33it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.3235 (0.6643)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.7853
Epoch 4/10, Train Loss: 0.3313, Validation Loss: 0.6643


  2%|▏         | 1/51 [00:02<02:28,  2.97s/it]

Epoch: [5][0/51] Elapsed 0m 2s (remain 2m 21s) Loss: 0.3234 (0.3234) Grad: 7.1236


100%|██████████| 51/51 [02:10<00:00,  2.57s/it]


Epoch: [5][50/51] Elapsed 2m 10s (remain 0m 0s) Loss: 0.2907 (0.2887) Grad: 10.7329


  8%|▊         | 1/13 [00:00<00:09,  1.26it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.6382 (0.6382)


100%|██████████| 13/13 [00:09<00:00,  1.34it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.2412 (0.6486)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.7914
Epoch 5/10, Train Loss: 0.2887, Validation Loss: 0.6486


  2%|▏         | 1/51 [00:02<02:25,  2.90s/it]

Epoch: [6][0/51] Elapsed 0m 2s (remain 2m 18s) Loss: 0.3769 (0.3769) Grad: 9.2370


100%|██████████| 51/51 [02:10<00:00,  2.56s/it]


Epoch: [6][50/51] Elapsed 2m 10s (remain 0m 0s) Loss: 0.4241 (0.2490) Grad: 36.2116


  8%|▊         | 1/13 [00:00<00:09,  1.26it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.5432 (0.5432)


100%|██████████| 13/13 [00:09<00:00,  1.32it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.1352 (0.6425)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.8029
Epoch 6/10, Train Loss: 0.2490, Validation Loss: 0.6425


  2%|▏         | 1/51 [00:02<02:27,  2.96s/it]

Epoch: [7][0/51] Elapsed 0m 2s (remain 2m 21s) Loss: 0.1594 (0.1594) Grad: 32.8946


100%|██████████| 51/51 [02:10<00:00,  2.56s/it]


Epoch: [7][50/51] Elapsed 2m 10s (remain 0m 0s) Loss: 0.0668 (0.2056) Grad: 2.5356


  8%|▊         | 1/13 [00:00<00:09,  1.23it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.6131 (0.6131)


100%|██████████| 13/13 [00:09<00:00,  1.33it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.1745 (0.6216)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.8309
Epoch 7/10, Train Loss: 0.2056, Validation Loss: 0.6216


  2%|▏         | 1/51 [00:02<02:26,  2.94s/it]

Epoch: [8][0/51] Elapsed 0m 2s (remain 2m 20s) Loss: 0.1173 (0.1173) Grad: 5.6827


100%|██████████| 51/51 [02:11<00:00,  2.58s/it]


Epoch: [8][50/51] Elapsed 2m 11s (remain 0m 0s) Loss: 0.1462 (0.1805) Grad: 41.7379


  8%|▊         | 1/13 [00:00<00:09,  1.22it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.5573 (0.5573)


100%|██████████| 13/13 [00:09<00:00,  1.32it/s]

Validation: [12/13] Elapsed 0m 9s (remain 0m 0s) Loss: 0.0420 (0.6264)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1 




Saved model: openai-clip-vit-base-patch32_score_0.8345
Epoch 8/10, Train Loss: 0.1805, Validation Loss: 0.6264


  2%|▏         | 1/51 [00:02<02:28,  2.96s/it]

Epoch: [9][0/51] Elapsed 0m 2s (remain 2m 21s) Loss: 0.1479 (0.1479) Grad: 3.6130


100%|██████████| 51/51 [02:11<00:00,  2.58s/it]


Epoch: [9][50/51] Elapsed 2m 11s (remain 0m 0s) Loss: 0.1367 (0.1566) Grad: 10.4167


  8%|▊         | 1/13 [00:00<00:09,  1.21it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.5694 (0.5694)


100%|██████████| 13/13 [00:10<00:00,  1.27it/s]


Validation: [12/13] Elapsed 0m 10s (remain 0m 0s) Loss: 0.1378 (0.6233)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1

  2%|▏         | 1/51 [00:02<02:22,  2.86s/it]

Epoch: [10][0/51] Elapsed 0m 2s (remain 2m 16s) Loss: 0.0518 (0.0518) Grad: 42.3347


100%|██████████| 51/51 [02:12<00:00,  2.61s/it]


Epoch: [10][50/51] Elapsed 2m 12s (remain 0m 0s) Loss: 0.0224 (0.1385) Grad: 1.8575


  8%|▊         | 1/13 [00:00<00:09,  1.21it/s]

Validation: [0/13] Elapsed 0m 0s (remain 0m 9s) Loss: 0.5171 (0.5171)


100%|██████████| 13/13 [00:10<00:00,  1.25it/s]

Validation: [12/13] Elapsed 0m 10s (remain 0m 0s) Loss: 0.1954 (0.6385)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1




In [42]:
del model
torch.cuda.empty_cache()
gc.collect()

39

# Inference From Checkpoint

In [43]:
inf_model_name = 'dandelin-vilt-b32-mlm_score_0.8672'
inf_model = ConcatArch(
    hidden_size=CFG.mlp_hidden_size,
    hidden_layers=CFG.mlp_hidden_layers,
    dropout=CFG.mlp_dropout,
    num_classes=CFG.num_class,
    use_multimodal=CFG.use_multimodal,
    use_dualencoder=CFG.use_dualencoder,
    is_mclip=CFG.is_mclip
).to(CFG.device)
inf_model.load_state_dict(torch.load('Task 5/' + inf_model_name + '.pth', map_location=torch.device(CFG.device))['model'])
inf_model

ConcatArch(
  (mm_model): ViltModel(
    (embeddings): ViltEmbeddings(
      (text_embeddings): TextEmbeddings(
        (word_embeddings): Embedding(30522, 768)
        (position_embeddings): Embedding(40, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (patch_embeddings): ViltPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32))
      )
      (token_type_embeddings): Embedding(2, 768)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViltEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViltLayer(
          (attention): ViltAttention(
            (attention): ViltSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_fea

In [44]:
valid_loss, all_preds_np, all_labels_np = valid_loop(inf_model, valid_dataloader, loss_fn)
if CFG.debug:
    print(all_labels_np)
    print(all_preds_np)
    
score = get_score(all_labels_np, all_preds_np)

report = classification_report(all_labels_np, all_preds_np, digits=4)
print(report)

  4%|▍         | 1/26 [00:01<00:40,  1.61s/it]

Validation: [0/26] Elapsed 0m 1s (remain 0m 40s) Loss: 1.0176 (1.0176)


100%|██████████| 26/26 [00:21<00:00,  1.21it/s]

Validation: [25/26] Elapsed 0m 21s (remain 0m 0s) Loss: 1.2675 (0.8792)
[2 2 0 0 1 1 0 0 1 1 1 2 2 2 0 0 1 2 1 0 1 1 1 0 1 0 1 0 1 0 2 2 1 0 1 1 1
 0 2 0 0 0 2 0 2 1 0 1 1 1 1 0 0 1 1 0 1 2 0 1 0 2 1 1 0 1 0 0 1 1 0 0 1 1
 1 0 2 0 1 1 0 1 1 0 2 1 1 0 2 1 0 0 0 1 2 0 1 0 0 1 2 2 1 1 2 2 1 0 0 0 0
 1 1 1 0 0 1 2 2 0 1 0 1 0 1 1 0 2 1 0 1 2 2 1 1 0 0 0 1 2 0 1 1 0 0 0 2 2
 0 1 0 2 1 1 1 0 0 2 1 0 0 0 2 2 0 0 0 1 0 0 0 1 1 1 2 1 0 2 0 1 2 0 1 1 1
 2 2 0 0 1 0 1 0 1 1 0 1 1 2 0 2 1 1 2 2 1 0 0 1 0 2 1 2 2 0 0 0 0 0 2 1 1
 1 1 1 0 0 1 1 0 1 1 0 1 1 2 1 0 0 1 0 0 0 1 2 1 1 1 2 0 0 0 0 1 1 1 1 2 0
 1 1 0 1 1 1 0 0 1 1 0 0 0 1 0 1 0 1 1 0 1 1 0 0 1 0 1 1 0 1 2 1 0 0 1 1 1
 2 1 1 0 0 2 0 2 1 2 1 1 1 1 0 1 0 1 2 0 1 1 2 0 1 0 1 2 1 2 1 1 1 2 0 1 1
 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 2 0 0 1 2 1 0 0 2 0 1 1 0 1 1 1 2 2 0 1 0 0
 1 0 0 0 2 1 1 1 1 0 0 1 0 2 0 1 1 0 0 0 1 2 0 1 0 2 0 0 0 1 2 1 0 1 1 2 1
 0 0 1 1 0 2 0 1 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 1 1 2 2 0 2 1 0 0 1 0 1 2 1
 1 1 0 1 2 0 1 1 0 1 0 0 1 1




In [45]:
with open(f'{inf_model_name}_results.txt', 'w', encoding='utf-8') as fp:
    fp.write(report)

In [46]:
with open('EXIST 2024 Lab/EXIST 2024 Memes Dataset/test/EXIST2024_test_clean.json', 'r', encoding='utf-8') as fp:
    test_annotations = json.load(fp)
test_df = pd.DataFrame.from_dict(test_annotations).T
print(test_df.shape)
test_df.head()

(1053, 13)


Unnamed: 0,id_EXIST,lang,text,meme,path_memes,number_annotators,annotators,gender_annotators,age_annotators,ethnicities_annotators,study_levels_annotators,countries_annotators,split
310001,310001,es,Soy como la madre de mi hermano,310001.jpeg,memes/310001.jpeg,6,"[Annotator_888, Annotator_889, Annotator_890, ...","[M, M, M, F, F, F]","[46+, 23-45, 18-22, 46+, 18-22, 23-45]","[White or Caucasian, White or Caucasian, White...","[Master’s degree, Bachelor’s degree, High scho...","[Italy, Spain, Portugal, Mexico, United Kingdo...",TEST-MEME_ES
310002,310002,es,DESAFI LLEVAR EN IRÁN LAS AUTORIDADES ESTÁN RE...,310002.jpeg,memes/310002.jpeg,6,"[Annotator_888, Annotator_889, Annotator_890, ...","[M, M, M, F, F, F]","[46+, 23-45, 18-22, 46+, 18-22, 23-45]","[White or Caucasian, White or Caucasian, White...","[Master’s degree, Bachelor’s degree, High scho...","[Italy, Spain, Portugal, Mexico, United Kingdo...",TEST-MEME_ES
310003,310003,es,Vincent Vega @VincentVega677-7h D *** YA VIENE...,310003.jpeg,memes/310003.jpeg,6,"[Annotator_888, Annotator_889, Annotator_890, ...","[M, M, M, F, F, F]","[46+, 23-45, 18-22, 46+, 18-22, 23-45]","[White or Caucasian, White or Caucasian, White...","[Master’s degree, Bachelor’s degree, High scho...","[Italy, Spain, Portugal, Mexico, United Kingdo...",TEST-MEME_ES
310004,310004,es,A ti Mujer soñadora Feliz Día,310004.jpeg,memes/310004.jpeg,6,"[Annotator_888, Annotator_889, Annotator_890, ...","[M, M, M, F, F, F]","[46+, 23-45, 18-22, 46+, 18-22, 23-45]","[White or Caucasian, White or Caucasian, White...","[Master’s degree, Bachelor’s degree, High scho...","[Italy, Spain, Portugal, Mexico, United Kingdo...",TEST-MEME_ES
310005,310005,es,Lya Gonzalez @LyaGonzalez1 Aquí hay revolución...,310005.jpeg,memes/310005.jpeg,6,"[Annotator_888, Annotator_889, Annotator_890, ...","[M, M, M, F, F, F]","[46+, 23-45, 18-22, 46+, 18-22, 23-45]","[White or Caucasian, White or Caucasian, White...","[Master’s degree, Bachelor’s degree, High scho...","[Italy, Spain, Portugal, Mexico, United Kingdo...",TEST-MEME_ES


In [47]:
test_mini_df = test_df[['id_EXIST', 'meme', 'text', 'lang', 'ethnicities_annotators']].reset_index(drop=True)
test_mini_df.head()

Unnamed: 0,id_EXIST,meme,text,lang,ethnicities_annotators
0,310001,310001.jpeg,Soy como la madre de mi hermano,es,"[White or Caucasian, White or Caucasian, White..."
1,310002,310002.jpeg,DESAFI LLEVAR EN IRÁN LAS AUTORIDADES ESTÁN RE...,es,"[White or Caucasian, White or Caucasian, White..."
2,310003,310003.jpeg,Vincent Vega @VincentVega677-7h D *** YA VIENE...,es,"[White or Caucasian, White or Caucasian, White..."
3,310004,310004.jpeg,A ti Mujer soñadora Feliz Día,es,"[White or Caucasian, White or Caucasian, White..."
4,310005,310005.jpeg,Lya Gonzalez @LyaGonzalez1 Aquí hay revolución...,es,"[White or Caucasian, White or Caucasian, White..."


In [48]:
test_dataset = ExistDataset(test_mini_df[['id_EXIST', 'meme', 'text']], CFG.images_base_path_test, test=True, img_transform=resize_images)
test_dataset

<__main__.ExistDataset at 0x13d29b94250>

# Test Prediction

In [49]:
test_collate = Collator(test=True)
test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, collate_fn=test_collate)

ids, hards, softs = test_loop(inf_model, test_dataloader)
hards = hards.tolist()
softs = softs.tolist()
print(ids)
print(hards)
print(softs)

100%|██████████| 33/33 [00:27<00:00,  1.19it/s]

['310001', '310002', '310003', '310004', '310005', '310006', '310007', '310008', '310009', '310010', '310011', '310012', '310013', '310014', '310015', '310016', '310017', '310018', '310019', '310020', '310021', '310022', '310023', '310024', '310025', '310026', '310027', '310028', '310029', '310030', '310031', '310032', '310033', '310034', '310035', '310036', '310037', '310038', '310039', '310040', '310041', '310042', '310043', '310044', '310045', '310046', '310047', '310048', '310049', '310050', '310051', '310052', '310053', '310054', '310055', '310056', '310057', '310058', '310059', '310060', '310061', '310062', '310063', '310064', '310065', '310066', '310067', '310068', '310069', '310070', '310071', '310072', '310073', '310074', '310075', '310076', '310077', '310078', '310079', '310080', '310081', '310082', '310083', '310084', '310085', '310086', '310087', '310088', '310089', '310090', '310091', '310092', '310093', '310094', '310095', '310096', '310097', '310098', '310099', '310100',




In [50]:
def mapper(value):
    match value:
        case 0:
            return 'NO'
        case 1:
            return 'DIRECT'
        case 2:
            return 'JUDGEMENTAL'

hard_dicts = []
for identity, hard in zip(ids, hards):
    hard_dicts.append({
        'test_case': 'EXIST2024',
        'id': str(identity),
        'value': mapper(hard)
    })
with open(inf_model_name + '_hard.json', 'w') as fp:
    json.dump(hard_dicts, fp)

In [51]:
soft_dicts = []
for identity, soft in zip(ids, softs):
    soft_dicts.append({
        'test_case': 'EXIST2024',
        'id': str(identity),
        'value': {
            'DIRECT': soft[1],
            'NO': soft[0],
            'JUDGEMENTAL': soft[2],
        }
    })
with open(inf_model_name + '_soft.json', 'w') as fp:
    json.dump(soft_dicts, fp)