# Preamble: Install and Import Packages

In [1]:
!pip install transformers==4.40.1 git+https://github.com/csebuetnlp/normalizer multilingual-clip open_clip_torch -q

[0m

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torchvision.transforms import Resize
from torchvision.io import read_image, ImageReadMode
from multilingual_clip import Config_MCLIP
import open_clip
import json
import pandas as pd
import random
from pathlib import Path
import cv2
import numpy as np
import transformers as hf
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report
from PIL import Image
import os
import gc
import time
import math
from normalizer import normalize

In [3]:
# Turn on autograd anomaly detection globally for every backward pass in this notebook.
# NOTE(review): PyTorch documents this as a debugging aid that slows execution —
# confirm it should stay enabled for full training runs.
torch.autograd.set_detect_anomaly(True)
# Bare expression: shows the installed transformers version as the cell output.
hf.__version__

'4.40.1'

# Initialise the Configuration and Random Seeds

In [4]:
# Overrides forwarded as `text_config` when CFG loads the multimodal model config (empty = no overrides).
_text_model_config = {}

# Overrides forwarded as `vision_config` when CFG loads the multimodal model config.
_image_model_config = {
    "attention_probs_dropout_prob": 0.0,
    "encoder_stride": 16,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.0,
    "hidden_size": 768,
    "image_size": 224,
    "initializer_range": 0.02,
    "intermediate_size": 3072,
    "layer_norm_eps": 1e-12,
    "num_attention_heads": 12,
    "num_channels": 3,
    "num_hidden_layers": 0,
    "patch_size": 16,
    "qkv_bias": True,
}

# Checkpoint identifiers. Alternative set-ups are kept below as commented-out lines;
# uncomment exactly one group per experiment.

# Dual encoder/Concat
tokeniser_model_id = 'xlm-roberta-base'
text_model_id = 'xlm-roberta-base'
image_model_id = 'google/vit-base-patch16-224-in21k'

# CLIP
# multimodal_model_id = 'openai/clip-vit-base-patch32'

# M-CLIP
# tokeniser_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# text_model_id = 'M-CLIP/XLM-Roberta-Large-Vit-B-16Plus'
# image_model_id = 'ViT-B-16-plus-240'
# Stays active even when M-CLIP is disabled because CFG copies it unconditionally;
# it is only passed on (as the open_clip `pretrained` tag) in the M-CLIP loading branch.
image_training_id = 'laion400m_e32'

# ViLT
multimodal_model_id = 'dandelin/vilt-b32-mlm'


class CFG:
    """Notebook-wide configuration namespace; values are read as ``CFG.<name>``.

    The ``*_model_config`` attributes call ``AutoConfig.from_pretrained`` while
    the class body executes, i.e. as soon as this cell runs.
    """
    # --- Backbone selection. The loading code checks is_mclip first, then
    # use_multimodal, then use_dualencoder; if all are False the separate
    # text + image encoders are used.
    use_multimodal = True
    use_dualencoder = False
    save_models = True  # write a checkpoint whenever validation accuracy improves
    is_mclip = False
    init_weights = False  # if True, re-initialise the MLP head with N(0, mlp_init_range)

    # --- Checkpoint identifiers copied from the module-level constants.
    tokeniser_model_id = tokeniser_model_id
    text_model_id = text_model_id
    image_model_id = image_model_id
    multimodal_model_id = multimodal_model_id
    image_training_id = image_training_id

    # --- Model configs. Both conditions are keyed on the *text* model id.
    # NOTE(review): presumably intentional, because the M-CLIP image id
    # ('ViT-B-16-plus-240') does not contain 'M-CLIP' — confirm.
    text_model_config = hf.AutoConfig.from_pretrained(text_model_id) if 'M-CLIP' not in text_model_id else None
    image_model_config = hf.AutoConfig.from_pretrained(image_model_id) if 'M-CLIP' not in text_model_id else None
    multimodal_model_config = hf.AutoConfig.from_pretrained(multimodal_model_id, text_config=_text_model_config, vision_config=_image_model_config)

    # --- Data locations and runtime.
    images_base_path = Path('/kaggle/input/vqa-bangla/Bangla_VQA/Bangla_VQA/images')
    images_base_path_test = Path('/kaggle/input/vqa-bangla/Bangla_VQA/Bangla_VQA/images')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    lang = 'en'  # 'bn' selects the Bangla question/answer columns, anything else the *_en columns
    debug = True
    print_freq = 50  # log every N steps inside the train/validation loops
    apex = True  # enables torch.cuda.amp.autocast in the loops (for faster training)

    # --- Optimisation hyper-parameters.
    epochs = 10
    learning_rate = 2e-4  # for adam optimizer
    eps = 1e-6
    betas = (0.9, 0.999)  # for adam optimizer
    batch_size = 128  # 128 if ViLT/CLIP, 32 if BLIP, 4 if M-CLIP
    max_len = 512
    weight_decay = 0.01  # for adam optimizer regularization parameter
    gradient_accumulation_steps = 1
    max_grad_norm = 1000
    seed = 42
    train = True

    # --- Classification head.
    num_classes = 0  # placeholder; overwritten once the answer vocabulary has been built
    mlp_hidden_size = 256
    mlp_hidden_layers = 0  # number of extra hidden Linear layers between fc1 and fc2
    mlp_dropout = 0.1
    mlp_grad_clip = 1.0  # max gradient norm used by the training loop
    mlp_init_range = 0.2  # std of the normal init applied when init_weights is True
    mlp_attn_dim = 256



config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/502 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

In [5]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + current CUDA device) RNGs.

    Also exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic kernels.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True
    
# Seed every RNG once, using the seed configured on CFG.
seed_everything(CFG.seed)

In [6]:
class MultilingualCLIP(hf.PreTrainedModel):
    """Text encoder: transformer backbone, mask-weighted mean pooling, linear projection."""

    config_class = Config_MCLIP.MCLIPConfig

    def __init__(self, config, *args, **kwargs):
        """Load the backbone named by ``config.modelBase`` and build the projection layer."""
        super().__init__(config, *args, **kwargs)
        cache_dir = kwargs.get("cache_dir")
        self.transformer = hf.AutoModel.from_pretrained(config.modelBase, cache_dir=cache_dir)
        self.LinearTransformation = torch.nn.Linear(
            in_features=config.transformerDimensions,
            out_features=config.numDims,
        )

    def forward(self, tokens, mask):
        """Return the projected, mask-averaged token embeddings.

        Args:
            tokens: Token ids passed to the backbone.
            mask: Attention mask; also used as the pooling weights.
        """
        token_states = self.transformer(tokens, attention_mask=mask)[0]
        # Zero out padded positions, then average over the real tokens only.
        masked_sum = (token_states * mask.unsqueeze(2)).sum(dim=1)
        token_counts = mask.sum(dim=1)[:, None]
        pooled = masked_sum / token_counts
        return self.LinearTransformation(pooled)

    @classmethod
    def _load_state_dict_into_model(cls, model, state_dict, pretrained_model_name_or_path, _fast_init=True):
        """Load ``state_dict`` directly into ``model`` and report three empty lists."""
        model.load_state_dict(state_dict)
        return model, [], [], []

# Preprocess the Dataset

In [7]:
def normalise_bn(text_bn):
    """Normalise ``text_bn`` with the csebuetnlp normalizer.

    Uses NFKC unicode normalisation applied last, and leaves punctuation,
    URLs and emoji unreplaced (their replacement arguments are ``None``).
    """
    normaliser_options = dict(
        unicode_norm="NFKC",
        punct_replacement=None,
        url_replacement=None,
        emoji_replacement=None,
        apply_unicode_norm_last=True,
    )
    return normalize(text_bn, **normaliser_options)

In [8]:
# Load the three pre-split CSV files of the Bangla VQA dataset.
train_df = pd.read_csv("/kaggle/input/vqa-bangla/updated_train.csv")
val_df = pd.read_csv("/kaggle/input/vqa-bangla/updated_valid.csv")
test_df = pd.read_csv("/kaggle/input/vqa-bangla/updated_test.csv")

# All splits concatenated with a fresh index; the answer vocabulary is built from this frame
# so that every answer in any split has a class id.
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,image_name,Captions,Question,Answer,Category,Question_en,Answer_en,Captions_en,Answer_fixed
0,bnature_663.jpg,খালের পানিতে তিনটি গাছের প্রতিচ্ছবি সাথে গৌধোল...,ছবিতে কতগুলো গাছের প্রতিচ্ছবি দেখা যাচ্ছে?,তিনটি,numeric,How many trees are reflected in the picture?,three,Goudholi's beauty with three trees reflected i...,তিন
1,chitron_5113.png,অনেকগুলো মানুষ বসে আছে। মঞ্চের উপর কয়েকজন মানু...,ছবিতে কতজন মানুষ মঞ্চের উপর দাঁড়িয়ে আছে?,পাঁচজন,numeric,How many people are on the stage?,five,"A lot of people were sitting, a few people wer...",পাঁচ
2,bnature_876.jpg,দুজন ছেলে ও দুজন মেয়ে রাস্তা দিয়ে পাশাপাশি হ...,ছবিতে কতজন ছেলে ও মেয়ে একসাথে হাটছে?,চারজন,numeric,How many boys and girls are walking together i...,four,Two boys and two girls walking side by side on...,চার
3,bnature_1007.jpg,"রাস্তা দিয়ে কয়েকজন ছাত্র ছাত্রী যাচ্ছে, যাদে...",ছবিতে কতজন ছাত্র ছাত্রী রাস্তা দিয়ে হাঁটছে?,৪ জন,numeric,How many students are walking on the street in...,four,"Several students walking on the street, carryi...",চার
4,chitron_7446.png,'১ ইট তালগাছ ১ টি খেজুর গাছ এবং রাস্তা দিয়ে ছা...,ছবিতে কতগুলো গাছ দেখা যাচ্ছে?,২ টি,numeric,How many trees are shown in the picture?,Two,1 brick palm tree 1 date tree and 4 school stu...,দুই


In [9]:
# Build the answer vocabulary over all splits.
#   all_labels : sorted list of answer strings, index == class id
#   label_map  : answer string -> class id (the key form looked up by the dataset)
# Answers are normalised BEFORE de-duplication so that, for Bangla, two raw
# spellings that normalise to the same string share one class instead of
# inflating CFG.num_classes with ids that can never be looked up.
_answer_column = 'Answer' if CFG.lang == 'bn' else 'Answer_en'
_raw_answers = df[_answer_column].unique().astype(str)
if CFG.lang == 'bn':
    _answer_keys = {normalise_bn(str(answer)) for answer in _raw_answers}
else:
    _answer_keys = {str(answer) for answer in _raw_answers}

all_labels = sorted(_answer_keys)
label_map = {label: idx for idx, label in enumerate(all_labels)}
CFG.num_classes = len(all_labels)

# Initialise the Processors/Tokenisers/Models

In [10]:
# Load only the tokenizer / processor / model objects needed by the backbone selected on CFG.
# Branch order is significant: is_mclip wins over use_multimodal, which wins over use_dualencoder.
if CFG.is_mclip:
    # M-CLIP: HF tokenizer + MultilingualCLIP text tower, open_clip image tower and its preprocessing transform.
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = MultilingualCLIP.from_pretrained(CFG.text_model_id).to(CFG.device)
    image_model, _, image_processor = open_clip.create_model_and_transforms(CFG.image_model_id, pretrained=CFG.image_training_id)
    image_model = image_model.to(CFG.device)
elif CFG.use_multimodal:
    # Single joint model (e.g. ViLT / CLIP) with its combined text+image processor.
    mm_processor = hf.AutoProcessor.from_pretrained(CFG.multimodal_model_id)
    mm_model = hf.AutoModel.from_pretrained(CFG.multimodal_model_id).to(CFG.device)
elif CFG.use_dualencoder:
    # Separate text and vision encoders wrapped in a VisionTextDualEncoderModel.
    # NOTE(review): padding/truncation are passed to from_pretrained rather than at call time —
    # confirm they have the intended effect.
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id, padding=True, truncation=True)
    processor = hf.AutoImageProcessor.from_pretrained(CFG.image_model_id)
    de_processor = hf.VisionTextDualEncoderProcessor(image_processor=processor, tokenizer=tokenizer)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)
    de_model = hf.VisionTextDualEncoderModel(vision_model=image_model, text_model=text_model)
else:
    # Plain concatenation baseline: independent text and image encoders, no image processor.
    tokenizer = hf.AutoTokenizer.from_pretrained(CFG.tokeniser_model_id)
    text_model = hf.AutoModel.from_pretrained(CFG.text_model_id).to(CFG.device)
    # Adding a config to the image_model gets rid of lots of pretrained weights
    image_model = hf.AutoModel.from_pretrained(CFG.image_model_id).to(CFG.device)



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

# Custom Dataset Definition

In [11]:
class VQADataset(Dataset):
    """Map-style dataset yielding ``(identity, image, seq, mask, label)`` per row.

    Tokenisation and image preprocessing happen per item, using the
    notebook-level tokenizer/processor that matches the backbone selected on
    ``CFG``.

    Args:
        features: DataFrame with ``image_name`` plus the question/answer columns.
        img_dir: ``Path`` of the directory containing the image files.
        img_transform: Optional callable applied to the loaded image.
        caption_transform: Optional callable applied to the normalised question.
        target_transform: Optional callable applied to the label tensor.
    """

    def __init__(self, features, img_dir, img_transform=None, caption_transform=None, target_transform=None):
        self.features = features
        self.img_dir = img_dir
        self.img_transform = img_transform
        self.caption_transform = caption_transform
        self.target_transform = target_transform

    def __len__(self):
        """Number of rows in ``features``."""
        return len(self.features)

    def __getitem__(self, idx):
        """Load, transform and encode the example at positional index ``idx``."""
        is_bangla = CFG.lang == 'bn'
        identity = self.features['image_name'].iloc[idx]
        img_path = str(self.img_dir.joinpath(identity))

        # M-CLIP's open_clip transform consumes PIL images; every other path reads an RGB tensor.
        # NOTE(review): the tensor is moved to CFG.device inside the dataset, which presumably
        # requires num_workers=0 when the device is CUDA — confirm.
        if CFG.is_mclip:
            image = Image.open(img_path)
        else:
            image = read_image(img_path, mode=ImageReadMode.RGB).to(device=CFG.device)

        question_column = 'Question' if is_bangla else 'Question_en'
        caption = normalise_bn(self.features[question_column].iloc[idx])

        # The lookup key must match the key form used when label_map was built.
        if is_bangla:
            answer_key = normalise_bn(str(self.features['Answer'].iloc[idx]))
        else:
            answer_key = str(self.features['Answer_en'].iloc[idx])
        label = torch.tensor(label_map[answer_key], dtype=torch.long)

        if self.img_transform:
            image = self.img_transform(image)
        if self.caption_transform:
            caption = self.caption_transform(caption)
        if self.target_transform:
            label = self.target_transform(label)

        # Encode with whichever processor belongs to the active backbone.
        if CFG.is_mclip:
            processed = tokenizer(caption, padding=True, return_tensors='pt')
            image = image_processor(image)
        elif CFG.use_multimodal:
            processed = mm_processor(text=caption, images=image, return_tensors="pt", padding=True, truncation=True)
            image = processed['pixel_values']
        elif CFG.use_dualencoder:
            processed = de_processor(text=caption, images=image, return_tensors="pt")
            image = processed['pixel_values']
        else:
            # Concatenation baseline: only the text is encoded; the image stays as loaded/resized.
            processed = tokenizer.encode_plus(
                caption,
                padding='longest',
                truncation=True,
                return_tensors='pt'
            )

        return identity, image, processed['input_ids'], processed['attention_mask'], label

In [12]:
class Collator(object):
    """Collate ``(id, image, seq, mask, label)`` items into one padded batch."""

    def __init__(self, test=False):
        self.test = test

    def __call__(self, batch):
        """Return ``(ids, images, seqs, masks, labels)`` for a list of dataset items.

        Token sequences and masks are right-padded with zeros to the longest
        item in the batch; images and labels are stacked. ``ids`` stays a tuple.
        """
        ids, images, seqs, masks, labels = zip(*batch)

        def drop_leading_dim(tensors):
            # Processors return tensors with a leading batch dim of 1; remove it per item.
            return [tensor.squeeze(dim=0) for tensor in tensors]

        labels = torch.stack(labels)
        padded_seqs = nn.utils.rnn.pad_sequence(drop_leading_dim(seqs), batch_first=True)
        padded_masks = nn.utils.rnn.pad_sequence(drop_leading_dim(masks), batch_first=True)
        stacked_images = torch.stack(drop_leading_dim(images))

        return ids, stacked_images, padded_seqs, padded_masks, labels

In [13]:
# Shared transform: resize to 224x224 with antialiasing.
resizer = Resize((224, 224), antialias=True)


def resize_images(img_tensor):
    """Return ``img_tensor`` resized by the shared ``resizer`` transform."""
    resized = resizer(img_tensor)
    return resized

# Dataset Initialisation

In [14]:
# Wrap each split in a VQADataset. All three read images from the same directory and
# pass every loaded image through resize_images before it is encoded.
train_dataset = VQADataset(train_df, CFG.images_base_path, img_transform=resize_images)
val_dataset = VQADataset(val_df, CFG.images_base_path, img_transform=resize_images)
test_dataset = VQADataset(test_df, CFG.images_base_path, img_transform=resize_images)

# Sanity check: number of examples per split.
print(len(train_dataset))
print(len(val_dataset))
print(len(test_dataset))

12231
1529
1532


# Model Architecture

In [15]:
class MultiModalArch(nn.Module):
    """MLP answer classifier on top of one of four pretrained backbones.

    The backbone is picked by flags checked in this order: ``is_mclip``,
    ``use_multimodal``, ``use_dualencoder``; if none is set, separate text and
    image encoders are used and their first-token states are concatenated.
    The backbone modules are the notebook-level globals (``text_model``,
    ``image_model``, ``mm_model``, ``de_model``), not freshly loaded copies.

    Args:
        hidden_size: Width of ``fc1`` and of every extra hidden layer.
        hidden_layers: Number of extra ``Linear`` layers between ``fc1`` and ``fc2``.
        dropout: Dropout probability applied after each activation.
        num_classes: Size of the output logits.
        use_multimodal, use_dualencoder, is_mclip: Backbone selection flags.
    """

    def __init__(self, hidden_size, hidden_layers, dropout, num_classes, use_multimodal=False, use_dualencoder=False, is_mclip=False):
        super().__init__()

        self.hidden_size = hidden_size
        self.hidden_layers = hidden_layers
        self.use_multimodal = use_multimodal
        self.use_dualencoder = use_dualencoder
        self.is_mclip = is_mclip
        self.is_vilt = 'ViltForMaskedLM' in CFG.multimodal_model_config.architectures

        # Each branch registers its backbone and records the width of the fused
        # feature vector that fc1 will consume.
        if self.is_mclip:
            self.text_model = text_model
            self.image_model = image_model
            fused_dim = 1280
        elif self.use_multimodal:
            self.mm_model = mm_model
            if self.is_vilt:
                fused_dim = CFG.multimodal_model_config.hidden_size
            else:
                fused_dim = 2 * CFG.multimodal_model_config.projection_dim
        elif self.use_dualencoder:
            self.de_model = de_model
            fused_dim = 2 * 512
        else:
            self.text_model = text_model
            self.image_model = image_model
            fused_dim = CFG.text_model_config.hidden_size + CFG.image_model_config.hidden_size

        self.fc1 = nn.Linear(fused_dim, self.hidden_size)
        self.hiddens = nn.ModuleList([nn.Linear(self.hidden_size, self.hidden_size) for _ in range(self.hidden_layers)])
        self.fc2 = nn.Linear(self.hidden_size, num_classes)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)

        if CFG.init_weights:
            for layer in (self.fc1, *self.hiddens, self.fc2):
                self._init_weights(layer)

    def _fuse(self, tokens, mask, image):
        """Run the active backbone and concatenate its embeddings along dim 1."""
        if self.is_mclip:
            emb_text = self.text_model.forward(tokens, mask)
            emb_img = self.image_model.encode_image(image)
            parts = [emb_text, emb_img]
        elif self.use_multimodal:
            mm_output = self.mm_model(input_ids=tokens, attention_mask=mask, pixel_values=image, output_hidden_states=True)
            # ViLT exposes one pooled joint vector; other joint models expose per-modality embeddings.
            if self.is_vilt:
                parts = [mm_output.pooler_output]
            else:
                parts = [mm_output.text_embeds, mm_output.image_embeds]
        elif self.use_dualencoder:
            de_output = self.de_model(input_ids=tokens, attention_mask=mask, pixel_values=image)
            parts = [de_output.text_embeds, de_output.image_embeds]
        else:
            # Position 0 of the last hidden state of each encoder.
            cls_text = self.text_model(tokens, attention_mask=mask).last_hidden_state[:, 0, :]
            cls_img = self.image_model(image).last_hidden_state[:, 0, :]
            parts = [cls_text, cls_img]
        return torch.cat(parts, dim=1)

    def forward(self, tokens, mask, image):
        """Return float32 class logits for a batch of (tokens, mask, image)."""
        x = self._fuse(tokens, mask, image)
        x = self.dropout(self.activation(self.fc1(x)))
        for hidden in self.hiddens:
            x = self.dropout(self.activation(hidden(x)))
        logits = self.fc2(x)
        return logits.float()

    def _init_weights(self, module):
        """Re-initialise ``module`` in place, using ``CFG.mlp_init_range`` as the normal std."""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=CFG.mlp_init_range)
            if isinstance(module, nn.Linear):
                if module.bias is not None:
                    module.bias.data.zero_()
            elif module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

# Utility Functions

In [16]:
class AverageMeter(object):
    """Tracks the latest value and the weighted running average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the running sum, the count and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: Start timestamp, as returned by ``time.time()``.
        percent: Fraction of the work completed so far; must be non-zero.

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed / fraction done.
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [17]:
def get_score(y_trues, y_preds):
    """Return the accuracy of ``y_preds`` against ``y_trues``."""
    return accuracy_score(y_trues, y_preds)

In [18]:
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy integers, floats and arrays to native Python values."""

    # (NumPy type, converter) pairs, checked in order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )

    def default(self, obj):
        """Convert supported NumPy objects; defer everything else to the base encoder."""
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

In [19]:
# ====================================================
# Logger File
# ====================================================

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

OUTPUT_DIR = "./"
def get_logger(filename=OUTPUT_DIR+'train'):
    """Return the module logger, writing bare messages to the console and to ``<filename>.log``.

    Any handlers already attached to the logger are removed and closed first.
    ``getLogger(__name__)`` always returns the same object, so without this
    step every re-run of the cell would add another pair of handlers and each
    message would be emitted multiple times.

    Args:
        filename: Log file path without the ``.log`` extension.

    Returns:
        The configured ``logging.Logger`` at INFO level.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers left over from a previous call (iterate over a copy while mutating).
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

# Notebook-wide logger; with the default argument it writes to ./train.log and the console.
LOGGER = get_logger()

# Train/Val Loops

In [20]:
def train_loop(model, optimizer, loss_fn, train_dataloader, epoch):
    """Train ``model`` for one pass over ``train_dataloader``.

    The forward pass runs under ``torch.cuda.amp.autocast`` when ``CFG.apex``
    is set; the loss is computed outside the autocast region. Gradients are
    clipped to ``CFG.mlp_grad_clip`` before each optimizer step.

    Args:
        model: Module called as ``model(seq, mask, image)``.
        optimizer: Optimizer over the model parameters.
        loss_fn: Callable ``(output, label) -> loss``.
        train_dataloader: Yields ``(ids, image, seq, mask, label)`` batches.
        epoch: Zero-based epoch index, used only in the progress message.

    Returns:
        The sample-weighted average training loss of the epoch.
    """
    model.train()
    loss_meter = AverageMeter()
    started_at = time.time()

    for step, batch in enumerate(tqdm(train_dataloader)):
        _, image, seq, mask, label = batch

        train_image = image.to(CFG.device)
        train_seq = seq.to(CFG.device)
        train_mask = mask.to(CFG.device)
        train_label = label.to(device=CFG.device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(train_seq, train_mask, train_image)
        loss = loss_fn(output, train_label)

        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), CFG.mlp_grad_clip)
        optimizer.step()

        # Weight the running average by the actual batch size (the last batch may be smaller).
        loss_meter.update(loss.item(), train_image.shape[0])

        if step % CFG.print_freq == 0 or step == (len(train_dataloader) - 1):
            total_steps = len(train_dataloader)
            print(f'Epoch: [{epoch + 1}][{step}/{total_steps}] '
                  f'Elapsed {timeSince(started_at, float(step + 1) / total_steps):s} '
                  f'Loss: {loss_meter.val:.4f} ({loss_meter.avg:.4f}) ')

        # Periodically release cached GPU memory and run the garbage collector.
        if step % 100 == 0:
            torch.cuda.empty_cache()
            gc.collect()

    return loss_meter.avg

In [21]:
def validation_loop(model, loss_fn, valid_dataloader, epoch):
    """Evaluate ``model`` on ``valid_dataloader`` without gradient tracking.

    Args:
        model: Module called as ``model(seq, mask, image)``.
        loss_fn: Callable ``(output, label) -> loss``.
        valid_dataloader: Yields ``(ids, image, seq, mask, label)`` batches.
        epoch: Zero-based epoch index, used only in the progress message.

    Returns:
        ``(avg_loss, ids, preds, labels)``: the sample-weighted average loss,
        the list of example ids, and integer NumPy arrays of the argmax
        predictions and the true labels, all in dataloader order.
    """
    model.eval()
    loss_meter = AverageMeter()
    started_at = time.time()

    sample_ids = []
    pred_batches = []
    label_batches = []

    for step, (identity, image, seq, mask, label) in enumerate(tqdm(valid_dataloader)):

        image = image.to(device=CFG.device)
        seq = seq.to(device=CFG.device)
        mask = mask.to(device=CFG.device)
        label = label.to(device=CFG.device)

        with torch.no_grad(), torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(seq, mask, image)

        loss = loss_fn(output, label)
        loss_meter.update(loss.item(), image.shape[0])

        sample_ids.extend(identity)
        label_batches.append(label)
        pred_batches.append(output.argmax(dim=1))

        if step % CFG.print_freq == 0 or step == (len(valid_dataloader) - 1):
            total_steps = len(valid_dataloader)
            print(f'Epoch: [{epoch + 1}][{step}/{total_steps}] '
                  f'Elapsed {timeSince(started_at, float(step + 1) / total_steps):s} '
                  f'Loss: {loss_meter.val:.4f} ({loss_meter.avg:.4f})')

        # Periodically release cached GPU memory and run the garbage collector.
        if step % 100 == 0:
            torch.cuda.empty_cache()
            gc.collect()

    # Per-batch tensors stay on the device until the end, then are joined and moved to NumPy once.
    preds_np = torch.cat(pred_batches, dim=0).cpu().numpy().astype(int)
    labels_np = torch.cat(label_batches, dim=0).cpu().numpy().astype(int)

    return loss_meter.avg, sample_ids, preds_np, labels_np

In [22]:
def save_predictions(model_name, ids, labels, preds, split='val'):
    """Write per-example predictions to a JSON file under ``/kaggle/working``.

    Class indices in ``labels`` and ``preds`` are mapped back to answer strings
    through the notebook-level ``all_labels`` list.

    Args:
        model_name: Prefix of the output file name.
        ids: Example identifiers.
        labels: True class indices, aligned with ``ids``.
        preds: Predicted class indices, aligned with ``ids``.
        split: Split name embedded in the file name.
    """
    entries = [
        {
            'identity': identity,
            'label': all_labels[label],
            'pred': all_labels[pred]
        }
        for identity, label, pred in zip(ids, labels, preds)
    ]

    output_path = f'/kaggle/working/{model_name}_{CFG.lang}_{split}_preds.json'
    with open(output_path, 'w') as fp:
        json.dump(entries, fp, cls=NpEncoder)

# Training and Validation

In [23]:
# One shared collator pads the token sequences and stacks images/labels for every split.
collate = Collator()
# Only the training loader shuffles; validation and test keep dataset order so that
# saved predictions line up with the example ids.
train_dataloader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, collate_fn=collate)
valid_dataloader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False, collate_fn=collate)
test_dataloader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, collate_fn=collate)

# Cross-entropy over the answer classes (the model returns raw logits).
loss_fn = nn.CrossEntropyLoss()

In [24]:
# Build the classifier around the backbone selected on CFG and move it to the target device.
model = MultiModalArch(
    hidden_size=CFG.mlp_hidden_size,
    hidden_layers=CFG.mlp_hidden_layers,
    dropout=CFG.mlp_dropout,
    num_classes=CFG.num_classes,
    use_multimodal=CFG.use_multimodal,
    use_dualencoder=CFG.use_dualencoder,
    is_mclip=CFG.is_mclip
).to(CFG.device)
# Every backbone except M-CLIP is wrapped in DataParallel; saved state_dict keys
# therefore carry the 'module.' prefix in those cases.
if not CFG.is_mclip:
    model = nn.DataParallel(model)

# weight_decay is passed explicitly so that CFG.weight_decay is actually honoured
# instead of silently falling back to the optimizer's built-in default.
optim = AdamW(
    model.parameters(),
    lr=CFG.learning_rate,
    eps=CFG.eps,
    betas=CFG.betas,
    weight_decay=CFG.weight_decay,
)

In [None]:
# Train for CFG.epochs epochs. After each epoch the model is validated; whenever the
# validation accuracy improves (and CFG.save_models is set) the checkpoint, the
# classification report and the validation/test predictions are written to disk.
best_score = float('-inf')

for epoch in range(CFG.epochs):

    start_time = time.time()

    # train function
    avg_train_loss = train_loop(model, optim, loss_fn, train_dataloader, epoch)

    # val function
    avg_val_loss, all_ids, all_preds_np, all_labels_np = validation_loop(model, loss_fn, valid_dataloader, epoch)

    score = get_score(all_labels_np, all_preds_np)

    # zero_division=0 yields the same numbers as the default but without an
    # UndefinedMetricWarning for every class that is never predicted.
    report = classification_report(all_labels_np, all_preds_np, digits=4, zero_division=0)

    elapsed = time.time() - start_time

    LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_train_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')

    if CFG.save_models and score > best_score:
        # The file name encodes backbone, validation accuracy and language, so every
        # improvement produces a new checkpoint file rather than overwriting the last one.
        model_name = CFG.multimodal_model_id if CFG.use_multimodal else '-'.join([CFG.text_model_id, CFG.image_model_id])
        model_name = model_name.replace('/', '-') + f'_score_{score:.4f}' + f'_{CFG.lang}'
        torch.save({'model': model.state_dict()}, f'{model_name}.pth')
        print(f'Saved model: {model_name}')
        with open(f'{model_name}_results.txt', 'w', encoding='utf-8') as fp:
            fp.write(report)
        best_score = score

        save_predictions(model_name, all_ids, all_labels_np, all_preds_np)

        # The test split is only evaluated for checkpoints that improved on validation.
        avg_test_loss, all_ids_test, all_preds_np_test, all_labels_np_test = validation_loop(model, loss_fn, test_dataloader, epoch)
        save_predictions(model_name, all_ids_test, all_labels_np_test, all_preds_np_test, split='test')

    torch.cuda.empty_cache()
    gc.collect()

  0%|          | 0/96 [00:00<?, ?it/s]

Epoch: [1][0/96] Elapsed 0m 8s (remain 12m 59s) Loss: 8.3639 (8.3639) 
Epoch: [1][50/96] Elapsed 3m 45s (remain 3m 18s) Loss: 6.8856 (7.6252) 
Epoch: [1][95/96] Elapsed 6m 56s (remain 0m 0s) Loss: 7.3370 (7.4193) 


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch: [1][0/12] Elapsed 0m 2s (remain 0m 28s) Loss: 4.0357 (4.0357)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Epoch 1 - avg_train_loss: 7.4193  avg_val_loss: 7.0895  time: 446s


Epoch: [1][11/12] Elapsed 0m 29s (remain 0m 0s) Loss: 7.9695 (7.0895)
Saved model: xlm-roberta-base-google-vit-base-patch16-224-in21k_score_0.0713_en


  0%|          | 0/12 [00:00<?, ?it/s]

Epoch: [1][0/12] Elapsed 0m 2s (remain 0m 32s) Loss: 6.7488 (6.7488)
Epoch: [1][11/12] Elapsed 0m 31s (remain 0m 0s) Loss: 8.0759 (7.6523)


  0%|          | 0/96 [00:00<?, ?it/s]

Epoch: [2][0/96] Elapsed 0m 4s (remain 6m 35s) Loss: 7.1166 (7.1166) 


In [None]:
# del model
# torch.cuda.empty_cache()
# gc.collect()

# Inference From Checkpoint

In [23]:
# inf_model_name = 'M-CLIP-XLM-Roberta-Large-Vit-B-16Plus-ViT-B-16-plus-240_score_0.0190'
# inf_model = MultiModalArch(
#     hidden_size=CFG.mlp_hidden_size,
#     hidden_layers=CFG.mlp_hidden_layers,
#     dropout=CFG.mlp_dropout,
#     num_classes=CFG.num_classes,
#     use_multimodal=CFG.use_multimodal,
#     use_dualencoder=CFG.use_dualencoder,
#     is_mclip=CFG.is_mclip
# ).to(CFG.device)
# if not CFG.is_mclip:
#     inf_model = nn.DataParallel(inf_model)
# inf_model.load_state_dict(torch.load('/kaggle/working/' + inf_model_name + f'_{CFG.lang}' + '.pth', map_location=torch.device(CFG.device))['model'])
# inf_model

MultiModalArch(
  (text_model): MultilingualCLIP(
    (transformer): XLMRobertaModel(
      (embeddings): XLMRobertaEmbeddings(
        (word_embeddings): Embedding(250002, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): XLMRobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x XLMRobertaLayer(
            (attention): XLMRobertaAttention(
              (self): XLMRobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): XLMRobertaSelfOutput(

In [25]:
# avg_val_loss, all_ids, all_preds_np, all_labels_np = validation_loop(inf_model, loss_fn, valid_dataloader, 0)
# if CFG.debug:
#     print(all_labels_np)
#     print(all_preds_np)
    
# score = get_score(all_labels_np, all_preds_np)

# report = classification_report(all_labels_np, all_preds_np, digits=4)
# print(report)

  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: [1][0/383] Elapsed 0m 2s (remain 15m 4s) Loss: 4.7221 (4.7221)
Epoch: [1][50/383] Elapsed 0m 11s (remain 1m 17s) Loss: 4.5757 (5.1672)
Epoch: [1][100/383] Elapsed 0m 21s (remain 1m 0s) Loss: 9.3990 (5.9564)
Epoch: [1][150/383] Elapsed 0m 35s (remain 0m 55s) Loss: 8.6295 (6.8906)
Epoch: [1][200/383] Elapsed 0m 44s (remain 0m 40s) Loss: 8.5118 (7.4440)
Epoch: [1][250/383] Elapsed 0m 57s (remain 0m 30s) Loss: 10.6316 (7.7586)
Epoch: [1][300/383] Elapsed 1m 7s (remain 0m 18s) Loss: 4.1957 (7.9083)
Epoch: [1][350/383] Elapsed 1m 18s (remain 0m 7s) Loss: 9.2648 (7.7561)
Epoch: [1][382/383] Elapsed 1m 23s (remain 0m 0s) Loss: 6.7356 (7.8342)
[2661 5640 2099 ... 1978  867 4205]
[2211 2211 2211 ... 2211 2211 2211]
              precision    recall  f1-score   support

           7     0.0000    0.0000    0.0000         1
          17     0.0000    0.0000    0.0000         1
          20     0.0000    0.0000    0.0000         1
          22     0.0000    0.0000    0.0000         1
       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
# entries = []
# for identity, label, pred in zip(all_ids, all_labels_np, all_preds_np):
#     entry = {
#         'identity': identity,
#         'label': all_labels[label],
#         'pred': all_labels[pred]
#     }
#     entries.append(entry)

# with open(f'/kaggle/working/{inf_model_name}_{CFG.lang}_val_preds.json', 'w') as fp:
#     json.dump(entries, fp, cls=NpEncoder)

In [27]:
# avg_test_loss, all_ids_test, all_preds_np_test, all_labels_np_test = validation_loop(inf_model, loss_fn, test_dataloader, 0)
# if CFG.debug:
#     print(all_labels_np_test)
#     print(all_preds_np_test)
    
# score_test = get_score(all_labels_np_test, all_preds_np_test)

# report_test = classification_report(all_labels_np_test, all_preds_np_test, digits=4)
# print(report_test)

  0%|          | 0/383 [00:00<?, ?it/s]

Epoch: [1][0/383] Elapsed 0m 0s (remain 1m 10s) Loss: 4.5519 (4.5519)
Epoch: [1][50/383] Elapsed 0m 9s (remain 1m 4s) Loss: 5.7966 (5.1975)
Epoch: [1][100/383] Elapsed 0m 21s (remain 1m 0s) Loss: 10.3884 (6.0287)
Epoch: [1][150/383] Elapsed 0m 31s (remain 0m 48s) Loss: 8.3530 (6.9530)
Epoch: [1][200/383] Elapsed 0m 44s (remain 0m 40s) Loss: 9.4853 (7.4494)
Epoch: [1][250/383] Elapsed 0m 57s (remain 0m 30s) Loss: 9.6440 (7.8055)
Epoch: [1][300/383] Elapsed 1m 14s (remain 0m 20s) Loss: 5.8731 (7.9028)
Epoch: [1][350/383] Elapsed 1m 27s (remain 0m 7s) Loss: 7.4225 (7.7173)
Epoch: [1][382/383] Elapsed 1m 36s (remain 0m 0s) Loss: 7.6207 (7.8045)
[2211 5640 2189 ... 4198 4199 3208]
[2211 2211 2211 ... 2211 2211 2211]
              precision    recall  f1-score   support

          14     0.0000    0.0000    0.0000         1
          23     0.0000    0.0000    0.0000         1
          32     0.0000    0.0000    0.0000         1
          34     0.0000    0.0000    0.0000         1
        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
# entries_test = []
# for identity, label, pred in zip(all_ids_test, all_labels_np_test, all_preds_np_test):
#     entry = {
#         'identity': identity,
#         'label': all_labels[label],
#         'pred': all_labels[pred]
#     }
#     entries_test.append(entry)

# with open(f'/kaggle/working/{inf_model_name}_{CFG.lang}_test_preds.json', 'w') as fp:
#     json.dump(entries_test, fp, cls=NpEncoder)