# Imports

In [1]:
# !pip install transformers==4.40.1 -q
!pip install git+https://github.com/csebuetnlp/normalizer -q

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from transformers import AutoImageProcessor, AutoProcessor, AutoModel, AutoConfig, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from PIL import Image
from tqdm.notebook import tqdm
import itertools
import pandas as pd
from typing import Any, Optional
from normalizer import normalize
from sklearn.metrics import accuracy_score
import numpy as np
import gc

2024-06-09 14:15:47.396546: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 14:15:47.396657: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 14:15:47.555005: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Model selection

In [3]:
models = {
    "BanglaBert" : "csebuetnlp/banglabert",
    "BanglaBertLarge" : "csebuetnlp/banglabert_large",
    "BanglishBert" : "csebuetnlp/banglishbert",
    "XlmRoberta" : "FacebookAI/xlm-roberta-base",
    "mDeberta" : "microsoft/mdeberta-v3-base", 
    "SahajBert" : "neuropark/sahajBERT",
    "mBERT" : "google-bert/bert-base-multilingual-cased",
    
    "Deberta" : "microsoft/deberta-v3-base",
    "electra" : "google/electra-base-discriminator",
    "Bert" : "google-bert/bert-base-uncased"
    
}

# Load data

In [4]:
train = pd.read_csv("/kaggle/input/vqa-bangla/updated_train.csv")
valid = pd.read_csv("/kaggle/input/vqa-bangla/updated_valid.csv")
test = pd.read_csv("/kaggle/input/vqa-bangla/updated_test.csv")

# train.dropna(inplace=True), valid.dropna(inplace=True), test.dropna(inplace=True)
train.head()

Unnamed: 0,image_name,Captions,Question,Answer,Category,Question_en,Answer_en,Captions_en,Answer_fixed
0,bnature_663.jpg,খালের পানিতে তিনটি গাছের প্রতিচ্ছবি সাথে গৌধোল...,ছবিতে কতগুলো গাছের প্রতিচ্ছবি দেখা যাচ্ছে?,তিনটি,numeric,How many trees are reflected in the picture?,three,Goudholi's beauty with three trees reflected i...,তিন
1,chitron_5113.png,অনেকগুলো মানুষ বসে আছে। মঞ্চের উপর কয়েকজন মানু...,ছবিতে কতজন মানুষ মঞ্চের উপর দাঁড়িয়ে আছে?,পাঁচজন,numeric,How many people are on the stage?,five,"A lot of people were sitting, a few people wer...",পাঁচ
2,bnature_876.jpg,দুজন ছেলে ও দুজন মেয়ে রাস্তা দিয়ে পাশাপাশি হ...,ছবিতে কতজন ছেলে ও মেয়ে একসাথে হাটছে?,চারজন,numeric,How many boys and girls are walking together i...,four,Two boys and two girls walking side by side on...,চার
3,bnature_1007.jpg,"রাস্তা দিয়ে কয়েকজন ছাত্র ছাত্রী যাচ্ছে, যাদে...",ছবিতে কতজন ছাত্র ছাত্রী রাস্তা দিয়ে হাঁটছে?,৪ জন,numeric,How many students are walking on the street in...,four,"Several students walking on the street, carryi...",চার
4,chitron_7446.png,'১ ইট তালগাছ ১ টি খেজুর গাছ এবং রাস্তা দিয়ে ছা...,ছবিতে কতগুলো গাছ দেখা যাচ্ছে?,২ টি,numeric,How many trees are shown in the picture?,Two,1 brick palm tree 1 date tree and 4 school stu...,দুই


In [5]:
concat = pd.concat([train, valid, test])

uniques = set(concat['Answer_fixed'].unique())
labels = list(uniques)

labels = [normalize(str(l)) for l in labels]

train.shape, valid.shape, test.shape, len(labels)

((12231, 9), (1529, 9), (1532, 9), 5542)

# Config

In [6]:
class Configs:
    model = models['XlmRoberta']
    epochs= 15
    num_workers=4 
    learning_rate=1e-5 
    batch_size=64 # batch size
    max_len= 512  # df.top_captions.map(len).max()    
    weight_decay=0.01 # for adam optimizer regulaization parameter
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42 # seed no. for random initialization 
    train=True
    num_class = len(labels) # Number of class in your dataset
    image_path = "/kaggle/input/vqa-bangla/Bangla_VQA/images"
    tokenizer = None # AutoTokenizer.from_pretrained(self.model_name)

cfg = Configs()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using accelerator device: {device}')

Using accelerator device: cuda


In [7]:
AutoConfig.from_pretrained(cfg.model)

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

XLMRobertaConfig {
  "_name_or_path": "FacebookAI/xlm-roberta-base",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.39.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

# Model

In [8]:
class TextModel(nn.Module):
    def __init__(self,  model_name, num_classes=10):
        super().__init__()
        self.model_config = AutoConfig.from_pretrained(model_name)
        self.pretrained_model = AutoModel.from_pretrained(model_name)  #efficientNet.config.hidden_dim 
        
        self.mlp_input = (
            self.model_config.hidden_size if hasattr(self.model_config, 'hidden_size') else
            self.model_config.hidden_sizes[-1] if hasattr(self.model_config, 'hidden_sizes') else
            self.model_config.hidden_dim if hasattr(self.model_config, 'hidden_dim') else
            None
        )
        self.fc1 = nn.Linear(in_features=self.mlp_input, out_features=1024)
        self.fc2 = nn.Linear(in_features=1024, out_features=num_classes)
        self.relu = nn.ReLU()  
        

    def forward(self, **kwargs):
        outs = self.pretrained_model(**kwargs)#.pooler_output
        
        outs = (outs.pooler_output if hasattr(outs, 'pooler_output') else
        outs.last_hidden_state[:,0,:] if hasattr(outs, 'last_hidden_state') else None)

        x = self.fc1(outs)
        x = self.relu(x) 
        x = self.fc2(x)
        
        return x


# Model Initialization
model = TextModel(cfg.model,cfg.num_class)
model = nn.DataParallel(model)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

# Custom dataset

In [9]:
def custom_collate_fn(batch):
    
    sentences = [item['sentence'] for item in batch]
    image_ids = [item['image_id'] for item in batch]
    input_ids = [item['input_ids'].squeeze(0) for item in batch]
    attention_masks = [item['attention_mask'].squeeze(0) for item in batch]
    
    labels = [item['labels'] for item in batch]
    
    input_ids_padded = torch.nn.utils.rnn.pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks_padded = torch.nn.utils.rnn.pad_sequence(attention_masks, batch_first=True, padding_value=0)
    
    labels = torch.stack(labels)
    
    return {
        "image_id" : image_ids,
        "sentences": sentences,
        "input_ids": input_ids_padded,
        "attention_mask": attention_masks_padded,
        "labels": labels
    }


# Tokenizer Initialization
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.model)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [10]:
class CustomDataset(Dataset):
    def __init__(self, data, configs, labels) -> None:
        super().__init__()
        self.df = data
        self.configs = configs
        self.classes = labels        
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        image_id = row['image_name']
        caption = row['Captions']
        ques = row['Question']
        sentence = f"Question: {ques} [SEP] Caption: {caption}"
        
        inputs = self.configs.tokenizer.encode_plus(
            sentence, 
            add_special_tokens=True, 
            max_length=self.configs.max_len,
            return_tensors="pt", 
            truncation=True
        )
        
        label = torch.tensor(self.classes.index(normalize(str(self.df.loc[index, "Answer_fixed"]))), dtype=torch.long)

        return {
            "image_id" : image_id,
            "sentence" : sentence,
            "input_ids" : inputs['input_ids'],
            "attention_mask" : inputs['attention_mask'],
            "labels": label
        }

In [11]:
# train_dataset, valid_dataset = train_test_split(df, test_size = 0.1)
# train_dataset.reset_index(drop=True, inplace=True)
# valid_dataset.reset_index(drop=True, inplace=True)
# train = train.iloc[0:128]

In [12]:
train_dataset = CustomDataset(data=train, configs=cfg, labels=labels)
valid_dataset = CustomDataset(data=valid, configs=cfg, labels=labels)
test_dataset = CustomDataset(data=test, configs=cfg, labels=labels)

In [13]:
train_loader = DataLoader(dataset=train_dataset, batch_size=cfg.batch_size, collate_fn=custom_collate_fn)
val_loader = DataLoader(dataset=valid_dataset, batch_size=cfg.batch_size, collate_fn=custom_collate_fn)
test_loader = DataLoader(dataset=test_dataset, batch_size=cfg.batch_size, collate_fn=custom_collate_fn)

In [14]:
for batch in train_loader:
    print(batch['input_ids'].shape, batch['image_id'][0])
    break

torch.Size([64, 119]) bnature_663.jpg


# Training loop

In [15]:
best_loss = 1e10
best_acc = float('-inf')

In [16]:
# val_loader = train_loader
# test_loader = train_loader

In [17]:
for epoch in tqdm(range(cfg.epochs)):
    loss = 0.0 
    
    for step, batch in enumerate(tqdm(train_loader)):
        label = batch['labels'].to(device)
        
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        inputs_dict = {'input_ids': inputs, 'attention_mask': attention_mask}
            
        outputs = model(**inputs_dict)
        
        
        loss = criterion(outputs, label)
        loss += loss.item()
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        if step % 10 ==0:
            torch.cuda.empty_cache()
            gc.collect()
    
    
    loss /= len(train_loader)
    print(f"train: Epoch {epoch+1}/{cfg.epochs}, Loss: {loss}")
    
    val_loss = 0.0
    acc = 0.0
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader):
            label = batch['labels'].to(device)
            
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            inputs_dict = {'input_ids': inputs, 'attention_mask': attention_mask}

            outputs = model(**inputs_dict)
            
            val_loss = criterion(outputs, label)
            val_loss += val_loss.item()
            
            _, predicted = torch.max(outputs, 1)
            true_labels.extend(label.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())           
            
        true_labels = np.array(true_labels)
        predicted_labels = np.array(predicted_labels)

        acc = (true_labels == predicted_labels).mean()

            
        val_loss /= len(val_loader)
        print(f"valid: Epoch {epoch+1}/{cfg.epochs}, Loss: {val_loss}, Accuracy: {acc}")
        
    if best_acc < acc:
        best_acc = acc
        print("saving best model...")
        torch.save({
            'model_state_dict': model.module.state_dict(),
            }, '/kaggle/working/best_model.pth')    

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 1/15, Loss: 0.08918478339910507


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 1/15, Loss: 0.717708945274353, Accuracy: 0.061478090255068674
saving best model...


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 2/15, Loss: 0.08729030191898346


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 2/15, Loss: 0.7178754806518555, Accuracy: 0.07128842380640942
saving best model...


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 3/15, Loss: 0.084256611764431


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 3/15, Loss: 0.7253448963165283, Accuracy: 0.07128842380640942


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 4/15, Loss: 0.08209040760993958


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 4/15, Loss: 0.7337872982025146, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 5/15, Loss: 0.0804724171757698


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 5/15, Loss: 0.7383251190185547, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 6/15, Loss: 0.07917841523885727


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 6/15, Loss: 0.7455781698226929, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 7/15, Loss: 0.07800829410552979


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 7/15, Loss: 0.7689099311828613, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 8/15, Loss: 0.0771900862455368


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 8/15, Loss: 0.7881530523300171, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 9/15, Loss: 0.07659856975078583


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 9/15, Loss: 0.793383777141571, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 10/15, Loss: 0.07616899907588959


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 10/15, Loss: 0.798769474029541, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 11/15, Loss: 0.07575637847185135


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 11/15, Loss: 0.8039659261703491, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 12/15, Loss: 0.075367271900177


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 12/15, Loss: 0.8084203600883484, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 13/15, Loss: 0.07498105615377426


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 13/15, Loss: 0.8128153085708618, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 14/15, Loss: 0.07465918362140656


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 14/15, Loss: 0.8170510530471802, Accuracy: 0.015696533682145193


  0%|          | 0/192 [00:00<?, ?it/s]

train: Epoch 15/15, Loss: 0.07436978816986084


  0%|          | 0/24 [00:00<?, ?it/s]

valid: Epoch 15/15, Loss: 0.8211307525634766, Accuracy: 0.015696533682145193


# inference

In [18]:
torch.cuda.empty_cache()
gc.collect()

54

In [19]:
def accuracy_check(best_model, loader, df, txt):
    
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for batch in tqdm(loader):
            image_id = batch['image_id']
            label = batch['labels'].to(device)

            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            inputs_dict = {'input_ids': inputs, 'attention_mask': attention_mask}

            outputs = best_model(**inputs_dict)

            _, predicted = torch.max(outputs, 1) # accuracy purpose
            true_labels.extend(label.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

            labels_batch  = [labels[l] for l in label] # concat to the dataframe
            predict_batch = [labels[o.argmax()] for o in outputs]
            new_rows = {'image_id': image_id, 'original': labels_batch, 'predicted': predict_batch}
            new_batch = pd.DataFrame(data=new_rows)

            df = pd.concat([df, new_batch])

    true_labels = np.array(true_labels)
    predicted_labels = np.array(predicted_labels)


    accuracy = (true_labels == predicted_labels).sum()/len(true_labels) #.mean()
    print(f"{txt} set accuracy: ", accuracy)
    
    return df, accuracy

In [20]:
best_model =  TextModel(cfg.model,cfg.num_class)
checkpoint = torch.load('/kaggle/working/best_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
best_model.load_state_dict(checkpoint['model_state_dict'])

best_model = best_model.to(device)

In [21]:
testdf = pd.DataFrame(columns= ['image_id', 'original', 'predicted'])

testdf, test_acc = accuracy_check(best_model, test_loader, testdf, "test")
testdf.head()

  0%|          | 0/24 [00:00<?, ?it/s]

test set accuracy:  0.06657963446475196


Unnamed: 0,image_id,original,predicted
0,chitron_924.png,দুই,দুই
1,chitron_1150.png,দুই,দুই
2,chitron_3534.png,দুই,দুই
3,chitron_1410.png,তিন,দুই
4,chitron_8886.png,দুই,দুই


In [22]:
validdf = pd.DataFrame(columns= ['image_id', 'original', 'predicted'])

validdf, valid_acc = accuracy_check(best_model, val_loader, validdf, "valid")
validdf.head()

  0%|          | 0/24 [00:00<?, ?it/s]

valid set accuracy:  0.07128842380640942


Unnamed: 0,image_id,original,predicted
0,chitron_7881.png,পাঁচ,দুই
1,chitron_5952.png,দুই,দুই
2,chitron_1272.png,তিন,দুই
3,chitron_3587.png,আট,দুই
4,chitron_3106.png,দুই,দুই


## Bert score of lables

In [23]:
bertmodel = AutoModel.from_pretrained("csebuetnlp/banglabert")
berttokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

cos = nn.CosineSimilarity(dim=1, eps=1e-6)

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [24]:
def bertScorer(df):
    sim_list = []
    for index, row in df.iterrows():
        
        original = berttokenizer.encode_plus(normalize(str(row['original'])), return_tensors="pt")
        preds = berttokenizer.encode_plus(normalize(str(row['predicted'])), return_tensors="pt")
        
        with torch.no_grad():
            d1 = bertmodel(original['input_ids'],attention_mask= original['attention_mask'])['last_hidden_state'][:,0,:]
            d2 = bertmodel(preds['input_ids'],attention_mask= preds['attention_mask'])['last_hidden_state'][:,0,:]
        
        sim_list.append(cos(d1, d2).item())
        
    return sim_list

In [25]:
similarity_list = bertScorer(testdf)
len(similarity_list)

1532

In [26]:
testdf['bert_sim_score'] = similarity_list
testdf.head()

Unnamed: 0,image_id,original,predicted,bert_sim_score
0,chitron_924.png,দুই,দুই,1.0
1,chitron_1150.png,দুই,দুই,1.0
2,chitron_3534.png,দুই,দুই,1.0
3,chitron_1410.png,তিন,দুই,0.988962
4,chitron_8886.png,দুই,দুই,1.0


In [27]:
testdf.to_csv("/kaggle/working/prediction_on_test.csv", index=False)

In [28]:
similarity_list_val = bertScorer(validdf)
len(similarity_list_val)

1529

In [29]:
validdf['bert_sim_score'] = similarity_list_val
validdf.head()

Unnamed: 0,image_id,original,predicted,bert_sim_score
0,chitron_7881.png,পাঁচ,দুই,0.962783
1,chitron_5952.png,দুই,দুই,1.0
2,chitron_1272.png,তিন,দুই,0.988962
3,chitron_3587.png,আট,দুই,0.963852
4,chitron_3106.png,দুই,দুই,1.0


In [30]:
validdf.to_csv("/kaggle/working/prediction_on_valid.csv", index=False)

In [31]:
with open("/kaggle/working/acc.txt", "w") as file:
    file.write(f"Valid accuracy: {valid_acc}\n")    
    file.write(f"Test accuracy: {test_acc}\n") 
    file.write(f"Valid BertScore: {validdf['bert_sim_score'].mean()*100:.3f}\n")    
    file.write(f"Test BertScore: {testdf['bert_sim_score'].mean()*100:.3f}\n")    