# Imports

In [1]:
!pip install transformers==4.40.1 -q
!pip install git+https://github.com/csebuetnlp/normalizer -q

In [2]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
from torchvision.io import read_image, ImageReadMode
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import Resize
from transformers import AutoProcessor, AutoModel, AutoConfig, AutoTokenizer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from PIL import Image
from tqdm.notebook import tqdm
import itertools
from normalizer import normalize
import pandas as pd
from typing import Any, Optional
import gc
import numpy as np

import transformers as hf
hf.utils.logging.set_verbosity_error()
print(hf.__version__)

2024-06-02 07:19:28.600074: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-02 07:19:28.600173: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-02 07:19:28.751254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


4.40.1


# Model Selection

In [3]:
models = {
    "ConvNext" : "facebook/convnext-tiny-224",
    "EfficientNetb0" : "google/efficientnet-b0", 
    "EfficientNetb4" : "google/efficientnet-b4",
    "ResNet" : "microsoft/resnet-50", 
    "ViT" : "google/vit-base-patch16-224-in21k", 
    "DeiT" : "facebook/deit-base-patch16-384",
    "BeiT" : "microsoft/beit-large-patch16-384"
}

# Load data

In [4]:
train = pd.read_csv("/kaggle/input/vqa-bangla/Bangla_VQA/train.csv")
valid = pd.read_csv("/kaggle/input/vqa-bangla/Bangla_VQA/valid.csv")
test = pd.read_csv("/kaggle/input/vqa-bangla/Bangla_VQA/test.csv")

train.head()

Unnamed: 0,image_name,Captions,Question,Answer,Category,Question_en
0,bnature_663.jpg,খালের পানিতে তিনটি গাছের প্রতিচ্ছবি সাথে গৌধোল...,ছবিতে কতগুলো গাছের প্রতিচ্ছবি দেখা যাচ্ছে?,তিনটি,numeric,How many trees are reflected in the picture?
1,chitron_5113.png,অনেকগুলো মানুষ বসে আছে। মঞ্চের উপর কয়েকজন মানু...,ছবিতে কতজন মানুষ মঞ্চের উপর দাঁড়িয়ে আছে?,পাঁচজন,numeric,How many people are on the stage?
2,bnature_876.jpg,দুজন ছেলে ও দুজন মেয়ে রাস্তা দিয়ে পাশাপাশি হ...,ছবিতে কতজন ছেলে ও মেয়ে একসাথে হাটছে?,চারজন,numeric,How many boys and girls are walking together i...
3,bnature_1007.jpg,"রাস্তা দিয়ে কয়েকজন ছাত্র ছাত্রী যাচ্ছে, যাদে...",ছবিতে কতজন ছাত্র ছাত্রী রাস্তা দিয়ে হাঁটছে?,৪ জন,numeric,How many students are walking on the street in...
4,chitron_7446.png,'১ ইট তালগাছ ১ টি খেজুর গাছ এবং রাস্তা দিয়ে ছ...,ছবিতে কতগুলো গাছ দেখা যাচ্ছে?,২ টি,numeric,How many trees are shown in the picture?


In [5]:
concat = pd.concat([train, valid, test])

uniques = set(concat['Answer'].unique())
labels = list(uniques)

labels = [normalize(str(l)) for l in labels]

train.shape, valid.shape, test.shape, len(labels)

((12231, 6), (1529, 6), (1532, 6), 5699)

# Config class

In [6]:
class Configs:
    model = models['ConvNext']
    epochs= 20
    num_workers=4
    learning_rate=2e-5 
    batch_size=128 # batch size
    max_len= 512  # df.top_captions.map(len).max()    
    weight_decay=0.01 # for adam optimizer regulaization parameter
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42 # seed no. for random initialization 
    train=True
    num_class = len(labels) # Number of class in your dataset
    image_path = "/kaggle/input/vqa-bangla/Bangla_VQA/images"
    processor = None
    

cfg = Configs()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Using device: {device}')

Using device: cuda


In [7]:
config = AutoConfig.from_pretrained(cfg.model)
# cfg.model_configs
# cfg.mlp_input, config
# config.hidden_size

config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

# Model

In [8]:
class VisModel(nn.Module):
    def __init__(self,  configs):
        super().__init__()
        self.model_name = configs.model
        self.model_config = AutoConfig.from_pretrained(configs.model)
        self.pretrained_model = AutoModel.from_pretrained(configs.model) 
        
#         for param in self.pretrained_model.parameters():
#             param.requires_grad = False

        self.mlp_input = (
            self.model_config.hidden_size if hasattr(self.model_config, 'hidden_size') else
            self.model_config.hidden_sizes[-1] if hasattr(self.model_config, 'hidden_sizes') else
            self.model_config.hidden_dim if hasattr(self.model_config, 'hidden_dim') else
            None
        )
        self.fc1 = nn.Linear(in_features=self.mlp_input, out_features=1024)
        self.relu = nn.ReLU()  
        self.fc2 = nn.Linear(in_features=1024, out_features=configs.num_class)
        
    def forward(self, **kwargs):
        embeds = self.pretrained_model(**kwargs)

        if self.model_name == models['ResNet']:
            embeds['pooler_output'] = embeds['pooler_output'].squeeze(2,3)
            
        outs = embeds.pooler_output
        
        x = self.fc1(outs)
        x = self.relu(x) 
        x = self.fc2(x)
        return x


model = VisModel(cfg)
model = nn.DataParallel(model)
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=cfg.learning_rate)
criterion = torch.nn.CrossEntropyLoss()

pytorch_model.bin:   0%|          | 0.00/114M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


# Custom dataset

In [9]:
cfg.processor = AutoProcessor.from_pretrained(cfg.model)

preprocessor_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

In [10]:
class CustomDataset(Dataset):
    def __init__(self, data, configs, labels, transform) -> None:
        super().__init__()
        self.df = data
        self.processor = configs.processor
        self.classes = labels
        self.image_path = configs.image_path
        self.image_transform = transform
        
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        
        label = torch.tensor(self.classes.index(normalize(self.df.loc[index, "Answer"])), dtype=torch.long)
        image_id = row['image_name']
        
        image = read_image(f"{self.image_path}/{image_id}", mode=ImageReadMode.RGB)
        image = self.image_transform(image)
        
        image = self.processor(image, return_tensors="pt")

        return {
            "image_id" : image_id,
            "pixel_values" : image['pixel_values'].squeeze(),
            "labels": label
         }

In [11]:
resizer = transforms.Resize((224, 224), antialias=True)

def resize_images(img_tensor):
    return resizer(img_tensor)

In [12]:
train_dataset = CustomDataset(data=train, configs=cfg, labels=labels, transform=resizer)
valid_dataset = CustomDataset(data=valid, configs=cfg, labels=labels, transform=resizer)
test_dataset = CustomDataset(data=test, configs=cfg, labels=labels, transform=resizer)

In [13]:
train_loader = DataLoader(dataset=train_dataset, batch_size=cfg.batch_size, shuffle=True) #, num_workers=cfg.num_workers)
val_loader = DataLoader(dataset=valid_dataset, batch_size=cfg.batch_size, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=cfg.batch_size, shuffle=False)

In [14]:
%%time
for batch in train_loader:
    print(batch['pixel_values'].shape, batch['image_id'][0])
    break

torch.Size([128, 3, 224, 224]) chitron_5358.png
CPU times: user 4.1 s, sys: 102 ms, total: 4.2 s
Wall time: 2.83 s


# Training loop

In [15]:
best_loss = 1e10
best_acc = float('-inf')

for epoch in tqdm(range(cfg.epochs)):
    loss = 0.0 
    
    model.train()
    for step, batch in enumerate(tqdm(train_loader)):
        label = batch['labels'].to(device)
        
        inputs = batch['pixel_values'].to(device)
        inputs_dict = {'pixel_values': inputs}
        
        outputs = model(**inputs_dict)
                
        loss = criterion(outputs, label)
        loss += loss.item()
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
    
    if step % 100 ==0:
            torch.cuda.empty_cache()
            gc.collect()
            
    loss /= len(train_loader)
    print(f"train: Epoch {epoch+1}/{cfg.epochs}, Loss: {loss}")
    
    
    model.eval()
    val_loss = 0.0
    acc = 0.0
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for batch in tqdm(val_loader):
            label = batch['labels'].to(device)
            
            inputs = batch['pixel_values'].to(device)
            inputs_dict = {'pixel_values': inputs}
            
            outputs = model(**inputs_dict)
            
            val_loss = criterion(outputs, label)
            val_loss += val_loss.item()
            
            _, predicted = torch.max(outputs, 1)
            true_labels.extend(label.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())
    
#             print([labels[l] for l in label])
#             print([labels[p] for p in predicted])
            
        true_labels = np.array(true_labels)
        predicted_labels = np.array(predicted_labels)

        acc = (true_labels == predicted_labels).mean()

            
        val_loss /= len(val_loader)
        print(f"valid: Epoch {epoch+1}/{cfg.epochs}, Loss: {val_loss} \nAccuracy: {acc}")
        
    if best_acc < acc:
        best_acc = acc
        print("saving best model...")
        torch.save({
            'model_state_dict': model.module.state_dict(),
            'cfg': cfg
            }, '/kaggle/working/best_model.pth')

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 1/20, Loss: 0.1771041601896286


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 1/20, Loss: 1.4327898025512695 
Accuracy: 0.02354480052321779
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 2/20, Loss: 0.15741480886936188


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 2/20, Loss: 1.4437825679779053 
Accuracy: 0.020928711576193592


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 3/20, Loss: 0.14568324387073517


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 3/20, Loss: 1.4556951522827148 
Accuracy: 0.029431000654022238
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 4/20, Loss: 0.14959388971328735


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 4/20, Loss: 1.4699152708053589 
Accuracy: 0.030085022890778287
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 5/20, Loss: 0.15100476145744324


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 5/20, Loss: 1.4869239330291748 
Accuracy: 0.03662524525833878
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 6/20, Loss: 0.13555476069450378


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 6/20, Loss: 1.4935548305511475 
Accuracy: 0.03531720078482668


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 7/20, Loss: 0.14709477126598358


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 7/20, Loss: 1.494340419769287 
Accuracy: 0.041203400915631135
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 8/20, Loss: 0.12499725073575974


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 8/20, Loss: 1.5214426517486572 
Accuracy: 0.04316546762589928
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 9/20, Loss: 0.14014869928359985


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 9/20, Loss: 1.508690595626831 
Accuracy: 0.051013734466971876
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 10/20, Loss: 0.14026525616645813


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 10/20, Loss: 1.5319671630859375 
Accuracy: 0.051667756703727925
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 11/20, Loss: 0.13504451513290405


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 11/20, Loss: 1.5350755453109741 
Accuracy: 0.052321778940483975
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 12/20, Loss: 0.12643754482269287


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 12/20, Loss: 1.5300135612487793 
Accuracy: 0.05428384565075212
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 13/20, Loss: 0.11761732399463654


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 13/20, Loss: 1.550698161125183 
Accuracy: 0.052321778940483975


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 14/20, Loss: 0.12042760103940964


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 14/20, Loss: 1.5852670669555664 
Accuracy: 0.050359712230215826


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 15/20, Loss: 0.12371836602687836


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 15/20, Loss: 1.585686445236206 
Accuracy: 0.052975801177240024


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 16/20, Loss: 0.10369236767292023


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 16/20, Loss: 1.6092742681503296 
Accuracy: 0.05493786788750817
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 17/20, Loss: 0.10970219224691391


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 17/20, Loss: 1.6108354330062866 
Accuracy: 0.05428384565075212


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 18/20, Loss: 0.10712051391601562


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 18/20, Loss: 1.6443538665771484 
Accuracy: 0.053629823413996074


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 19/20, Loss: 0.10685669630765915


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 19/20, Loss: 1.6654841899871826 
Accuracy: 0.058207979071288427
saving best model...


  0%|          | 0/96 [00:00<?, ?it/s]

train: Epoch 20/20, Loss: 0.10443361848592758


  0%|          | 0/12 [00:00<?, ?it/s]

valid: Epoch 20/20, Loss: 1.7015018463134766 
Accuracy: 0.060824068018312624
saving best model...


In [16]:
# outputs

In [17]:
torch.cuda.empty_cache()
gc.collect()


126

## inference

In [18]:
def accuracy_check(best_model, loader, df, txt):
    
    true_labels = []
    predicted_labels = []
    
    with torch.no_grad():
        for batch in tqdm(loader):
            image_id = batch['image_id']
            label = batch['labels'].to(device)
            
            inputs = batch['pixel_values'].to(device)
            inputs_dict = {'pixel_values': inputs}
            
            outputs = best_model(**inputs_dict)

            _, predicted = torch.max(outputs, 1) # accuracy purpose
            true_labels.extend(label.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

            labels_batch  = [labels[l] for l in label] # concat to the dataframe
            predict_batch = [labels[p] for p in predicted]
            
            new_rows = {'image_id': image_id, 'original': labels_batch, 'predicted': predict_batch}
            new_batch = pd.DataFrame(data=new_rows)

            df = pd.concat([df, new_batch])

    true_labels = np.array(true_labels)
    predicted_labels = np.array(predicted_labels)


    accuracy = (true_labels == predicted_labels).sum()/len(true_labels) #.mean()
    print(f"{txt} set accuracy: ", accuracy)
    
    return df, accuracy

In [19]:
# load the best model
checkpoint = torch.load('/kaggle/working/best_model.pth', map_location=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))
cfg = checkpoint['cfg']
best_model = VisModel(cfg)
best_model.load_state_dict(checkpoint['model_state_dict'])

best_model = best_model.to(device)

In [20]:
testdf = pd.DataFrame(columns= ['image_id', 'original', 'predicted'])

testdf, test_acc = accuracy_check(best_model, test_loader, testdf, "test")

  0%|          | 0/12 [00:00<?, ?it/s]

test set accuracy:  0.06984334203655353


In [21]:
print(testdf['predicted'].value_counts())
testdf.head()

predicted
শাড়ি    155
সাদা     104
লাল       99
তিনজন     96
২ জন      92
        ... 
পতাকা      1
জাল        1
রাতে       1
রিকশা      1
ফুল        1
Name: count, Length: 74, dtype: int64


Unnamed: 0,image_id,original,predicted
0,chitron_924.png,দুইজন,২ টি
1,chitron_1150.png,২ জন,দুইজন
2,chitron_3534.png,দুই,বসে আছে
3,chitron_1410.png,৩ জন,৪ জন
4,chitron_8886.png,দুই,তিনজন


In [22]:
validdf = pd.DataFrame(columns= ['image_id', 'original', 'predicted'])

validdf, valid_acc = accuracy_check(best_model, val_loader, validdf, "valid")
validdf.head()

  0%|          | 0/12 [00:00<?, ?it/s]

valid set accuracy:  0.060824068018312624


Unnamed: 0,image_id,original,predicted
0,chitron_7881.png,পাঁচজন,চারজন
1,chitron_5952.png,২ জন,দুইজন
2,chitron_1272.png,তিনজন,তিনজন
3,chitron_3587.png,৮ জন,পাঁচজন
4,chitron_3106.png,দুটি,বাংলাদেশ


In [23]:
print(validdf['predicted'].value_counts())

predicted
শাড়ি      180
তিনজন      111
সাদা        96
২ জন        84
লাল         83
          ... 
মোমবাতি      1
রিকশা        1
ফুল          1
পানি         1
ভাত          1
Name: count, Length: 74, dtype: int64


In [24]:
bertmodel = AutoModel.from_pretrained("csebuetnlp/banglabert")
berttokenizer = AutoTokenizer.from_pretrained("csebuetnlp/banglabert")

cos = nn.CosineSimilarity(dim=1, eps=1e-6)

config.json:   0%|          | 0.00/586 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/528k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [25]:
def bertScorer(df):
    sim_list = []
    for index, row in df.iterrows():
        
        original = berttokenizer.encode_plus(normalize(str(row['original'])), return_tensors="pt")
        preds = berttokenizer.encode_plus(normalize(str(row['predicted'])), return_tensors="pt")
        
        with torch.no_grad():
            d1 = bertmodel(original['input_ids'],attention_mask= original['attention_mask'])['last_hidden_state'][:,0,:]
            d2 = bertmodel(preds['input_ids'],attention_mask= preds['attention_mask'])['last_hidden_state'][:,0,:]
        
        sim_list.append(cos(d1, d2).item())
        
    return sim_list

In [26]:
similarity_list = bertScorer(testdf)
len(similarity_list)

1532

In [27]:
testdf['bert_sim_score'] = similarity_list
testdf.head()

Unnamed: 0,image_id,original,predicted,bert_sim_score
0,chitron_924.png,দুইজন,২ টি,0.814027
1,chitron_1150.png,২ জন,দুইজন,0.935986
2,chitron_3534.png,দুই,বসে আছে,0.849392
3,chitron_1410.png,৩ জন,৪ জন,0.997443
4,chitron_8886.png,দুই,তিনজন,0.941828


In [28]:
testdf.to_csv("/kaggle/working/prediction_on_test.csv", index=False)

In [29]:
similarity_list_val = bertScorer(validdf)
len(similarity_list_val)

1529

In [30]:
validdf['bert_sim_score'] = similarity_list_val
validdf.head()

Unnamed: 0,image_id,original,predicted,bert_sim_score
0,chitron_7881.png,পাঁচজন,চারজন,0.995786
1,chitron_5952.png,২ জন,দুইজন,0.935986
2,chitron_1272.png,তিনজন,তিনজন,1.0
3,chitron_3587.png,৮ জন,পাঁচজন,0.921859
4,chitron_3106.png,দুটি,বাংলাদেশ,0.897878


In [31]:
validdf.to_csv("/kaggle/working/prediction_on_valid.csv", index=False)

In [32]:
with open("/kaggle/working/acc.txt", "w") as file:
    file.write(f"Valid accuracy: {valid_acc}\n")    
    file.write(f"Test accuracy: {test_acc}")
