In [4]:
! pip install transformers
! pip install datasets 
! pip install --upgrade tqdm
! pip install torcheval



In [5]:
model_path ="/kaggle/working/models/"

In [6]:
!mkdir -p models

In [7]:
%load_ext autoreload
%autoreload 2

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, ConcatDataset
from torcheval.metrics.functional import binary_auroc
from torch.optim.lr_scheduler import ReduceLROnPlateau

from transformers import RobertaModel, RobertaTokenizerFast

import os
from PIL import Image
import requests
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
import cv2

In [8]:
print(torch.__version__)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device=='cuda')

2.1.2
True


In [9]:
path = "/kaggle/input/facebook-hateful-memes/hateful_memes/"

In [10]:
import pandas as pd
val_seen = pd.read_json(path+'dev_seen.jsonl',lines=True)
val_unseen= pd.read_json(path+'dev_unseen.jsonl',lines=True)
test_seen=pd.read_json(path+'test_seen.jsonl',lines=True)
test_unseen=pd.read_json(path+'test_unseen.jsonl',lines=True)
df_train =pd.read_json(path+'train.jsonl',lines=True)

In [11]:
class HatefulMemesDataset(Dataset):
    def __init__(self, jsonl_file, root_dir, transform=None):
        self.annotations = pd.read_json(jsonl_file, lines=True)
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        return len(self.annotations)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        id = self.annotations.loc[idx, 'id']
        img_path = os.path.join(self.root_dir, self.annotations.loc[idx, 'img'])
        label = self.annotations.loc[idx, 'label']
        text = self.annotations.loc[idx, 'text']

        try:
            # Load image
            image = cv2.imread(img_path)
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Apply transformations
            if self.transform:
                image = self.transform(image)

            sample = {"id": id, "image": image, "label": label, "text": text}

            return sample

        except Exception as e:
            print(f"Error loading image at index {idx}: {e}")
            return None

In [12]:
hm_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
])

In [13]:
train_jsonl = os.path.join(path, "train.jsonl")
val_seen_jsonl = os.path.join(path, "dev_seen.jsonl")
val_unseen_jsonl = os.path.join(path, "dev_unseen.jsonl")
test_seen_jsonl = os.path.join(path, "test_seen.jsonl")
test_unseen_jsonl = os.path.join(path, "test_unseen.jsonl")

In [14]:
# Create datasets
train_dataset = HatefulMemesDataset(jsonl_file=train_jsonl, root_dir=path, transform=hm_transform)
val_seen_dataset = HatefulMemesDataset(jsonl_file=val_seen_jsonl, root_dir=path, transform=hm_transform)
val_unseen_dataset = HatefulMemesDataset(jsonl_file=val_unseen_jsonl, root_dir=path, transform=hm_transform)
test_seen_dataset = HatefulMemesDataset(jsonl_file=test_seen_jsonl, root_dir=path, transform=hm_transform)
test_unseen_dataset = HatefulMemesDataset(jsonl_file=test_unseen_jsonl, root_dir=path, transform=hm_transform)

In [15]:
# Hyperparameters
batch_size = 16

In [16]:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_seen_dataloader = DataLoader(val_seen_dataset, batch_size=batch_size, shuffle=False)
# val_unseen_dataloader = DataLoader(val_unseen_dataset, batch_size=batch_size, shuffle=False)
# test_seen_dataloader = DataLoader(test_seen_dataset, batch_size=batch_size, shuffle=False)
# test_unseen_dataloader = DataLoader(test_unseen_dataset, batch_size=batch_size, shuffle=False)

In [17]:
val_dataset = ConcatDataset([val_seen_dataset, val_unseen_dataset])
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [18]:
test_dataset = ConcatDataset([test_seen_dataset, test_unseen_dataset])
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Metrics

In [19]:
def get_metrics_and_losses(losses, predictions, labels):
  average_loss = losses.mean().item()
  accuracy = (predictions == labels).sum().item() / labels.numel()
  auroc = binary_auroc(predictions, labels)

  return average_loss, accuracy, auroc

# Resnet152

In [20]:
resnet152 = models.resnet152(pretrained=True)
resnet152_fe = nn.Sequential(*list(resnet152.children())[:-1])
for p in resnet152_fe.parameters():
    p.requires_grad = False
resnet152_fe.to(device)

Downloading: "https://download.pytorch.org/models/resnet152-394f9c45.pth" to /root/.cache/torch/hub/checkpoints/resnet152-394f9c45.pth
100%|██████████| 230M/230M [00:13<00:00, 17.6MB/s] 


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


# RoBERTa

In [21]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")
for param in roberta_model.parameters():
    param.requires_grad = False
roberta_model.to(device)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

# Head

In [22]:
class SimpleHead(nn.Module):
    def __init__(self):
        super().__init__()

        self.linear_stack = nn.Sequential(
            nn.BatchNorm1d(2816),
            nn.Linear(2816, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 2),
        )
    
    def forward(self, x):
        logits = self.linear_stack(x)
        return logits

In [23]:
head = SimpleHead()
head.to(device)

SimpleHead(
  (linear_stack): Sequential(
    (0): BatchNorm1d(2816, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (1): Linear(in_features=2816, out_features=1024, bias=True)
    (2): ReLU()
    (3): Dropout(p=0.1, inplace=False)
    (4): Linear(in_features=1024, out_features=2, bias=True)
  )
)

In [27]:
lr = 1e-5
epochs = 40
weight_decay = 1e-3
accumulate_steps = 4

# Define criterion
criterion = nn.CrossEntropyLoss()

# Define optimizer
optimizer = optim.Adam(head.parameters(), lr=lr, weight_decay=weight_decay)

# Initialize learning rate scheduler
scheduler = ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=5, verbose=True)

# Initialize early stopping parameters
early_stopping_counter = 0
early_stopping_patience = 5
best_validation_auroc = float('-inf')

In [28]:
# Train model head
for e in range(epochs):
    # Training
    train_losses = []
    train_predictions = []
    train_labels = []

    roberta_model.train()
    optimizer.zero_grad()

    for i, data in tqdm(enumerate(train_dataloader), total=len(train_dataloader), desc="Epoch " + str(e+1) + " Training"):
        text = data['text']
        image = data['image'].to(device).float()
        labels = data['label'].to(device).to(torch.int64)

        image_features = resnet152_fe(image).squeeze()

        tokenized_input = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_features = roberta_model(**tokenized_input)

        concat_features = torch.cat((image_features, text_features["pooler_output"]), dim=1)

        scores = torch.squeeze(head(concat_features))
        predictions = scores.argmax(dim=-1)

        loss = criterion(scores, labels)
        loss.backward()

        if (i + 1) % accumulate_steps == 0 or i == len(train_dataloader) - 1:
            optimizer.step()
            optimizer.zero_grad()

        train_losses.append(loss.item())
        train_predictions.extend(predictions.cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    train_average_loss, train_accuracy, train_auroc = get_metrics_and_losses(torch.tensor(train_losses), torch.tensor(train_predictions), torch.tensor(train_labels))

    # Validation
    validate_losses = torch.zeros(len(val_dataloader)).to(device)
    validate_predictions = torch.Tensor().to(device)
    validate_labels = torch.Tensor().to(device)

    resnet152_fe.eval()
    roberta_model.eval()
    head.eval()

    with torch.no_grad():
        for i, data in tqdm(enumerate(val_dataloader), total=len(val_dataloader), desc="Epoch " + str(e+1) + " Validation"):
            text = data['text']
            image = data['image'].to(device).float()
            labels = data['label'].to(device).to(torch.int64)

            image_features = resnet152_fe(image).squeeze()

            tokenized_input = tokenizer(text, padding=True, return_tensors="pt").to(device)
            text_features = roberta_model(**tokenized_input)

            concat_features = torch.cat((image_features, text_features["pooler_output"]), dim=1)

            scores = torch.squeeze(head(concat_features))
            predictions = scores.argmax(dim=-1)

            loss = criterion(scores, labels)

            validate_losses[i] = loss
            validate_predictions = torch.cat((validate_predictions, predictions), dim=0)
            validate_labels = torch.cat((validate_labels, labels), dim=0)

    validate_average_loss, validate_accuracy, validate_auroc = get_metrics_and_losses(validate_losses, validate_predictions, validate_labels)

    # Update learning rate scheduler
    scheduler.step(validate_auroc)

    # Early stopping
    if validate_auroc > best_validation_auroc:
        best_validation_auroc = validate_auroc
        early_stopping_counter = 0
        # Save the best model
        torch.save(head, model_path + "/concat_roberta_best.pt")
        print("New best model saved at epoch", e+1)
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered at epoch", e+1)
            break

    # Additional logging or tasks can be performed here
    print("Epoch %d" % (e+1))
    print("Training Loss: %.4f. Validation Loss: %.4f. " % (train_average_loss, validate_average_loss))
    print("Training Accuracy: %.4f. Validation Accuracy: %.4f. " % (train_accuracy, validate_accuracy))
    print("Training AUROC: %.4f. Validation AUROC: %.4f. " % (train_auroc, validate_auroc))
    print("-----------------------------------")

print("Training complete.")

Epoch 1 Training: 100%|██████████| 532/532 [03:48<00:00,  2.33it/s]
Epoch 1 Validation: 100%|██████████| 65/65 [00:25<00:00,  2.55it/s]


New best model saved at epoch 1
Epoch 1
Training Loss: 0.6568. Validation Loss: 0.6987. 
Training Accuracy: 0.6129. Validation Accuracy: 0.5625. 
Training AUROC: 0.5317. Validation AUROC: 0.5023. 
-----------------------------------


Epoch 2 Training: 100%|██████████| 532/532 [03:06<00:00,  2.86it/s]
Epoch 2 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.92it/s]


New best model saved at epoch 2
Epoch 2
Training Loss: 0.6239. Validation Loss: 0.7026. 
Training Accuracy: 0.6562. Validation Accuracy: 0.5625. 
Training AUROC: 0.5374. Validation AUROC: 0.5065. 
-----------------------------------


Epoch 3 Training: 100%|██████████| 532/532 [03:08<00:00,  2.83it/s]
Epoch 3 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.83it/s]


Epoch 3
Training Loss: 0.6059. Validation Loss: 0.7062. 
Training Accuracy: 0.6711. Validation Accuracy: 0.5615. 
Training AUROC: 0.5630. Validation AUROC: 0.5040. 
-----------------------------------


Epoch 4 Training: 100%|██████████| 532/532 [03:07<00:00,  2.83it/s]
Epoch 4 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.92it/s]


New best model saved at epoch 4
Epoch 4
Training Loss: 0.5948. Validation Loss: 0.7080. 
Training Accuracy: 0.6832. Validation Accuracy: 0.5654. 
Training AUROC: 0.5792. Validation AUROC: 0.5115. 
-----------------------------------


Epoch 5 Training: 100%|██████████| 532/532 [03:08<00:00,  2.83it/s]
Epoch 5 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.85it/s]


New best model saved at epoch 5
Epoch 5
Training Loss: 0.5826. Validation Loss: 0.7127. 
Training Accuracy: 0.6956. Validation Accuracy: 0.5712. 
Training AUROC: 0.6003. Validation AUROC: 0.5146. 
-----------------------------------


Epoch 6 Training: 100%|██████████| 532/532 [03:07<00:00,  2.84it/s]
Epoch 6 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.90it/s]


New best model saved at epoch 6
Epoch 6
Training Loss: 0.5733. Validation Loss: 0.7156. 
Training Accuracy: 0.7072. Validation Accuracy: 0.5740. 
Training AUROC: 0.6124. Validation AUROC: 0.5163. 
-----------------------------------


Epoch 7 Training: 100%|██████████| 532/532 [03:08<00:00,  2.82it/s]
Epoch 7 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.91it/s]


New best model saved at epoch 7
Epoch 7
Training Loss: 0.5634. Validation Loss: 0.7172. 
Training Accuracy: 0.7144. Validation Accuracy: 0.5798. 
Training AUROC: 0.6244. Validation AUROC: 0.5261. 
-----------------------------------


Epoch 8 Training: 100%|██████████| 532/532 [03:06<00:00,  2.85it/s]
Epoch 8 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.90it/s]


Epoch 8
Training Loss: 0.5551. Validation Loss: 0.7240. 
Training Accuracy: 0.7207. Validation Accuracy: 0.5712. 
Training AUROC: 0.6335. Validation AUROC: 0.5143. 
-----------------------------------


Epoch 9 Training: 100%|██████████| 532/532 [03:06<00:00,  2.85it/s]
Epoch 9 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.88it/s]


Epoch 9
Training Loss: 0.5441. Validation Loss: 0.7255. 
Training Accuracy: 0.7352. Validation Accuracy: 0.5721. 
Training AUROC: 0.6509. Validation AUROC: 0.5152. 
-----------------------------------


Epoch 10 Training: 100%|██████████| 532/532 [03:07<00:00,  2.83it/s]
Epoch 10 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.86it/s]


Epoch 10
Training Loss: 0.5375. Validation Loss: 0.7218. 
Training Accuracy: 0.7346. Validation Accuracy: 0.5673. 
Training AUROC: 0.6523. Validation AUROC: 0.5134. 
-----------------------------------


Epoch 11 Training: 100%|██████████| 532/532 [03:06<00:00,  2.85it/s]
Epoch 11 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.87it/s]


Epoch 11
Training Loss: 0.5293. Validation Loss: 0.7328. 
Training Accuracy: 0.7482. Validation Accuracy: 0.5779. 
Training AUROC: 0.6714. Validation AUROC: 0.5208. 
-----------------------------------


Epoch 12 Training: 100%|██████████| 532/532 [03:06<00:00,  2.85it/s]
Epoch 12 Validation: 100%|██████████| 65/65 [00:22<00:00,  2.88it/s]

Early stopping triggered at epoch 12
Training complete.





In [29]:
# Rebuild the model architecture
best_head = torch.load(model_path + "/concat_roberta_best.pt")
best_head.to(device)

# Load the saved head weights
# best_head.load_state_dict(torch.load(model_path + "/concat_roberta_best.pt"))

# Evaluate on test set
test_losses = torch.zeros(len(test_dataloader)).to(device)
test_predictions = torch.Tensor().to(device)
test_labels = torch.Tensor().to(device)

best_head.eval()  # Set the head to evaluation mode

with torch.no_grad():
    for i, data in tqdm(enumerate(test_dataloader), total=len(test_dataloader), desc="Testing"):
        text = data['text']
        image = data['image'].to(device).float()
        labels = data['label'].to(device).to(torch.int64)

        image_features = resnet152_fe(image).squeeze()

        tokenized_input = tokenizer(text, padding=True, return_tensors="pt").to(device)
        text_features = roberta_model(**tokenized_input)

        concat_features = torch.cat((image_features, text_features["pooler_output"]), dim=1)

        scores = torch.squeeze(best_head(concat_features))
        predictions = scores.argmax(dim=-1)

        loss = criterion(scores, labels)

        test_losses[i] = loss
        test_predictions = torch.cat((test_predictions, predictions), dim=0)
        test_labels = torch.cat((test_labels, labels), dim=0)

test_average_loss, test_accuracy, test_auroc = get_metrics_and_losses(test_losses, test_predictions, test_labels)

print("Test Loss: %.4f" % test_average_loss)
print("Test Accuracy: %.4f" % test_accuracy)
print("Test AUROC: %.4f" % test_auroc)

Testing: 100%|██████████| 188/188 [01:17<00:00,  2.42it/s]

Test Loss: 0.6885
Test Accuracy: 0.5883
Test AUROC: 0.5199



