In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import zipfile as zf
files = zf.ZipFile("/content/drive/MyDrive/FBHM.zip", 'r')
files.extractall('memes')
files.close()

In [3]:
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->efficientnet_pytorch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->efficientnet_pytorch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->efficientnet_pytorch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metada

In [4]:
import os
import json
import numpy as np
import torch
import torch.nn as nn

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [6]:
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torchvision import transforms
from transformers import LxmertModel, LxmertTokenizer
from efficientnet_pytorch import EfficientNet
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
from PIL import Image
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [None]:
import os

class ToxicMemeDataset(Dataset):
    def __init__(self, data_list, img_dir, tokenizer, max_len, transform=None):
        self.data = data_list
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.transform = transform

    def __len__(self):
        return len(self.data)  # Return the length of the data list

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item["text"]
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # Ensure correct image path handling
        img_path = os.path.join(self.img_dir, item["img"].lstrip("/"))  # Remove leading '/'

        # Check if image exists
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"Image not found: {img_path}")

        image = Image.open(img_path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        return {
            "input_ids": encoding["input_ids"].squeeze(0).long(),
            "attention_mask": encoding["attention_mask"].squeeze(0).long(),
            "image": image,
            "label": torch.tensor(item["label"], dtype=torch.long),
        }

In [None]:
class ToxicMemeClassifier(nn.Module):
    def __init__(self, num_classes=2):
        super().__init__()
        self.lxmert = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")

        # EfficientNet backbone with partial unfreezing
        self.cnn_backbone = EfficientNet.from_pretrained('efficientnet-b4')
        for param in self.cnn_backbone.parameters():
            param.requires_grad = False
        for block in self.cnn_backbone._blocks[-6:]:
            for param in block.parameters():
                param.requires_grad = True

        self.visual_fc = nn.Linear(self.cnn_backbone._fc.in_features, 2048)
        self.cnn_backbone._fc = nn.Identity()

        # Enhanced classifier head
        self.classifier = nn.Sequential(
            nn.Linear(self.lxmert.config.hidden_size, 512),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, input_ids, attention_mask, images):
        # Ensure image tensor is float
        images = images.float()

        visual_feats = self.cnn_backbone(images)
        visual_feats = self.visual_fc(visual_feats).unsqueeze(1)

        batch_size = visual_feats.size(0)
        visual_pos = torch.tensor([[0, 0, 1, 1]], device=device).repeat(batch_size, 1, 1).float()

        outputs = self.lxmert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_feats=visual_feats,
            visual_pos=visual_pos,
        )
        return self.classifier(outputs.pooled_output)

In [None]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.8, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = nn.CrossEntropyLoss(reduction='none')(inputs, targets)
        pt = torch.exp(-ce_loss)
        return (self.alpha * (1 - pt) ** self.gamma * ce_loss).mean()

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    model = model.to(device).float()
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.2)
    best_f1 = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

        for batch in progress:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["image"].to(device)
            labels = batch["label"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
            loss = criterion(outputs, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()
            progress.set_postfix(loss=loss.item())

        val_metrics = evaluate_model(model, val_loader, device)
        scheduler.step(val_metrics['loss'])

        if val_metrics['f1'] > best_f1:
            best_f1 = val_metrics['f1']
            torch.save(model.state_dict(), "best_model.pth")
            print(f"New best model saved with F1: {best_f1:.4f}")

In [None]:
def evaluate_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    total_loss = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            images = batch["image"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return {
        'loss': total_loss / len(data_loader),
        'accuracy': (np.array(all_preds) == np.array(all_labels)).mean(),
        'precision': precision_score(all_labels, all_preds, zero_division=0),
        'recall': recall_score(all_labels, all_preds, zero_division=0),
        'f1': f1_score(all_labels, all_preds, zero_division=0)
    }

In [7]:
# Main Execution
if __name__ == "__main__":
    # Configuration
    BATCH_SIZE = 32
    MAX_LEN = 128
    LR = 2e-5
    EPOCHS = 5

    # Data Preparation
    with open("/content/train_balanced.json", "r") as f:
        all_data = json.load(f)

    # Split dataset
    train_size = int(0.8 * len(all_data))
    train_data, val_data = torch.utils.data.random_split(all_data, [train_size, len(all_data)-train_size])

    # Transforms with augmentation
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(0.2, 0.2, 0.2),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    val_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")

    train_dataset = ToxicMemeDataset(
        [all_data[i] for i in train_data.indices],
        "/content/memes/FBHM/data/img",
        tokenizer,
        MAX_LEN,
        train_transform
    )

    val_dataset = ToxicMemeDataset(
        [all_data[i] for i in val_data.indices],
        "/content/memes/FBHM/data/img",
        tokenizer,
        MAX_LEN,
        val_transform
    )

    # Handle class imbalance
    labels = [d["label"] for d in train_dataset.data]
    class_counts = np.bincount(labels)
    class_weights = 1 / np.sqrt(class_counts)
    samples_weights = class_weights[labels]
    sampler = WeightedRandomSampler(samples_weights, len(samples_weights))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=sampler)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Initialize model and training components
    model = ToxicMemeClassifier()
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=0.01)
    criterion = FocalLoss(alpha=0.75, gamma=1.5)

    # Train and evaluate
    train_model(model, train_loader, val_loader, optimizer, criterion, device, EPOCHS)
    model.load_state_dict(torch.load("best_model.pth"))
    final_metrics = evaluate_model(model, val_loader, device)

    print("\nFinal Validation Metrics:")
    print(f"Accuracy: {final_metrics['accuracy']:.4f}")
    print(f"Precision: {final_metrics['precision']:.4f}")
    print(f"Recall: {final_metrics['recall']:.4f}")
    print(f"F1 Score: {final_metrics['f1']:.4f}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/153 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some weights of the model checkpoint at unc-nlp/lxmert-base-uncased were not used when initializing LxmertModel: ['answer_head.logit_fc.0.bias', 'answer_head.logit_fc.0.weight', 'answer_head.logit_fc.2.bias', 'answer_head.logit_fc.2.weight', 'answer_head.logit_fc.3.bias', 'answer_head.logit_fc.3.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'obj_predict_head.decoder_dict.attr.bias', 'obj_predict_head.decoder_dict.attr.weight', 'obj_predict_head.decoder_dict.feat.bias', 'obj_predict_head.decoder_dict.feat.weight', 'obj_predict_head.decoder_dict.obj.bias', 'obj_predict_head.decoder_dict.obj.weight', 'obj_predict_head.transform.LayerNorm.bias', 'obj_predict_head.transform.LayerNorm.weight', 'obj_predict_head.transform.dense.bias', 'obj_pred

model.safetensors:   0%|          | 0.00/1.01G [00:00<?, ?B/s]


  0%|          | 0.00/74.4M [00:00<?, ?B/s][A
 14%|█▎        | 10.1M/74.4M [00:00<00:00, 101MB/s][A
 48%|████▊     | 35.9M/74.4M [00:00<00:00, 198MB/s][A
100%|██████████| 74.4M/74.4M [00:00<00:00, 219MB/s]


Loaded pretrained weights for efficientnet-b4



Epoch 1/5:   0%|          | 0/134 [00:00<?, ?it/s][A
Epoch 1/5:   0%|          | 0/134 [00:03<?, ?it/s, loss=0.189][A
Epoch 1/5:   1%|          | 1/134 [00:03<08:14,  3.72s/it, loss=0.189][A
Epoch 1/5:   1%|          | 1/134 [00:04<08:14,  3.72s/it, loss=0.182][A
Epoch 1/5:   1%|▏         | 2/134 [00:04<04:23,  2.00s/it, loss=0.182][A
Epoch 1/5:   1%|▏         | 2/134 [00:05<04:23,  2.00s/it, loss=0.188][A
Epoch 1/5:   2%|▏         | 3/134 [00:05<03:05,  1.42s/it, loss=0.188][A
Epoch 1/5:   2%|▏         | 3/134 [00:05<03:05,  1.42s/it, loss=0.198][A
Epoch 1/5:   3%|▎         | 4/134 [00:05<02:28,  1.15s/it, loss=0.198][A
Epoch 1/5:   3%|▎         | 4/134 [00:06<02:28,  1.15s/it, loss=0.192][A
Epoch 1/5:   4%|▎         | 5/134 [00:06<02:10,  1.01s/it, loss=0.192][A
Epoch 1/5:   4%|▎         | 5/134 [00:07<02:10,  1.01s/it, loss=0.178][A
Epoch 1/5:   4%|▍         | 6/134 [00:07<01:56,  1.10it/s, loss=0.178][A
Epoch 1/5:   4%|▍         | 6/134 [00:08<01:56,  1.10it/s, loss=0

New best model saved with F1: 0.6439


Epoch 2/5: 100%|██████████| 134/134 [01:41<00:00,  1.33it/s, loss=0.179]
Epoch 3/5: 100%|██████████| 134/134 [01:40<00:00,  1.33it/s, loss=0.108]
Epoch 4/5: 100%|██████████| 134/134 [01:40<00:00,  1.33it/s, loss=0.164]
Epoch 5/5: 100%|██████████| 134/134 [01:40<00:00,  1.33it/s, loss=0.0771]
  model.load_state_dict(torch.load("best_model.pth"))



Final Validation Metrics:
Accuracy: 0.6293
Precision: 0.6411
Recall: 0.6468
F1 Score: 0.6439
