In [1]:
from google.colab import drive
drive.mount('/content/drive')



Mounted at /content/drive


In [2]:
!pip install  torchvision  pillow datasets



Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.5.1->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.5.1->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.5.1->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.5.

In [3]:
!pip install transformers
!pip install torch




In [4]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import os

In [5]:

import zipfile as zf
files = zf.ZipFile("/content/drive/MyDrive/FBHM.zip", 'r')
files.extractall('memes')
files.close()

In [6]:
import json

# Path to train.json in Google Drive
json_path = "/content/train_with_text_&_Kosmos_captions.json"

# Load JSON data
with open(json_path, "r") as file:
    data = json.load(file)

# Print an example
print(data[0])


{'id': 71083, 'img': 'img/71083.png', 'label': 0, 'text': 'i once dumped a cross eyed chick thought she was seeing someone else', 'caption': '"Describe the image in detail, including objects, people, actions, attire, facial expressions, and background elements. Do not include any text present in the image in your description." Two small chickens are standing next to each other, with one of them looking at the camera. The caption reads, ""I once dumped a cross-eyed chick thought she was seeing someone else.""'}


In [16]:


class MemeDataset(Dataset):
    def __init__(self, data, image_root, siglip_processor, roberta_tokenizer):
        self.data = data
        self.image_root = image_root  # Root directory for images
        self.siglip_processor = siglip_processor
        self.roberta_tokenizer = roberta_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Load image
        image_path = os.path.join(self.image_root, item["img"])
        image = Image.open(image_path).convert("RGB")
        image_inputs = self.siglip_processor(images=image, return_tensors="pt")

        # Process Text + Caption
        text_inputs = self.roberta_tokenizer(
            item["text"] + " " + item["caption"],  # Combine both text and caption
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        )

        # Label
        label = torch.tensor(item["label"], dtype=torch.long)

        return image_inputs, text_inputs, label

# Define dataset path
image_root = "/content/memes/data/"

# Initialize dataset
dataset = MemeDataset(data, image_root, siglip_processor, roberta_tokenizer)
print(f"Total memes in dataset: {len(dataset)}")


Total memes in dataset: 6800


In [50]:
from torch.utils.data import DataLoader

# DataLoader for training
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [51]:
import torch
from transformers import SiglipProcessor, SiglipModel, RobertaTokenizer, RobertaModel
from PIL import Image


In [52]:
from transformers import AutoModel, AutoProcessor, AutoTokenizer

In [53]:

# Load SigLIP for images
siglip_processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-224")
siglip_model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")


In [54]:

# Load RoBERTa for text
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
import torch.nn as nn

In [14]:


class MultimodalToxicClassifier(nn.Module):
    def __init__(self, siglip_model, roberta_model, embedding_dim=512):
        super(MultimodalToxicClassifier, self).__init__()
        self.siglip = siglip_model
        self.roberta = roberta_model

        # Freeze the pretrained models (Optional: Unfreeze for fine-tuning)
        for param in self.siglip.parameters():
            param.requires_grad = False
        for param in self.roberta.parameters():
            param.requires_grad = False

        # MLP Classifier
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1),  # Output: Toxic or Non-Toxic (Binary Classification)
            nn.Sigmoid()
        )

    def forward(self, image_inputs, text_inputs):
        # Extract embeddings
        image_embeds = self.siglip.get_image_features(**image_inputs)

        text_embeds = self.roberta(**text_inputs).pooler_output

        # Concatenate both embeddings
        combined_embeds = torch.cat((image_embeds, text_embeds), dim=1)

        # Pass through classifier
        output = self.classifier(combined_embeds)
        return output


In [15]:
class MultimodalToxicClassifier(nn.Module):
    def __init__(self, siglip_model, roberta_model, embedding_dim=768):
        super(MultimodalToxicClassifier, self).__init__()
        self.siglip = siglip_model
        self.roberta = roberta_model

        # Optionally freeze the pretrained models
        for param in self.siglip.parameters():
            param.requires_grad = False
        for param in self.roberta.parameters():
            param.requires_grad = False

        # MLP classifier (adjust embedding_dim as needed)
        self.classifier = nn.Sequential(
            nn.Linear(embedding_dim * 2, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1),
            nn.Sigmoid()
        )

    def forward(self, image_inputs, text_inputs):
        # Get image embeddings using SigLIP
        image_embeds = self.siglip.get_image_features(**image_inputs)
        # Get text embeddings using RoBERTa
        text_embeds = self.roberta(**text_inputs).pooler_output

        # Force both embeddings to the device of the classifier
        target_device = self.classifier[0].weight.device
        image_embeds = image_embeds.to(target_device)
        text_embeds = text_embeds.to(target_device)

        # Concatenate and pass through classifier
        combined_embeds = torch.cat((image_embeds, text_embeds), dim=1)
        output = self.classifier(combined_embeds)
        return output


In [56]:
model = MultimodalToxicClassifier(siglip_model, roberta_model)


In [58]:
import torch.optim as optim

# Loss Function & Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.classifier.parameters(), lr=2e-5)

# Move to GPU (if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


MultimodalToxicClassifier(
  (siglip): SiglipModel(
    (text_model): SiglipTextTransformer(
      (embeddings): SiglipTextEmbeddings(
        (token_embedding): Embedding(32000, 768)
        (position_embedding): Embedding(64, 768)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-11): 12 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out

In [59]:
print(type(image_inputs))  # Should be a dict
print(image_inputs.keys())  # Check available keys


NameError: name 'image_inputs' is not defined

In [60]:
image_inputs["pixel_values"] = image_inputs["pixel_values"].squeeze(1)  # Remove the extra dimension

print(image_inputs["pixel_values"].shape)  # Should be (32, 3, 224, 224)


NameError: name 'image_inputs' is not defined

In [61]:
print(image_inputs["pixel_values"].shape)


NameError: name 'image_inputs' is not defined

In [62]:
print("Text Inputs Keys:", text_inputs.keys())


NameError: name 'text_inputs' is not defined

In [63]:
# Ensure the image tensor is exactly 4D: (batch_size, 3, height, width)
while image_inputs["pixel_values"].ndim > 4:
    image_inputs["pixel_values"] = image_inputs["pixel_values"].squeeze(1)


NameError: name 'image_inputs' is not defined

In [64]:
if "input_ids" not in text_inputs:
    raise ValueError(f"Missing 'input_ids' in text_inputs. Available keys: {text_inputs.keys()}")


NameError: name 'text_inputs' is not defined

In [65]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        labels = labels.to(device).float()

        # Squeeze image pixel dimension if needed
        if image_inputs["pixel_values"].ndim == 5:
            image_inputs["pixel_values"] = image_inputs["pixel_values"].squeeze(1)

        optimizer.zero_grad()
        outputs = model(image_inputs, text_inputs).squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")


KeyboardInterrupt: 

In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        labels = labels.to(device).float()

        # Ensure pixel_values shape is correct
        if image_inputs["pixel_values"].ndim == 5:
            image_inputs["pixel_values"] = image_inputs["pixel_values"].squeeze(1)

        optimizer.zero_grad()
        # Call model with the dictionaries as positional arguments
        outputs = model(image_inputs, text_inputs).squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")


In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        labels = labels.to(device).float()

        # Fix image dimensions if needed
        if len(image_inputs['pixel_values'].shape) == 5:
            image_inputs['pixel_values'] = image_inputs['pixel_values'].squeeze(1)

        optimizer.zero_grad()

        # Pass image and text inputs correctly to the model
        outputs = model(
            image_inputs=image_inputs,  # Contains 'pixel_values'
            input_ids=text_inputs['input_ids'],  # Explicitly pass text keys
            attention_mask=text_inputs['attention_mask']
        ).squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

TypeError: MultimodalToxicClassifier.forward() got an unexpected keyword argument 'input_ids'

In [None]:
num_epochs = 5

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        # Move data to device
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        labels = labels.to(device).float()

        # Fix image dimensions (remove extra dimension)
        if image_inputs['pixel_values'].ndim == 5:
            image_inputs['pixel_values'] = image_inputs['pixel_values'].squeeze(1)

        optimizer.zero_grad()

        # Pass BOTH dictionaries directly to the model
        outputs = model(image_inputs, text_inputs).squeeze()

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

ValueError: too many values to unpack (expected 2)

In [66]:


def extract_image_features(image_path):
    """Extracts image embeddings using SigLIP"""
    image = Image.open(image_path).convert("RGB")
    inputs = siglip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = siglip_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze(0)  # CLS token

def extract_text_features(text):
    """Extracts text embeddings using RoBERTa"""
    inputs = roberta_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze(0)  # CLS token


In [67]:
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel

# Load SigLIP model (pretrained)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "google/siglip-base-patch16-224"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)

# Function to get image embeddings
def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)

    with torch.no_grad():
        image_features = model.get_image_features(**inputs)

    return image_features


In [28]:

# Install dependencies
!pip install torchvision pillow datasets transformers torch



In [68]:



# Imports
import torch
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import zipfile
import json
from transformers import SiglipProcessor, SiglipModel, RobertaTokenizer, RobertaModel
import torch.nn as nn
import torch.optim as optim

In [69]:

# Load JSON data
with open("/content/train_with_text_&_Kosmos_captions.json", "r") as file:
    data = json.load(file)

In [70]:



# Define dataset class with dimension fixes
class MemeDataset(Dataset):
    def __init__(self, data, image_root, siglip_processor, roberta_tokenizer):
        self.data = data
        self.image_root = image_root
        self.siglip_processor = siglip_processor
        self.roberta_tokenizer = roberta_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Load and process image
        image_path = os.path.join(self.image_root, item["img"])
        image = Image.open(image_path).convert("RGB")
        image_inputs = self.siglip_processor(images=image, return_tensors="pt")

        # Process text and remove batch dimension
        text = item["text"] + " " + item["caption"]
        text_inputs = self.roberta_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        )
        text_inputs = {k: v.squeeze(0) for k, v in text_inputs.items()}

        label = torch.tensor(item["label"], dtype=torch.long)

        return image_inputs, text_inputs, label

In [71]:
# Initialize models and processors
siglip_processor = SiglipProcessor.from_pretrained("google/siglip-base-patch16-224")
siglip_model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta_model = RobertaModel.from_pretrained("roberta-base")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [82]:



# Create dataset
dataset = MemeDataset(
    data=data,
    image_root="/content/memes/FBHM/data",
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

# Create dataloader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)




In [83]:
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True  # Attempt to load truncated images

class MemeDataset(Dataset):
    def __init__(self, data, image_root, siglip_processor, roberta_tokenizer):
        self.data = data
        self.image_root = image_root
        self.siglip_processor = siglip_processor
        self.roberta_tokenizer = roberta_tokenizer
        self.valid_indices = self._validate_images()  # Filter out corrupted images

    def _validate_images(self):
        valid_indices = []
        for idx, item in enumerate(self.data):
            image_path = os.path.join(self.image_root, item["img"])
            try:
                with Image.open(image_path) as img:
                    img.verify()  # Verify if the image is not corrupted
                valid_indices.append(idx)
            except Exception as e:
                print(f"Corrupted image skipped: {image_path} - {e}")
        return valid_indices

    def __len__(self):
        return len(self.valid_indices)

    def __getitem__(self, idx):
        item = self.data[self.valid_indices[idx]]

        # Load and process image
        image_path = os.path.join(self.image_root, item["img"])
        image = Image.open(image_path).convert("RGB")
        image_inputs = self.siglip_processor(images=image, return_tensors="pt")

        # Process text and remove batch dimension
        text = item["text"] + " " + item["caption"]
        text_inputs = self.roberta_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        )
        text_inputs = {k: v.squeeze(0) for k, v in text_inputs.items()}

        label = torch.tensor(item["label"], dtype=torch.long)

        return image_inputs, text_inputs, label

In [84]:
# Initialize dataset
dataset = MemeDataset(
    data=data,
    image_root="/content/memes/FBHM/data",
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

print(f"Total valid memes in dataset: {len(dataset)}")

Total valid memes in dataset: 6800


In [85]:
# Initialize dataset
dataset = MemeDataset(
    data=data,
    image_root="/content/memes/FBHM/data",
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

print(f"Total valid memes in dataset: {len(dataset)}")

Total valid memes in dataset: 6800


In [86]:
class MultimodalToxicClassifier(nn.Module):
    def __init__(self, siglip_model, roberta_model):
        super().__init__()
        self.siglip = siglip_model
        self.roberta = roberta_model

        # Freeze pretrained models
        for param in self.siglip.parameters():
            param.requires_grad = False
        for param in self.roberta.parameters():
            param.requires_grad = False

        # Corrected Classifier (768*2 = 1536 input features)
        self.classifier = nn.Sequential(
            nn.Linear(1536, 512),  # SigLIP (768) + RoBERTa (768) = 1536
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 1),
            nn.Sigmoid()
        )

    def forward(self, image_inputs, text_inputs):
        # Get image features (768D)
        image_embeds = self.siglip.get_image_features(**image_inputs)

        # Get text features (768D)
        text_embeds = self.roberta(**text_inputs).pooler_output

        # Concatenate features (1536D)
        combined = torch.cat((image_embeds, text_embeds), dim=1)
        return self.classifier(combined)

In [87]:
# Test dimension alignment
test_image = torch.randn(32, 3, 224, 224).to(device)
test_text = torch.randint(0, 1000, (32, 128)).to(device)

with torch.no_grad():
    image_feats = model.siglip.get_image_features(pixel_values=test_image)
    text_feats = model.roberta(input_ids=test_text).pooler_output

print(f"Image features shape: {image_feats.shape}")  # Should be [32, 768]
print(f"Text features shape: {text_feats.shape}")    # Should be [32, 768]
print(f"Combined shape: {torch.cat((image_feats, text_feats), dim=1).shape}")  # Should be [32, 1536]

Image features shape: torch.Size([32, 768])
Text features shape: torch.Size([32, 768])
Combined shape: torch.Size([32, 1536])


In [88]:

# Initialize training components
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalToxicClassifier(siglip_model, roberta_model).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-5)



In [89]:
# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        # Move data to device
        image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
        text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
        labels = labels.to(device).float()

        # Fix image dimensions
        if image_inputs['pixel_values'].dim() == 5:
            image_inputs['pixel_values'] = image_inputs['pixel_values'].squeeze(1)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(image_inputs, text_inputs).squeeze()

        # Calculate loss
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

Epoch [1/10], Loss: 0.6473
Epoch [2/10], Loss: 0.6198
Epoch [3/10], Loss: 0.5969
Epoch [4/10], Loss: 0.5804
Epoch [5/10], Loss: 0.5697
Epoch [6/10], Loss: 0.5574
Epoch [7/10], Loss: 0.5467
Epoch [8/10], Loss: 0.5414
Epoch [9/10], Loss: 0.5342
Epoch [10/10], Loss: 0.5295


In [93]:
from sklearn.model_selection import train_test_split

# Split data into 80% train and 20% validation
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=[item["label"] for item in data]  # Preserve class balance
)

# Create datasets
train_dataset = MemeDataset(
    data=train_data,
    image_root="/content/memes/FBHM/data",  # Replace with your image root path
    siglip_processor=siglip_processor,  # Pass the SigLIP processor
    roberta_tokenizer=roberta_tokenizer  # Pass the RoBERTa tokenizer
)

val_dataset = MemeDataset(
    data=val_data,
    image_root="/content/memes/FBHM/data",  # Replace with your image root path
    siglip_processor=siglip_processor,  # Pass the SigLIP processor
    roberta_tokenizer=roberta_tokenizer  # Pass the RoBERTa tokenizer
)

# Create dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [94]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_model(model, dataloader, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():  # Disable gradient computation
        for image_inputs, text_inputs, labels in dataloader:
            # Move data to device
            image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
            labels = labels.to(device).float()

            # Fix image dimensions
            if image_inputs['pixel_values'].dim() == 5:
                image_inputs['pixel_values'] = image_inputs['pixel_values'].squeeze(1)

            # Forward pass
            outputs = model(image_inputs, text_inputs).squeeze()

            # Calculate loss
            loss = criterion(outputs, labels)
            total_loss += loss.item()

            # Convert probabilities to binary predictions
            preds = (outputs > 0.5).float()  # Threshold at 0.5
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)

    return {
        "loss": total_loss / len(dataloader),
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

In [95]:
num_epochs = 5
best_f1 = 0  # Track best F1-score

for epoch in range(num_epochs):
    # --- Training ---
    model.train()
    total_loss = 0

    for image_inputs, text_inputs, labels in dataloader:
        # Existing training code...
        total_loss += loss.item()

    avg_train_loss = total_loss / len(dataloader)

    # --- Validation ---
    val_metrics = evaluate_model(model, val_dataloader, device)

    print(f"\nEpoch [{epoch+1}/{num_epochs}]")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {val_metrics['loss']:.4f}")
    print(f"Val Accuracy: {val_metrics['accuracy']:.4f}")
    print(f"Precision: {val_metrics['precision']:.4f} | Recall: {val_metrics['recall']:.4f}")
    print(f"F1-Score: {val_metrics['f1']:.4f}")

    # Save best model
    if val_metrics['f1'] > best_f1:
        best_f1 = val_metrics['f1']
        torch.save(model.state_dict(), "best_model.pt")
        print("Saved best model checkpoint!")


Epoch [1/5]
Train Loss: 0.5986 | Val Loss: 0.5259
Val Accuracy: 0.7419
Precision: 0.6908 | Recall: 0.5082
F1-Score: 0.5856
Saved best model checkpoint!

Epoch [2/5]
Train Loss: 0.5986 | Val Loss: 0.5259
Val Accuracy: 0.7419
Precision: 0.6908 | Recall: 0.5082
F1-Score: 0.5856

Epoch [3/5]
Train Loss: 0.5986 | Val Loss: 0.5259
Val Accuracy: 0.7419
Precision: 0.6908 | Recall: 0.5082
F1-Score: 0.5856

Epoch [4/5]
Train Loss: 0.5986 | Val Loss: 0.5259
Val Accuracy: 0.7419
Precision: 0.6908 | Recall: 0.5082
F1-Score: 0.5856

Epoch [5/5]
Train Loss: 0.5986 | Val Loss: 0.5259
Val Accuracy: 0.7419
Precision: 0.6908 | Recall: 0.5082
F1-Score: 0.5856


In [96]:
# Load best model
model.load_state_dict(torch.load("best_model.pt"))

# Evaluate
test_metrics = evaluate_model(model, val_dataloader, device)  # Replace with test_dataloader if available

print("\n--- Final Results ---")
print(f"Loss: {test_metrics['loss']:.4f}")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"F1-Score: {test_metrics['f1']:.4f}")

  model.load_state_dict(torch.load("best_model.pt"))



--- Final Results ---
Loss: 0.5259
Accuracy: 0.7419
Precision: 0.6908
Recall: 0.5082
F1-Score: 0.5856


In [101]:
import json

# Load test data
with open("/content/convertjson.json", "r") as file:
    test_data = json.load(file)

# Create test dataset
test_dataset = MemeDataset(
    data=test_data,
    image_root="/content/memes/FBHM/data",  # Replace with your image root path
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

# Create test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [103]:
class MemeDataset(Dataset):
    def __init__(self, data, image_root, siglip_processor, roberta_tokenizer):
        self.data = data
        self.image_root = image_root
        self.siglip_processor = siglip_processor
        self.roberta_tokenizer = roberta_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Load and process image
        image_path = os.path.join(self.image_root, item["img"])
        image = Image.open(image_path).convert("RGB")
        image_inputs = self.siglip_processor(images=image, return_tensors="pt")

        # Process text (handle missing "caption" field)
        text = item["text"]
        if "caption" in item:  # Check if "caption" exists
            text += " " + item["caption"]

        text_inputs = self.roberta_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        )
        text_inputs = {k: v.squeeze(0) for k, v in text_inputs.items()}

        label = torch.tensor(item["label"], dtype=torch.long)

        return image_inputs, text_inputs, label

In [105]:
# Create test dataset
test_dataset = MemeDataset(
    data=test_data,
    image_root="/content/memes/FBHM/data",  # Replace with your image root path
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

# Create test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [106]:
# Load the trained model
model.load_state_dict(torch.load("/content/best_model.pt", weights_only=True))
model.to(device)

MultimodalToxicClassifier(
  (siglip): SiglipModel(
    (text_model): SiglipTextTransformer(
      (embeddings): SiglipTextEmbeddings(
        (token_embedding): Embedding(32000, 768)
        (position_embedding): Embedding(64, 768)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-11): 12 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=768, out_features=768, bias=True)
              (v_proj): Linear(in_features=768, out_features=768, bias=True)
              (q_proj): Linear(in_features=768, out_features=768, bias=True)
              (out_proj): Linear(in_features=768, out_features=768, bias=True)
            )
            (layer_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=768, out_features=3072, bias=True)
              (fc2): Linear(in_features=3072, out

In [107]:
# Evaluate on test data
test_metrics = evaluate_model(model, test_dataloader, device)

# Print results
print("--- Test Results ---")
print(f"Accuracy: {test_metrics['accuracy']:.4f}")
print(f"Precision: {test_metrics['precision']:.4f}")
print(f"Recall: {test_metrics['recall']:.4f}")
print(f"F1-Score: {test_metrics['f1']:.4f}")

--- Test Results ---
Accuracy: 0.5920
Precision: 0.6949
Recall: 0.3280
F1-Score: 0.4457


In [108]:
class MemeDataset(Dataset):
    def __init__(self, data, image_root, siglip_processor, roberta_tokenizer):
        self.data = data
        self.image_root = image_root
        self.siglip_processor = siglip_processor
        self.roberta_tokenizer = roberta_tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # Load and process image
        image_path = os.path.join(self.image_root, item["img"])
        image = Image.open(image_path).convert("RGB")
        image_inputs = self.siglip_processor(images=image, return_tensors="pt")

        # Process text and caption
        text = item["text"] + " " + item["caption"]  # Combine text and caption
        text_inputs = self.roberta_tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=128
        )
        text_inputs = {k: v.squeeze(0) for k, v in text_inputs.items()}

        label = torch.tensor(item["label"], dtype=torch.long)

        return image_inputs, text_inputs, label

In [110]:
# Read the file line by line
with open("/content/train_with_text_&_Kosmos_captions.json", "r") as file:
    lines = file.readlines()

# Parse each line as a JSON object
data = [json.loads(line) for line in lines]

# Save as a valid JSON array
with open("/content/train_with_text_&_Kosmos_captions_fixed.json", "w") as file:
    json.dump(data, file)

JSONDecodeError: Expecting value: line 2 column 1 (char 2)

In [109]:
import json

# Load test data
with open("/content/train_with_text_&_Kosmos_captions.json", "r") as file:
    test_data = json.load(file)

# Create test dataset
test_dataset = MemeDataset(
    data=test_data,
    image_root="/content/memes/FBHM/data",  # Replace with your image root path
    siglip_processor=siglip_processor,
    roberta_tokenizer=roberta_tokenizer
)

# Create test dataloader
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

JSONDecodeError: Extra data: line 58851 column 3 (char 5482370)

In [97]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [99]:
!pip install ray[tune]

Collecting ray[tune]
  Downloading ray-2.43.0-cp311-cp311-manylinux2014_x86_64.whl.metadata (19 kB)
Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ray-2.43.0-cp311-cp311-manylinux2014_x86_64.whl (67.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.7/67.7 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tensorboardX, ray
Successfully installed ray-2.43.0 tensorboardX-2.6.2.2


In [98]:
import optuna

def objective(trial):
    # Define hyperparameters to tune
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)

    # Initialize model and optimizer
    model = MultimodalToxicClassifier(siglip_model, roberta_model, dropout_rate=dropout_rate).to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Create dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Train and evaluate
    val_f1 = train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, num_epochs=5)

    return val_f1

# Run optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Best hyperparameters
print("Best hyperparameters:", study.best_params)

[I 2025-02-28 16:40:31,865] A new study created in memory with name: no-name-164fb9a4-925d-40dc-9eac-e9faf9b84c1b
[W 2025-02-28 16:40:31,867] Trial 0 failed with parameters: {'learning_rate': 5.5493495737884776e-05, 'batch_size': 64, 'dropout_rate': 0.45237970398444116} because of the following error: TypeError("MultimodalToxicClassifier.__init__() got an unexpected keyword argument 'dropout_rate'").
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "<ipython-input-98-2ef56cf1293e>", line 10, in objective
    model = MultimodalToxicClassifier(siglip_model, roberta_model, dropout_rate=dropout_rate).to(device)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
TypeError: MultimodalToxicClassifier.__init__() got an unexpected keyword argument 'dropout_rate'
[W 2025-02-28 16:40:31,869] Tria

TypeError: MultimodalToxicClassifier.__init__() got an unexpected keyword argument 'dropout_rate'

In [100]:
from ray import tune
from ray.tune.schedulers import ASHAScheduler

def trainable(config):
    # Initialize model and optimizer
    model = MultimodalToxicClassifier(siglip_model, roberta_model, dropout_rate=config["dropout_rate"]).to(device)
    optimizer = optim.Adam(model.parameters(), lr=config["learning_rate"])

    # Create dataloader
    train_dataloader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True)

    # Train and evaluate
    val_f1 = train_and_evaluate(model, train_dataloader, val_dataloader, optimizer, num_epochs=5)

    tune.report(val_f1=val_f1)

# Define search space
config = {
    "learning_rate": tune.loguniform(1e-5, 1e-4),
    "batch_size": tune.choice([16, 32, 64]),
    "dropout_rate": tune.uniform(0.1, 0.5)
}

# Run optimization
analysis = tune.run(
    trainable,
    config=config,
    num_samples=20,  # Number of trials
    scheduler=ASHAScheduler(metric="val_f1", mode="max"),
    resources_per_trial={"cpu": 2, "gpu": 1}
)

# Best hyperparameters
print("Best hyperparameters:", analysis.best_config)

2025-02-28 16:41:22,345	INFO worker.py:1841 -- Started a local Ray instance.
2025-02-28 16:41:23,498	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.


TuneError: The Trainable/training function is too large for grpc resource limit. Check that its definition is not implicitly capturing a large array or other object in scope. Tip: use tune.with_parameters() to put large objects in the Ray object store. 
Original exception: Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/ray/tune/experiment/experiment.py", line 149, in __init__
    self._run_identifier = Experiment.register_if_needed(run)
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ray/tune/experiment/experiment.py", line 351, in register_if_needed
    register_trainable(name, run_object)
  File "/usr/local/lib/python3.11/dist-packages/ray/tune/registry.py", line 117, in register_trainable
    _global_registry.register(TRAINABLE_CLASS, name, trainable)
  File "/usr/local/lib/python3.11/dist-packages/ray/tune/registry.py", line 244, in register
    self.flush_values()
  File "/usr/local/lib/python3.11/dist-packages/ray/tune/registry.py", line 282, in flush_values
    _internal_kv_put(
  File "/usr/local/lib/python3.11/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/ray/experimental/internal_kv.py", line 94, in _internal_kv_put
    return global_gcs_client.internal_kv_put(key, value, overwrite, namespace) == 0
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "python/ray/includes/gcs_client.pxi", line 129, in ray._raylet.InnerGcsClient.internal_kv_put
  File "python/ray/includes/gcs_client.pxi", line 650, in ray._raylet.raise_or_return
  File "python/ray/includes/gcs_client.pxi", line 802, in ray._raylet.convert_optional_bool
  File "python/ray/includes/common.pxi", line 120, in ray._raylet.check_status_timeout_as_rpc_error
  File "python/ray/includes/common.pxi", line 95, in ray._raylet.check_status
ray.exceptions.RpcError: RPC Error message: Sent message larger than max (1317807824 vs. 536870912); RPC Error details: 


In [90]:
def evaluate_model(model, dataloader, device):
    model.eval()
    total_correct = 0
    total_samples = 0
    with torch.no_grad():
        for image_inputs, text_inputs, labels in dataloader:
            image_inputs = {k: v.to(device) for k, v in image_inputs.items()}
            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
            labels = labels.to(device).float()

            if image_inputs["pixel_values"].ndim == 5:
                image_inputs["pixel_values"] = image_inputs["pixel_values"].squeeze(1)

            outputs = model(image_inputs, text_inputs).squeeze()
            predictions = (outputs > 0.5).float()  # Threshold for binary classification

            total_correct += (predictions == labels).sum().item()
            total_samples += labels.size(0)
    return total_correct / total_samples

# Assume you have a separate dataloader for validation
val_accuracy = evaluate_model(model, val_dataloader, device)
print(f"Validation Accuracy: {val_accuracy:.4f}")


NameError: name 'val_dataloader' is not defined

In [91]:
torch.save(model.state_dict(), "toxic_meme_classifier.pth")
