In [1]:
import os

import torch
from PIL import Image
from torch.utils.data import Dataset
from transformers import BertTokenizer, ViTFeatureExtractor

class MultimodalDataset(Dataset):
    """Map-style dataset pairing a text field with an image file and a label.

    Each item is built from one row of ``dataframe``: the text is tokenized
    with ``tokenizer`` and the image stored at the row's path is opened with
    PIL, converted to RGB and passed through ``feature_extractor``.

    Args:
        dataframe: Table with one sample per row. Rows are addressed
            positionally through ``.iloc``.
        text_column: Name of the column holding the raw text.
        image_column: Name of the column holding the image file path.
        label_column: Name of the column holding the class label.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors="pt",
            ...)`` whose result provides ``"input_ids"`` and
            ``"attention_mask"``.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=image, return_tensors="pt")`` whose
            result provides ``"pixel_values"``.
        max_length: Length every text is padded or truncated to.
    """

    def __init__(self, dataframe, text_column, image_column, label_column,
                 tokenizer, feature_extractor, max_length=128):
        self.data = dataframe
        self.text_column = text_column
        self.image_column = image_column
        self.label_column = label_column
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows (samples) in the underlying table."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the processed sample at positional index ``idx``.

        Returns:
            A dict with the keys ``"input_ids"``, ``"attention_mask"``,
            ``"pixel_values"`` (each with the leading size-1 batch dimension
            removed) and ``"label"`` (a ``torch.long`` tensor).
        """
        # Fetch the row once instead of re-indexing the table per column.
        row = self.data.iloc[idx]
        text = row[self.text_column]
        image_path = row[self.image_column]
        label = row[self.label_column]

        # Tokenize text
        text_inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )

        # Preprocess image. ``convert`` returns an in-memory copy, so the
        # file handle can be released as soon as the ``with`` block exits
        # rather than lingering until garbage collection.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        image_inputs = self.feature_extractor(images=image, return_tensors="pt")

        # Drop only the leading batch dimension added by return_tensors="pt";
        # a bare squeeze() would also collapse any other size-1 dimension.
        return {
            "input_ids": text_inputs["input_ids"].squeeze(0),
            "attention_mask": text_inputs["attention_mask"].squeeze(0),
            "pixel_values": image_inputs["pixel_values"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }


In [2]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import torch.nn.functional as F
import torch.optim as optim
import torch

# Split data into train and validation
# NOTE(review): `your_dataframe` is a placeholder that is not defined in this
# file; it must provide the "text", "image_path" and "label" columns used below.
train_df, val_df = train_test_split(your_dataframe, test_size=0.2)

# Load tokenizer and feature extractor
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")

# Create datasets and dataloaders
train_dataset = MultimodalDataset(train_df, "text", "image_path", "label", tokenizer, feature_extractor)
val_dataset = MultimodalDataset(val_df, "text", "image_path", "label", tokenizer, feature_extractor)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Define model, loss, and optimizer
# NOTE(review): `MultimodalTransformer` is not defined in this file; confirm
# it is defined before this cell runs.
model = MultimodalTransformer()
# No `nn` alias is imported in this file, so the loss is reached through the
# imported `torch` package instead of the unbound name `nn`.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=5):
    """Train ``model`` and print train/validation metrics after every epoch.

    Args:
        model: Module called with the keyword arguments ``input_ids``,
            ``attention_mask`` and ``pixel_values``; it must return logits
            whose class dimension is ``dim=1``.
        train_loader: Sized iterable of batches used for optimisation. Each
            batch is a mapping with the keys ``"input_ids"``,
            ``"attention_mask"``, ``"pixel_values"`` and ``"label"``.
        val_loader: Sized iterable of batches with the same keys, used only
            for evaluation.
        criterion: Loss callable invoked as ``criterion(logits, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. Per-epoch metrics are printed. An empty loader yields ``nan``
        for its averages instead of raising ``ZeroDivisionError``.
    """
    for epoch in range(epochs):
        # Re-enter training mode at the start of every epoch: the validation
        # pass below switches the model to eval mode, so without this only
        # the first epoch would actually train in train mode.
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()

            # Forward pass
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                pixel_values=batch["pixel_values"],
            )
            loss = criterion(logits, batch["label"])
            train_loss += loss.item()

            # Backward pass
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0
        with torch.no_grad():
            for batch in val_loader:
                logits = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    pixel_values=batch["pixel_values"],
                )
                val_loss += criterion(logits, batch["label"]).item()

                # Softmax is monotonic, so the argmax of the raw logits
                # already selects the predicted class.
                preds = logits.argmax(dim=1)
                correct += (preds == batch["label"]).sum().item()
                total += batch["label"].size(0)

        # Guard the averages so an empty loader reports nan instead of
        # crashing with a division by zero.
        nan = float("nan")
        num_train_batches = len(train_loader)
        num_val_batches = len(val_loader)
        avg_train_loss = train_loss / num_train_batches if num_train_batches else nan
        avg_val_loss = val_loss / num_val_batches if num_val_batches else nan
        val_accuracy = correct / total if total else nan

        # Print epoch results
        print(f"Epoch {epoch+1}/{epochs}")
        print(f"Train Loss: {avg_train_loss}")
        print(f"Validation Loss: {avg_val_loss}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")

# Run five epochs of training using the model, loaders, loss and optimizer
# configured above.
train_model(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=5,
)


ModuleNotFoundError: No module named 'sklearn'