# ENEL 645 Assignment 2
Group 11 Team Members: Steven Au, Laurel Flanagan, Rhys Wickens, Austen Zhang

## Image Classification Transfer Learning
Pre-trained Model: EfficientNetV2-S (https://pytorch.org/vision/main/models/generated/torchvision.models.efficientnet_v2_s.html#torchvision.models.efficientnet_v2_s)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset
import os
import re
import numpy as np
from transformers import DistilBertModel, DistilBertTokenizer
import wandb
from sklearn.metrics import confusion_matrix

# Dataset locations: one root folder with a sub-folder per split
data_dir = "C:/Users/Auste/Documents/ENEL645_GarbageData/"
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, "CVPR_2024_dataset_Train"),
    os.path.join(data_dir, "CVPR_2024_dataset_Val"),
    os.path.join(data_dir, "CVPR_2024_dataset_Test"),
)

# Weights & Biases run setup
def initialize_wandb():
    """Start the Weights & Biases run for this experiment if none is active."""
    if wandb.run is not None:
        # A run is already active in this process; do not start another one.
        return

    run_settings = dict(
        entity="shcau-university-of-calgary-in-alberta",
        project="transfer_learning_garbage",
        name="Multimodal_Model_RTX4060_R3",
        tags=["distilBERT", "efficientnet", "CVPR_2024_dataset"],
        notes="Multimodal classification model using distilBERT and efficientnet.",
        config={"epochs": 5, "batch_size": 128, "dataset": "CVPR_2024_dataset"},
        job_type="train",
        resume="allow",
    )
    wandb.init(**run_settings)


initialize_wandb()

# Image preprocessing
# The preset shipped with the IMAGENET1K_V1 weights resizes to 384 (bilinear), centre-crops
# to 384, rescales to [0.0, 1.0] and normalises with mean=[0.485, 0.456, 0.406] and
# std=[0.229, 0.224, 0.225].
efficientnet_weights = models.EfficientNet_V2_S_Weights.IMAGENET1K_V1

transform = {
    # Training additionally applies a random horizontal flip as data augmentation.
    "train": transforms.Compose([
        efficientnet_weights.transforms(),
        transforms.RandomHorizontalFlip(),
    ]),
    "val": efficientnet_weights.transforms(),
    "test": efficientnet_weights.transforms(),
}

# One ImageFolder per split, each using the transform registered for that split
image_datasets = {
    split: datasets.ImageFolder(split_dir, transform=transform[split])
    for split, split_dir in (("train", train_dir), ("val", val_dir), ("test", test_dir))
}


# Text Classification

# Extract text from file names as well as labels
def read_text_files_with_labels(path):
    """Build text descriptions and integer labels from a class-per-folder dataset.

    Every sub-directory of ``path`` is one class. Each file directly inside a class
    directory yields one sample whose text is the file name without extension,
    with underscores replaced by spaces and all digit runs removed.

    Args:
        path: Directory containing one sub-directory per class.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length. Labels are the
        index of the class directory in sorted order.
    """
    # Only directories are classes. Indexing every entry (including stray files)
    # would shift the class indices away from the directory-only numbering.
    class_folders = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    texts = []
    labels = []
    for label, class_name in enumerate(class_folders):
        class_path = os.path.join(path, class_name)
        # os.listdir() returns entries in arbitrary order; sorting makes the sample
        # order deterministic. torchvision's ImageFolder also enumerates classes and
        # files in sorted order, which matters because samples are paired by index.
        for file_name in sorted(os.listdir(class_path)):
            if not os.path.isfile(os.path.join(class_path, file_name)):
                continue
            stem, _ = os.path.splitext(file_name)
            texts.append(re.sub(r'\d+', '', stem.replace('_', ' ')))
            labels.append(label)

    return np.array(texts), np.array(labels)

class CustomTextDataset(Dataset):
    """Dataset that tokenizes one text description per item.

    Args:
        texts: Indexable collection of text descriptions.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Tokenizer exposing ``encode_plus`` (here a DistilBERT tokenizer).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label of item ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch dimension; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            # Marks real tokens vs. padding so a model can ignore the padded positions
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Prepare text data

# File-name text and labels for the train, validation and test splits
(text_train, labels_train), (text_val, labels_val), (text_test, labels_test) = [
    read_text_files_with_labels(split_dir) for split_dir in (train_dir, val_dir, test_dir)
]

# Sequence length used by the tokenizer, and number of training epochs
max_len = 24
EPOCHS = 5

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


class MultimodalDataset(Dataset):
    """Pairs item ``idx`` of an image dataset with item ``idx`` of a text dataset.

    The two datasets must enumerate the same samples in the same order; pairing is
    purely positional.

    Args:
        image_dataset: Indexable dataset yielding ``(image, label)`` tuples.
        text_dataset: Indexable dataset yielding dicts with at least ``"input_ids"``.
    """

    def __init__(self, image_dataset, text_dataset):
        self.image_dataset = image_dataset
        self.text_dataset = text_dataset

    def __len__(self):
        return min(len(self.image_dataset), len(self.text_dataset))

    def __getitem__(self, idx):
        """Return the image, token ids and label of sample ``idx``.

        Raises:
            ValueError: If the text item carries a label that disagrees with the
                image label, which means the two datasets are not aligned.
        """
        image, label = self.image_dataset[idx]
        text_data = self.text_dataset[idx]

        # Positional pairing is only valid when both sides describe the same sample;
        # a label disagreement proves they do not, so fail loudly instead of training
        # on mismatched image/text pairs.
        text_label = text_data.get("label")
        if text_label is not None and int(text_label) != int(label):
            raise ValueError(
                f"Image/text datasets are misaligned at index {idx}: "
                f"image label {int(label)} != text label {int(text_label)}"
            )

        sample = {
            "image": image,
            "input_ids": text_data["input_ids"],
            "label": label
        }
        # Forward the attention mask when the text dataset provides one
        if "attention_mask" in text_data:
            sample["attention_mask"] = text_data["attention_mask"]
        return sample
    

class MultimodalClassifier(nn.Module):
    """Classifier that fuses EfficientNetV2-S image features with DistilBERT text features.

    Both branches are projected to 256 dimensions and layer-normalised, then
    concatenated, passed through a linear fusion layer and classified by a linear head.

    Args:
        num_classes: Number of output classes (size of the returned logit vector).
    """

    def __init__(self, num_classes):
        super().__init__()

        # EfficientNet (Image)
        self.image_model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)

        # Freeze feature layers
        for param in self.image_model.features.parameters():
            param.requires_grad = False

        # Read the feature width before the original classifier is discarded
        num_ftrs = self.image_model.classifier[1].in_features

        # Remove EfficientNet classifier
        self.image_model.classifier = nn.Identity()

        # Project features to 256 nodes
        self.image_fc = nn.Linear(num_ftrs, 256)

        # DistilBERT (Text)
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 256)

        # Normalization layers
        self.text_norms = nn.LayerNorm(256)
        self.image_norm = nn.LayerNorm(256)

        # Feature fusion Layer (Concatenation): 256 text + 256 image features
        self.fusion_fc = nn.Linear(512, self.text_model.config.hidden_size)

        # Classification Layer
        self.classifier = nn.Linear(self.text_model.config.hidden_size, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, image_inputs, attention_mask=None):
        """Compute class logits for a batch of (text, image) pairs.

        Args:
            input_ids: Token ids for the text branch.
            image_inputs: Preprocessed images for the image branch.
            attention_mask: Optional mask marking real tokens (1) vs. padding (0).
                When omitted it is derived from ``input_ids`` using the text model's
                configured pad token id.

        Returns:
            Logits with ``num_classes`` entries per sample.
        """
        if attention_mask is None:
            # Sequences are padded to a fixed length; without a mask the encoder
            # would attend to the padding tokens as if they were text.
            pad_token_id = self.text_model.config.pad_token_id
            if pad_token_id is not None:
                attention_mask = (input_ids != pad_token_id).long()

        # Extract features
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_norms(self.text_fc(text_output.last_hidden_state[:, 0, :]))  # Use CLS token
        image_features = self.image_norm(self.image_fc(self.image_model(image_inputs)))

        # Concatenate text and image features
        combined_features = torch.cat((text_features, image_features), dim=1)

        combined_features = self.fusion_fc(combined_features)
        output = self.classifier(self.dropout(combined_features))

        return output
    
# Data Loaders
BATCH_SIZE = 128

# Pair each split's images with the tokenized file-name text of the same split
_train_pairs = MultimodalDataset(
    image_datasets["train"],
    CustomTextDataset(text_train, labels_train, tokenizer, max_len),
)
_val_pairs = MultimodalDataset(
    image_datasets["val"],
    CustomTextDataset(text_val, labels_val, tokenizer, max_len),
)
_test_pairs = MultimodalDataset(
    image_datasets["test"],
    CustomTextDataset(text_test, labels_test, tokenizer, max_len),
)

# Only the training split is shuffled
train_loader = DataLoader(_train_pairs, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(_val_pairs, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(_test_pairs, batch_size=BATCH_SIZE, shuffle=False)

# Evaluation Function
def evaluate_model(model, dataloader, device):
    """Compute the mean cross-entropy loss and the accuracy of ``model`` on ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, images)`` that returns class logits.
        dataloader: Iterable of batches, each a dict with ``"image"``, ``"input_ids"``
            and ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(loss, accuracy)`` tuple, both averaged over samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    # Sum per-sample losses and divide by the sample count at the end. Averaging the
    # per-batch means instead would over-weight a smaller final batch.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    loss_sum = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, images)

            loss_sum += criterion(outputs, labels).item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        raise ValueError("evaluate_model received a dataloader with no samples")

    return loss_sum / total, correct / total
        
# Model Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalClassifier(num_classes=4).to(device)

# Every trainable sub-module must appear in a parameter group, otherwise Adam never
# updates it and the layer stays at its random initialisation. The pre-trained
# DistilBERT encoder gets a smaller learning rate than the newly added layers.
optimizer = optim.Adam([
    {'params': model.text_model.parameters(), 'lr': 0.0001},
    {'params': model.image_fc.parameters(), 'lr': 0.001},
    {'params': model.text_fc.parameters(), 'lr': 0.001},
    {'params': model.text_norms.parameters(), 'lr': 0.001},
    {'params': model.image_norm.parameters(), 'lr': 0.001},
    {'params': model.fusion_fc.parameters(), 'lr': 0.001},
    {'params': model.classifier.parameters(), 'lr': 0.001}
])
criterion = nn.CrossEntropyLoss()

wandb.watch(model, log="all")
best_val_loss = float("inf")

# Training
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        images = batch["image"].to(device)
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Report the mean loss per batch instead of the running sum, so the value does
    # not scale with the number of batches.
    avg_train_loss = total_train_loss / max(len(train_loader), 1)

    val_loss, val_acc = evaluate_model(model, val_loader, device)

    # Keep only the checkpoint with the lowest validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_multimodal_model.pth")

    wandb.log({"epoch": epoch+1, "train_loss": avg_train_loss, "val_loss": val_loss, "val_accuracy": val_acc})
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# Load Best Model for Testing
# map_location lets the checkpoint load on whichever device is in use now;
# weights_only restricts unpickling to tensors, which is all a state_dict holds.
model.load_state_dict(torch.load("best_multimodal_model.pth", map_location=device, weights_only=True))
test_loss, test_acc = evaluate_model(model, test_loader, device)
wandb.log({"test_accuracy": test_acc})
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
wandb.finish()

Downloading: "https://download.pytorch.org/models/efficientnet_v2_s-dd5fe13b.pth" to C:\Users\rhysw/.cache\torch\hub\checkpoints\efficientnet_v2_s-dd5fe13b.pth
100%|██████████| 82.7M/82.7M [00:06<00:00, 14.0MB/s]


Epoch 1/20
train Loss: 1.3184 Acc: 0.3830
val Loss: 1.2754 Acc: 0.4545
Epoch 2/20
train Loss: 1.1786 Acc: 0.4731
val Loss: 1.1917 Acc: 0.6477
Epoch 3/20
train Loss: 1.0860 Acc: 0.5754
val Loss: 1.1136 Acc: 0.6477
Epoch 4/20
train Loss: 1.0211 Acc: 0.6603
val Loss: 1.0411 Acc: 0.6818
Epoch 5/20
train Loss: 0.9580 Acc: 0.6620
val Loss: 0.9873 Acc: 0.6932
Epoch 6/20
train Loss: 0.9145 Acc: 0.6898
val Loss: 0.9621 Acc: 0.6591
Epoch 7/20
train Loss: 0.8503 Acc: 0.6967
val Loss: 0.9268 Acc: 0.6705
Epoch 8/20
train Loss: 0.8257 Acc: 0.7227
val Loss: 0.9133 Acc: 0.6705
Epoch 9/20
train Loss: 0.8101 Acc: 0.7175
val Loss: 0.8939 Acc: 0.6932
Epoch 10/20
train Loss: 0.7831 Acc: 0.7383
val Loss: 0.8749 Acc: 0.6932
Epoch 11/20
train Loss: 0.7583 Acc: 0.7383
val Loss: 0.8663 Acc: 0.7045
Epoch 12/20
train Loss: 0.7296 Acc: 0.7435
val Loss: 0.8557 Acc: 0.6932
Epoch 13/20
train Loss: 0.7075 Acc: 0.7591
val Loss: 0.8490 Acc: 0.6932
Epoch 14/20
train Loss: 0.6759 Acc: 0.7626
val Loss: 0.8472 Acc: 0.6932
E

  model.load_state_dict(torch.load("best_model.pth"))


Test Accuracy: 56.80%


## Text Classification Transfer Learning
Pre-trained model: DistilBERT Classifier

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizer
import matplotlib.pyplot as plt
import numpy as np
import os
import re
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [6]:
# Define functions

# Extract text from file names as well as labels
def read_text_files_with_labels(path):
    """Build text descriptions and integer labels from a class-per-folder dataset.

    Every sub-directory of ``path`` is one class. Each file directly inside a class
    directory yields one sample whose text is the file name without extension,
    with underscores replaced by spaces and all digit runs removed.

    Args:
        path: Directory containing one sub-directory per class.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length. Labels are the
        index of the class directory in sorted order.
    """
    # Only directories are classes. Indexing every entry (including stray files)
    # would shift the class indices.
    class_folders = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    texts = []
    labels = []
    for label, class_name in enumerate(class_folders):
        class_path = os.path.join(path, class_name)
        # os.listdir() returns entries in arbitrary order; sort for a deterministic
        # sample order across runs and machines.
        for file_name in sorted(os.listdir(class_path)):
            if not os.path.isfile(os.path.join(class_path, file_name)):
                continue
            stem, _ = os.path.splitext(file_name)
            texts.append(re.sub(r'\d+', '', stem.replace('_', ' ')))
            labels.append(label)

    return np.array(texts), np.array(labels)

# Define your dataset class
class CustomDataset(Dataset):
    """Dataset that tokenizes one text description per item.

    Items are dicts holding the raw text, the token ids, the attention mask and
    the label as a ``torch.long`` tensor.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sample_text = str(self.texts[idx])
        sample_label = self.labels[idx]

        # Pad or truncate every sequence to max_len and return PyTorch tensors
        tokenizer_options = dict(
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        encoded = self.tokenizer.encode_plus(sample_text, **tokenizer_options)

        # flatten() removes the leading batch dimension added by return_tensors='pt'
        token_ids = encoded['input_ids'].flatten()
        mask = encoded['attention_mask'].flatten()

        return {
            'text': sample_text,
            'input_ids': token_ids,
            'attention_mask': mask,
            'label': torch.tensor(sample_label, dtype=torch.long)
        }

# Define the model
class DistilBERTClassifier(nn.Module):
    """DistilBERT encoder followed by dropout and a linear classification layer."""

    def __init__(self, num_classes):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.drop = nn.Dropout(0.3)
        self.out = nn.Linear(self.distilbert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return class logits computed from the first token's hidden state."""
        encoder_outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = encoder_outputs[0]
        # Position 0 of every sequence is used as the sequence representation
        first_token_state = hidden_states[:, 0]
        return self.out(self.drop(first_token_state))

# Define training function
def train(model, iterator, optimizer, criterion, device):
    """Run one training pass over ``iterator`` and return the mean loss per batch.

    Each batch is a dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``
    tensors, which are moved to ``device`` before the forward pass.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        # Standard step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    n_batches = len(iterator)
    return running_loss / n_batches

# Define evaluation function
def evaluate(model, iterator, criterion, device):
    """Return the mean loss per batch of ``model`` over ``iterator`` without updating it.

    Each batch is a dict with ``'input_ids'``, ``'attention_mask'`` and ``'label'``
    tensors, which are moved to ``device`` before the forward pass.
    """
    model.eval()  # evaluation mode (this is what switches dropout off)
    running_loss = 0
    with torch.no_grad():  # forward passes only, so no gradients are tracked
        for batch in iterator:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            running_loss += criterion(model(ids, mask), targets).item()

    n_batches = len(iterator)
    return running_loss / n_batches

# Define prediction function
def predict(model, dataloader, device):
    """Return the predicted class index for every sample in ``dataloader``.

    Each batch is a dict with ``'input_ids'`` and ``'attention_mask'`` tensors.
    Predictions are collected in iteration order as a flat list.
    """
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            logits = model(ids, mask)

            # The index of the largest logit in each row is the predicted class
            best_class = torch.max(logits, dim=1).indices

            # Bring the result back to the CPU before adding it to the list
            predictions.extend(best_class.cpu().numpy())

    return predictions

In [7]:
# Use the paths for the data from the image classification

# File-name text and labels for the train, validation and test splits
(text_train, labels_train), (text_val, labels_val), (text_test, labels_test) = [
    read_text_files_with_labels(split_dir) for split_dir in (train_dir, val_dir, test_dir)
]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Dataset
import os
import re
import numpy as np
from transformers import DistilBertModel, DistilBertTokenizer
import wandb

# Dataset locations: one root folder with a sub-folder per split
data_dir = "C:/Users/Auste/Documents/ENEL645_GarbageData/"
train_dir, val_dir, test_dir = (
    os.path.join(data_dir, "CVPR_2024_dataset_Train"),
    os.path.join(data_dir, "CVPR_2024_dataset_Val"),
    os.path.join(data_dir, "CVPR_2024_dataset_Test"),
)

# Weights & Biases run setup
def initialize_wandb():
    """Start the Weights & Biases run for this experiment if none is active."""
    if wandb.run is not None:
        # A run is already active in this process; do not start another one.
        return

    run_settings = dict(
        entity="shcau-university-of-calgary-in-alberta",
        project="transfer_learning_garbage",
        name="Multimodal_Model_RTX4060_R3",
        tags=["distilBERT", "efficientnet", "CVPR_2024_dataset"],
        notes="Multimodal classification model using distilBERT and efficientnet.",
        config={"epochs": 5, "batch_size": 128, "dataset": "CVPR_2024_dataset"},
        job_type="train",
        resume="allow",
    )
    wandb.init(**run_settings)


initialize_wandb()

# Image preprocessing
# The preset shipped with the IMAGENET1K_V1 weights resizes to 384 (bilinear), centre-crops
# to 384, rescales to [0.0, 1.0] and normalises with mean=[0.485, 0.456, 0.406] and
# std=[0.229, 0.224, 0.225].
efficientnet_weights = models.EfficientNet_V2_S_Weights.IMAGENET1K_V1

transform = {
    # Training additionally applies a random horizontal flip as data augmentation.
    "train": transforms.Compose([
        efficientnet_weights.transforms(),
        transforms.RandomHorizontalFlip(),
    ]),
    "val": efficientnet_weights.transforms(),
    "test": efficientnet_weights.transforms(),
}

# One ImageFolder per split, each using the transform registered for that split
image_datasets = {
    split: datasets.ImageFolder(split_dir, transform=transform[split])
    for split, split_dir in (("train", train_dir), ("val", val_dir), ("test", test_dir))
}


# Text Classification

# Extract text from file names as well as labels
def read_text_files_with_labels(path):
    """Build text descriptions and integer labels from a class-per-folder dataset.

    Every sub-directory of ``path`` is one class. Each file directly inside a class
    directory yields one sample whose text is the file name without extension,
    with underscores replaced by spaces and all digit runs removed.

    Args:
        path: Directory containing one sub-directory per class.

    Returns:
        A ``(texts, labels)`` tuple of NumPy arrays of equal length. Labels are the
        index of the class directory in sorted order.
    """
    # Only directories are classes. Indexing every entry (including stray files)
    # would shift the class indices away from the directory-only numbering.
    class_folders = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )

    texts = []
    labels = []
    for label, class_name in enumerate(class_folders):
        class_path = os.path.join(path, class_name)
        # os.listdir() returns entries in arbitrary order; sorting makes the sample
        # order deterministic. torchvision's ImageFolder also enumerates classes and
        # files in sorted order, which matters because samples are paired by index.
        for file_name in sorted(os.listdir(class_path)):
            if not os.path.isfile(os.path.join(class_path, file_name)):
                continue
            stem, _ = os.path.splitext(file_name)
            texts.append(re.sub(r'\d+', '', stem.replace('_', ' ')))
            labels.append(label)

    return np.array(texts), np.array(labels)

class CustomTextDataset(Dataset):
    """Dataset that tokenizes one text description per item.

    Args:
        texts: Indexable collection of text descriptions.
        labels: Indexable collection of integer class labels, aligned with ``texts``.
        tokenizer: Tokenizer exposing ``encode_plus`` (here a DistilBERT tokenizer).
        max_len: Length every sequence is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, token ids, attention mask and label of item ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'text': text,
            # return_tensors='pt' adds a leading batch dimension; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            # Marks real tokens vs. padding so a model can ignore the padded positions
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


# Prepare text data

# File-name text and labels for the train, validation and test splits
(text_train, labels_train), (text_val, labels_val), (text_test, labels_test) = [
    read_text_files_with_labels(split_dir) for split_dir in (train_dir, val_dir, test_dir)
]

# Sequence length used by the tokenizer, and number of training epochs
max_len = 24
EPOCHS = 5

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


class MultimodalDataset(Dataset):
    """Pairs item ``idx`` of an image dataset with item ``idx`` of a text dataset.

    The two datasets must enumerate the same samples in the same order; pairing is
    purely positional.

    Args:
        image_dataset: Indexable dataset yielding ``(image, label)`` tuples.
        text_dataset: Indexable dataset yielding dicts with at least ``"input_ids"``.
    """

    def __init__(self, image_dataset, text_dataset):
        self.image_dataset = image_dataset
        self.text_dataset = text_dataset

    def __len__(self):
        return min(len(self.image_dataset), len(self.text_dataset))

    def __getitem__(self, idx):
        """Return the image, token ids and label of sample ``idx``.

        Raises:
            ValueError: If the text item carries a label that disagrees with the
                image label, which means the two datasets are not aligned.
        """
        image, label = self.image_dataset[idx]
        text_data = self.text_dataset[idx]

        # Positional pairing is only valid when both sides describe the same sample;
        # a label disagreement proves they do not, so fail loudly instead of training
        # on mismatched image/text pairs.
        text_label = text_data.get("label")
        if text_label is not None and int(text_label) != int(label):
            raise ValueError(
                f"Image/text datasets are misaligned at index {idx}: "
                f"image label {int(label)} != text label {int(text_label)}"
            )

        sample = {
            "image": image,
            "input_ids": text_data["input_ids"],
            "label": label
        }
        # Forward the attention mask when the text dataset provides one
        if "attention_mask" in text_data:
            sample["attention_mask"] = text_data["attention_mask"]
        return sample
    

class MultimodalClassifier(nn.Module):
    """Classifier that fuses EfficientNetV2-S image features with DistilBERT text features.

    Both branches are projected to 256 dimensions and layer-normalised, then
    concatenated, passed through a linear fusion layer and classified by a linear head.

    Args:
        num_classes: Number of output classes (size of the returned logit vector).
    """

    def __init__(self, num_classes):
        super().__init__()

        # EfficientNet (Image)
        self.image_model = models.efficientnet_v2_s(weights=models.EfficientNet_V2_S_Weights.IMAGENET1K_V1)

        # Freeze feature layers
        for param in self.image_model.features.parameters():
            param.requires_grad = False

        # Read the feature width before the original classifier is discarded
        num_ftrs = self.image_model.classifier[1].in_features

        # Remove EfficientNet classifier
        self.image_model.classifier = nn.Identity()

        # Project features to 256 nodes
        self.image_fc = nn.Linear(num_ftrs, 256)

        # DistilBERT (Text)
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.text_fc = nn.Linear(self.text_model.config.hidden_size, 256)

        # Normalization layers
        self.text_norms = nn.LayerNorm(256)
        self.image_norm = nn.LayerNorm(256)

        # Feature fusion Layer (Concatenation): 256 text + 256 image features
        self.fusion_fc = nn.Linear(512, self.text_model.config.hidden_size)

        # Classification Layer
        self.classifier = nn.Linear(self.text_model.config.hidden_size, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, image_inputs, attention_mask=None):
        """Compute class logits for a batch of (text, image) pairs.

        Args:
            input_ids: Token ids for the text branch.
            image_inputs: Preprocessed images for the image branch.
            attention_mask: Optional mask marking real tokens (1) vs. padding (0).
                When omitted it is derived from ``input_ids`` using the text model's
                configured pad token id.

        Returns:
            Logits with ``num_classes`` entries per sample.
        """
        if attention_mask is None:
            # Sequences are padded to a fixed length; without a mask the encoder
            # would attend to the padding tokens as if they were text.
            pad_token_id = self.text_model.config.pad_token_id
            if pad_token_id is not None:
                attention_mask = (input_ids != pad_token_id).long()

        # Extract features
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_norms(self.text_fc(text_output.last_hidden_state[:, 0, :]))  # Use CLS token
        image_features = self.image_norm(self.image_fc(self.image_model(image_inputs)))

        # Concatenate text and image features
        combined_features = torch.cat((text_features, image_features), dim=1)

        combined_features = self.fusion_fc(combined_features)
        output = self.classifier(self.dropout(combined_features))

        return output
    
# Data Loaders
BATCH_SIZE = 128

# Pair each split's images with the tokenized file-name text of the same split
_train_pairs = MultimodalDataset(
    image_datasets["train"],
    CustomTextDataset(text_train, labels_train, tokenizer, max_len),
)
_val_pairs = MultimodalDataset(
    image_datasets["val"],
    CustomTextDataset(text_val, labels_val, tokenizer, max_len),
)
_test_pairs = MultimodalDataset(
    image_datasets["test"],
    CustomTextDataset(text_test, labels_test, tokenizer, max_len),
)

# Only the training split is shuffled
train_loader = DataLoader(_train_pairs, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(_val_pairs, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(_test_pairs, batch_size=BATCH_SIZE, shuffle=False)

# Evaluation Function
def evaluate_model(model, dataloader, device):
    """Compute the mean cross-entropy loss and the accuracy of ``model`` on ``dataloader``.

    Args:
        model: Module called as ``model(input_ids, images)`` that returns class logits.
        dataloader: Iterable of batches, each a dict with ``"image"``, ``"input_ids"``
            and ``"label"`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A ``(loss, accuracy)`` tuple, both averaged over samples.

    Raises:
        ValueError: If ``dataloader`` yields no samples.
    """
    model.eval()
    # Sum per-sample losses and divide by the sample count at the end. Averaging the
    # per-batch means instead would over-weight a smaller final batch.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    loss_sum = 0.0
    correct, total = 0, 0

    with torch.no_grad():
        for batch in dataloader:
            images = batch["image"].to(device)
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, images)

            loss_sum += criterion(outputs, labels).item()
            correct += (outputs.argmax(1) == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        raise ValueError("evaluate_model received a dataloader with no samples")

    return loss_sum / total, correct / total
        
# Model Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalClassifier(num_classes=4).to(device)

# Every trainable sub-module must appear in a parameter group, otherwise Adam never
# updates it and the layer stays at its random initialisation. The pre-trained
# DistilBERT encoder gets a smaller learning rate than the newly added layers.
optimizer = optim.Adam([
    {'params': model.text_model.parameters(), 'lr': 0.0001},
    {'params': model.image_fc.parameters(), 'lr': 0.001},
    {'params': model.text_fc.parameters(), 'lr': 0.001},
    {'params': model.text_norms.parameters(), 'lr': 0.001},
    {'params': model.image_norm.parameters(), 'lr': 0.001},
    {'params': model.fusion_fc.parameters(), 'lr': 0.001},
    {'params': model.classifier.parameters(), 'lr': 0.001}
])
criterion = nn.CrossEntropyLoss()

wandb.watch(model, log="all")
best_val_loss = float("inf")

# Training
for epoch in range(EPOCHS):
    model.train()
    total_train_loss = 0
    for batch in train_loader:
        images = batch["image"].to(device)
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()

    # Report the mean loss per batch instead of the running sum, so the value does
    # not scale with the number of batches.
    avg_train_loss = total_train_loss / max(len(train_loader), 1)

    val_loss, val_acc = evaluate_model(model, val_loader, device)

    # Keep only the checkpoint with the lowest validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_multimodal_model.pth")

    wandb.log({"epoch": epoch+1, "train_loss": avg_train_loss, "val_loss": val_loss, "val_accuracy": val_acc})
    print(f"Epoch {epoch+1}/{EPOCHS}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Epoch: 1, Train Loss: 1.2118
Epoch: 1, Val Loss: 0.9150
Epoch: 2, Train Loss: 0.7579
Epoch: 2, Val Loss: 0.5738
Epoch: 3, Train Loss: 0.4443
Epoch: 3, Val Loss: 0.5192
Epoch: 4, Train Loss: 0.2504
Epoch: 4, Val Loss: 0.5777


In [None]:
# Load Best Model for Testing
# map_location lets the checkpoint load on whichever device is in use now;
# weights_only restricts unpickling to tensors, which is all a state_dict holds.
model.load_state_dict(torch.load("best_multimodal_model.pth", map_location=device, weights_only=True))
test_loss, test_acc = evaluate_model(model, test_loader, device)
wandb.log({"test_accuracy": test_acc})
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_acc:.4f}")
wandb.finish()

  text_model.load_state_dict(torch.load('best_model.pth'))


Accuracy:  0.7811
