In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
brset_embed = pd.read_csv('embeddings.csv')
brset_split = pd.read_csv('split.csv')

In [4]:
text_column_names = brset_embed.columns[brset_embed.columns.str.match('text_\d+')]
image_column_names = brset_embed.columns[brset_embed.columns.str.match('image_\d+')]
text_columns = brset_embed[text_column_names]
image_columns = brset_embed[image_column_names]

In [5]:
text_embed = torch.tensor(text_columns.values)
image_embed = torch.tensor(image_columns.values)
y = torch.tensor(brset_embed['DR_2'].values)

### Training Function

In [6]:
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs=10, verbose=True, scheduler=None):
    model.to(device)
    history = {'train_loss': [], 'val_loss': [], 'val_auc': [], 'val_accuracy': [], 'val_f1': []}
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for X, y in train_loader:
            X = X.to(device).float()
            y = y.to(device).float()
            optimizer.zero_grad()
            val_logits = model(X)
            loss = criterion(val_logits, y.unsqueeze(1))
            loss.backward()
            train_loss += loss.item()
            optimizer.step()
        train_loss /= len(train_loader)
        history['train_loss'].append(train_loss)

        model.eval()
        with torch.no_grad():
            val_logits = []
            val_labels = []
            val_loss = 0
            for X, y in val_loader:
                X = X.to(device).float()
                y = y.to(device).float()
                val_labels.extend(y.tolist())
                y_pred = model(X)
                val_logits.append(y_pred.cpu().numpy())
                loss = criterion(y_pred, y.unsqueeze(1))
                val_loss += loss.item()
            val_loss /= len(val_loader)
            history['val_loss'].append(val_loss)
            val_logits = np.concatenate(val_logits)
            val_preds = nn.Sigmoid()(torch.tensor(val_logits)).cpu().numpy()
            auc = roc_auc_score(val_labels, val_preds)
            history['val_auc'].append(auc)
            accuracy = accuracy_score(val_labels, val_preds > 0.5)
            history['val_accuracy'].append(accuracy)
            f1 = f1_score(val_labels, val_preds > 0.5)
            history['val_f1'].append(f1)
            if scheduler is not None:
                scheduler.step(val_loss)
                last_lr = scheduler.get_last_lr()[0]
            else:
                last_lr = optimizer.param_groups[0]['lr']
            if verbose:
                print(f'Epoch {epoch+1}/{num_epochs}, train loss: {train_loss:.4f}, val loss: {val_loss:.4f}, val auc: {auc:.4f}, val accuracy: {accuracy:.4f}, val f1: {f1:.4f}, LR: {last_lr}')
    return history


def get_probs(model, loader):
    model.eval()
    model.to(device)
    y_hat = torch.tensor([]).to(device)
    with torch.no_grad():
        for X,_ in loader:
            X = X.to(device).float()
            y_hat = torch.cat((y_hat, model(X)))
    return nn.Sigmoid()(y_hat).cpu().numpy().flatten()

def get_optimal_f1_threshold(y_true, y_pred):
    epsilon = 1e-10
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    f1 = 2 * precision * recall / (precision + recall + epsilon)
    return thresholds[np.argmax(f1)]

# Simple Dataset to support embeddings
class SimpleDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [7]:
# Copied from https://github.com/luisnakayama/BRSET/blob/main/src/FocalLoss.py
class BinaryFocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, reduction='mean'):
        super(BinaryFocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        bce_loss = nn.BCEWithLogitsLoss(reduction='none')(inputs, targets)
        p_t = torch.exp(-bce_loss)
        focal_loss = self.alpha * (1 - p_t) ** self.gamma * bce_loss

        if self.reduction == 'mean':
            return torch.mean(focal_loss)
        elif self.reduction == 'sum':
            return torch.sum(focal_loss)
        else:
            return focal_loss

### Image Only Model - Embedding data split

In [8]:
from sklearn.model_selection import train_test_split

train_embed_idx = brset_split[brset_split['embeddings_split'] == 'train'].index
test_embed_idx = brset_split[brset_split['embeddings_split'] == 'test'].index
train_img_emsplit = image_embed[train_embed_idx]
test_img_emsplit = image_embed[test_embed_idx]
train_y_emsplit = y[train_embed_idx]
test_y_emsplit = y[test_embed_idx]

train_img_emsplit, val_img_emsplit, train_y_emsplit, val_y_emsplit = train_test_split(train_img_emsplit, train_y_emsplit, 
                                                                                      test_size=len(test_embed_idx)/len(train_embed_idx),
                                                                                      random_state=42)

print(train_img_emsplit.shape, val_img_emsplit.shape, test_img_emsplit.shape)

torch.Size([9758, 1536]) torch.Size([3254, 1536]) torch.Size([3254, 1536])


In [9]:
image_emsplit_train_dataset = SimpleDataset(train_img_emsplit, train_y_emsplit)
image_emsplit_val_dataset = SimpleDataset(val_img_emsplit, val_y_emsplit)
image_emsplit_test_dataset = SimpleDataset(test_img_emsplit, test_y_emsplit)

image_emsplit_train_loader = DataLoader(image_emsplit_train_dataset, batch_size=32, shuffle=True)
image_emsplit_val_loader = DataLoader(image_emsplit_val_dataset, batch_size=32, shuffle=False)
image_emsplit_test_loader = DataLoader(image_emsplit_test_dataset, batch_size=32, shuffle=False)

In [10]:
# Train image only model
image_only_model_emsplit = nn.Sequential(
    nn.Linear(1536, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 1),
    # nn.Sigmoid()
)

In [11]:
p1 = sum(train_y_emsplit)/len(train_y_emsplit)
p0 = 1 - p1
pos_weight = torch.tensor(p0/p1).to(device)

# criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(image_only_model_emsplit.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(image_only_model_emsplit, image_emsplit_train_loader, image_emsplit_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

  pos_weight = torch.tensor(p0/p1).to(device)
  _torch_pytree._register_pytree_node(


Epoch 1/50, train loss: 0.0709, val loss: 0.0469, val auc: 0.9281, val accuracy: 0.9462, val f1: 0.5658, LR: 0.001
Epoch 2/50, train loss: 0.0389, val loss: 0.0370, val auc: 0.9311, val accuracy: 0.9474, val f1: 0.4771, LR: 0.001
Epoch 3/50, train loss: 0.0375, val loss: 0.0503, val auc: 0.9364, val accuracy: 0.9373, val f1: 0.2444, LR: 0.001
Epoch 4/50, train loss: 0.0363, val loss: 0.0455, val auc: 0.9411, val accuracy: 0.9462, val f1: 0.4224, LR: 0.001
Epoch 5/50, train loss: 0.0343, val loss: 0.0339, val auc: 0.9404, val accuracy: 0.9539, val f1: 0.6053, LR: 0.001
Epoch 6/50, train loss: 0.0336, val loss: 0.0443, val auc: 0.9445, val accuracy: 0.9468, val f1: 0.4328, LR: 0.001
Epoch 7/50, train loss: 0.0340, val loss: 0.0701, val auc: 0.9408, val accuracy: 0.9388, val f1: 0.2491, LR: 0.001
Epoch 8/50, train loss: 0.0340, val loss: 0.0376, val auc: 0.9467, val accuracy: 0.9505, val f1: 0.6596, LR: 0.001
Epoch 9/50, train loss: 0.0321, val loss: 0.0324, val auc: 0.9475, val accuracy:

In [12]:
# Evaluate image only model on test set
y_probs = get_probs(image_only_model_emsplit, image_emsplit_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
image_only_roc = roc_auc_score(test_y_emsplit.numpy(), y_probs)
image_only_accuracy = accuracy_score(test_y_emsplit.numpy(), y_preds)
image_only_f1 = f1_score(test_y_emsplit.numpy(), y_preds)
print(f'Image Only ROC: {image_only_roc}, Accuracy: {image_only_accuracy}, F1: {image_only_f1}')

threshold = get_optimal_f1_threshold(test_y_emsplit.numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
image_only_accuracy = accuracy_score(test_y_emsplit.numpy(), y_preds)
image_only_f1 = f1_score(test_y_emsplit.numpy(), y_preds)
print(f'Image Only Accuracy: {image_only_accuracy}, F1: {image_only_f1}')

np.save('probs/image_only_test_emsplit_probs.npy', y_probs)

Image Only ROC: 0.9480277164746064, Accuracy: 0.9668100799016595, F1: 0.7096774193548386
Image Only Accuracy: 0.9683466502765826, F1: 0.7146814404432132


### Image Only Model - Resplit Data

In [13]:
train_idx = brset_split[brset_split['split'] == 'train'].index
val_idx = brset_split[brset_split['split'] == 'val'].index
test_idx = brset_split[brset_split['split'] == 'test'].index

image_train = image_embed[train_idx]
image_val = image_embed[val_idx]
image_test = image_embed[test_idx]

y_train = y[train_idx]
y_val = y[val_idx]
y_test = y[test_idx]

# DataSet
image_train_dataset = SimpleDataset(image_train, y_train)
image_val_dataset = SimpleDataset(image_val, y_val)
image_test_dataset = SimpleDataset(image_test, y_test)

# DataLoader
image_train_loader = DataLoader(image_train_dataset, batch_size=32, shuffle=True)
image_val_loader = DataLoader(image_val_dataset, batch_size=32, shuffle=False)
image_test_loader = DataLoader(image_test_dataset, batch_size=32, shuffle=False)

In [14]:
image_only_model = nn.Sequential(
    nn.Linear(1536, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 1),
    # nn.Sigmoid()
)

In [15]:
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(image_only_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(image_only_model, image_train_loader, image_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

Epoch 1/50, train loss: 0.0595, val loss: 0.0378, val auc: 0.9081, val accuracy: 0.9521, val f1: 0.5465, LR: 0.001
Epoch 2/50, train loss: 0.0434, val loss: 0.0457, val auc: 0.9250, val accuracy: 0.9487, val f1: 0.6089, LR: 0.001
Epoch 3/50, train loss: 0.0402, val loss: 0.0355, val auc: 0.9277, val accuracy: 0.9530, val f1: 0.5049, LR: 0.001
Epoch 4/50, train loss: 0.0370, val loss: 0.0390, val auc: 0.9360, val accuracy: 0.9585, val f1: 0.5714, LR: 0.001
Epoch 5/50, train loss: 0.0368, val loss: 0.0396, val auc: 0.9278, val accuracy: 0.9555, val f1: 0.6348, LR: 0.001
Epoch 6/50, train loss: 0.0360, val loss: 0.0327, val auc: 0.9373, val accuracy: 0.9604, val f1: 0.6195, LR: 0.001
Epoch 7/50, train loss: 0.0338, val loss: 0.0349, val auc: 0.9433, val accuracy: 0.9542, val f1: 0.4880, LR: 0.001
Epoch 8/50, train loss: 0.0329, val loss: 0.0307, val auc: 0.9413, val accuracy: 0.9622, val f1: 0.6535, LR: 0.001
Epoch 9/50, train loss: 0.0335, val loss: 0.0319, val auc: 0.9316, val accuracy:

In [16]:
y_probs = get_probs(image_only_model, image_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
image_only_roc = roc_auc_score(y[test_idx].numpy(), y_probs)
image_only_accuracy = accuracy_score(y[test_idx].numpy(), y_preds)
image_only_f1 = f1_score(y[test_idx].numpy(), y_preds)
print(f'Image Only ROC: {image_only_roc}, Accuracy: {image_only_accuracy}, F1: {image_only_f1}')

threshold = get_optimal_f1_threshold(y[test_idx].numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
image_only_accuracy = accuracy_score(y[test_idx].numpy(), y_preds)
image_only_f1 = f1_score(y[test_idx].numpy(), y_preds)
print(f'Image Only Accuracy: {image_only_accuracy}, F1: {image_only_f1}')

np.save('probs/image_only_test_probs.npy', y_probs)

Image Only ROC: 0.9557650850470365, Accuracy: 0.9637257915770059, F1: 0.6775956284153005
Image Only Accuracy: 0.9600368890255149, F1: 0.7161572052401747


### Text Only Model

In [17]:
# split text_embed into train and test based on brset_embed['split']
text_train = text_embed[train_idx]
text_val = text_embed[val_idx]
text_test = text_embed[test_idx]


In [18]:
text_train_dataset = SimpleDataset(text_train, y_train)
text_val_dataset = SimpleDataset(text_val, y_val)
text_test_dataset = SimpleDataset(text_test, y_test)

text_train_loader = DataLoader(text_train_dataset, batch_size=32, shuffle=True)
text_val_loader = DataLoader(text_val_dataset, batch_size=32, shuffle=False)
text_test_loader = DataLoader(text_test_dataset, batch_size=32, shuffle=False)

In [19]:
text_only_model = nn.Sequential(
    nn.Linear(4096, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    # nn.BatchNorm1d(256),
    nn.Linear(256, 1),
    # nn.Sigmoid()
)

In [20]:
# criterion = nn.BCEWithLogitsLoss()
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(text_only_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(text_only_model, text_train_loader, text_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

Epoch 1/50, train loss: 0.0905, val loss: 0.0330, val auc: 0.9505, val accuracy: 0.9493, val f1: 0.3726, LR: 0.001
Epoch 2/50, train loss: 0.0331, val loss: 0.0293, val auc: 0.9565, val accuracy: 0.9705, val f1: 0.7513, LR: 0.001
Epoch 3/50, train loss: 0.0321, val loss: 0.0309, val auc: 0.9548, val accuracy: 0.9352, val f1: 0.0000, LR: 0.001
Epoch 4/50, train loss: 0.0356, val loss: 0.0321, val auc: 0.9523, val accuracy: 0.9564, val f1: 0.5449, LR: 0.001
Epoch 5/50, train loss: 0.0328, val loss: 0.0431, val auc: 0.9565, val accuracy: 0.9561, val f1: 0.4911, LR: 0.001
Epoch 6/50, train loss: 0.0335, val loss: 0.0287, val auc: 0.9561, val accuracy: 0.9613, val f1: 0.6379, LR: 0.001
Epoch 7/50, train loss: 0.0341, val loss: 0.0360, val auc: 0.9546, val accuracy: 0.9671, val f1: 0.7617, LR: 0.001
Epoch 8/50, train loss: 0.0341, val loss: 0.0294, val auc: 0.9573, val accuracy: 0.9721, val f1: 0.7378, LR: 0.001
Epoch 9/50, train loss: 0.0305, val loss: 0.0293, val auc: 0.9603, val accuracy:

In [21]:
# Evaluate text only model on test set
y_probs = get_probs(text_only_model, text_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
text_only_roc = roc_auc_score(y_test.numpy(), y_probs)
text_only_accuracy = accuracy_score(y_test.numpy(), y_preds)
text_only_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Text Only ROC: {text_only_roc}, Accuracy: {text_only_accuracy}, F1: {text_only_f1}')

threshold = get_optimal_f1_threshold(y_test.numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
text_only_accuracy = accuracy_score(y_test.numpy(), y_preds)
text_only_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Text Only Accuracy: {text_only_accuracy}, F1: {text_only_f1}')

np.save('probs/text_only_test_probs.npy', y_probs)

Text Only ROC: 0.9794025026678107, Accuracy: 0.9809406701506301, F1: 0.8495145631067962
Text Only Accuracy: 0.9827851214263756, F1: 0.8585858585858586


### Simple Early Fusion Model

In [22]:
simple_early_fusion_model = nn.Sequential(
    nn.Linear(4096+1536, 512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512, 1),
    # nn.Sigmoid()
)

combined_train = torch.cat((text_train, image_train), dim=1)
combined_val = torch.cat((text_val, image_val), dim=1)
combined_test = torch.cat((text_test, image_test), dim=1)

combined_train_dataset = SimpleDataset(combined_train, y_train)
combined_val_dataset = SimpleDataset(combined_val, y_val)
combined_test_dataset = SimpleDataset(combined_test, y_test)

combined_train_loader = DataLoader(combined_train_dataset, batch_size=32, shuffle=True)
combined_val_loader = DataLoader(combined_val_dataset, batch_size=32, shuffle=False)
combined_test_loader = DataLoader(combined_test_dataset, batch_size=32, shuffle=False)

In [23]:
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(simple_early_fusion_model.parameters(), lr=0.001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(simple_early_fusion_model, combined_train_loader, combined_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

Epoch 1/50, train loss: 0.0672, val loss: 0.0274, val auc: 0.9603, val accuracy: 0.9610, val f1: 0.5916, LR: 0.001
Epoch 2/50, train loss: 0.0307, val loss: 0.0353, val auc: 0.9628, val accuracy: 0.9524, val f1: 0.4238, LR: 0.001
Epoch 3/50, train loss: 0.0328, val loss: 0.0355, val auc: 0.9621, val accuracy: 0.9438, val f1: 0.2343, LR: 0.001
Epoch 4/50, train loss: 0.0310, val loss: 0.0271, val auc: 0.9646, val accuracy: 0.9748, val f1: 0.7819, LR: 0.001
Epoch 5/50, train loss: 0.0293, val loss: 0.0262, val auc: 0.9664, val accuracy: 0.9730, val f1: 0.7514, LR: 0.001
Epoch 6/50, train loss: 0.0301, val loss: 0.0323, val auc: 0.9600, val accuracy: 0.9714, val f1: 0.7748, LR: 0.001
Epoch 7/50, train loss: 0.0348, val loss: 0.0367, val auc: 0.9628, val accuracy: 0.9555, val f1: 0.4803, LR: 0.001
Epoch 8/50, train loss: 0.0314, val loss: 0.0266, val auc: 0.9685, val accuracy: 0.9721, val f1: 0.7300, LR: 0.001
Epoch 9/50, train loss: 0.0321, val loss: 0.0265, val auc: 0.9669, val accuracy:

In [24]:
y_probs = get_probs(simple_early_fusion_model, combined_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
early_fusion_roc = roc_auc_score(y_test.numpy(), y_probs)
early_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
early_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Early Fusion ROC: {early_fusion_roc}, Accuracy: {early_fusion_accuracy}, F1: {early_fusion_f1}')

threshold = get_optimal_f1_threshold(y_test.numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
early_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
early_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Early Fusion Accuracy: {early_fusion_accuracy}, F1: {early_fusion_f1}')

np.save('probs/early_fusion_test_probs.npy', y_probs)

Early Fusion ROC: 0.9869392600246637, Accuracy: 0.9837073470642483, F1: 0.8651399491094147
Early Fusion Accuracy: 0.9833999385182908, F1: 0.864321608040201


### Simple Late Fusion Model

In [25]:
# Define separate modules for text and image processing
class TextModule(nn.Module):
    def __init__(self):
        super(TextModule, self).__init__()
        self.fc1 = nn.Linear(4096, 256)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

class ImageModule(nn.Module):
    def __init__(self):
        super(ImageModule, self).__init__()
        self.fc1 = nn.Linear(1536, 256)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

class SimpleLateFusionModel(nn.Module):
    def __init__(self):
        super(SimpleLateFusionModel, self).__init__()
        self.text_module = TextModule()
        self.image_module = ImageModule()
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, 1)
        # self.output = nn.Sigmoid()

    def forward(self, combined_data):
        text_data = combined_data[:, :4096]
        image_data = combined_data[:, 4096:]
        text_features = self.text_module(text_data)
        image_features = self.image_module(image_data)
        combined_features = torch.cat((text_features, image_features), dim=1)
        x = self.fc1(combined_features)
        x = torch.relu(x)
        x = self.fc2(x)
        # x = self.output(x)
        return x

late_fusion_model = SimpleLateFusionModel()

In [26]:
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(late_fusion_model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(late_fusion_model, combined_train_loader, combined_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

Epoch 1/50, train loss: 0.0396, val loss: 0.0313, val auc: 0.9571, val accuracy: 0.9545, val f1: 0.4752, LR: 0.0001
Epoch 2/50, train loss: 0.0240, val loss: 0.0280, val auc: 0.9665, val accuracy: 0.9644, val f1: 0.6258, LR: 0.0001
Epoch 3/50, train loss: 0.0215, val loss: 0.0224, val auc: 0.9686, val accuracy: 0.9724, val f1: 0.7384, LR: 0.0001
Epoch 4/50, train loss: 0.0189, val loss: 0.0217, val auc: 0.9704, val accuracy: 0.9770, val f1: 0.7899, LR: 0.0001
Epoch 5/50, train loss: 0.0184, val loss: 0.0190, val auc: 0.9702, val accuracy: 0.9785, val f1: 0.8223, LR: 0.0001
Epoch 6/50, train loss: 0.0181, val loss: 0.0261, val auc: 0.9716, val accuracy: 0.9687, val f1: 0.6832, LR: 0.0001
Epoch 7/50, train loss: 0.0172, val loss: 0.0195, val auc: 0.9726, val accuracy: 0.9782, val f1: 0.8065, LR: 0.0001
Epoch 8/50, train loss: 0.0159, val loss: 0.0324, val auc: 0.9725, val accuracy: 0.9678, val f1: 0.6708, LR: 0.0001
Epoch 9/50, train loss: 0.0164, val loss: 0.0172, val auc: 0.9722, val a

In [27]:
y_probs = get_probs(late_fusion_model, combined_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
late_fusion_roc = roc_auc_score(y_test.numpy(), y_probs)
late_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
late_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Late Fusion ROC: {late_fusion_roc}, Accuracy: {late_fusion_accuracy}, F1: {late_fusion_f1}')

threshold = get_optimal_f1_threshold(y_test.numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
late_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
late_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Late Fusion Accuracy: {late_fusion_accuracy}, F1: {late_fusion_f1}')

np.save('probs/late_fusion_test_probs.npy', y_probs)

Late Fusion ROC: 0.9874512951567319, Accuracy: 0.9843221641561636, F1: 0.874074074074074
Late Fusion Accuracy: 0.9852443897940363, F1: 0.8829268292682927


### Attention Model

In [28]:
class ImageAttentionModule(nn.Module):
    def __init__(self):
        super(ImageAttentionModule, self).__init__()
        self.fc1 = nn.Linear(1536, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.attention = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.dropout(x)

        # Reshape x from [batch_size, 256] to [batch_size, 16, 16] for attention
        x = x.view(-1, 16, 16)

        # Apply attention
        attn_output, _ = self.attention(x, x, x)

        # Flatten the output for the final fully connected layer
        x = attn_output.reshape(-1, 256)  # Reshape back to original shape after attention
        return x

class TextAttentionModule(nn.Module):
    def __init__(self):
        super(TextAttentionModule, self).__init__()
        self.fc1 = nn.Linear(4096, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.attention = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = x.view(-1, 16, 16)

        # Apply attention
        attn_output, _ = self.attention(x, x, x)

        # Flatten the output for the final fully connected layer
        x = attn_output.reshape(-1, 256)  # Reshape back to original shape after attention
        return x

class AttentionFusionModel(nn.Module):
    def __init__(self):
        super(AttentionFusionModel, self).__init__()
        self.text_attention = TextAttentionModule()
        self.image_attention = ImageAttentionModule()
        self.cross_attention = nn.MultiheadAttention(embed_dim=256, num_heads=4, batch_first=True)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)
        # self.output = nn.Sigmoid()

    def forward(self, combined_data):
        text_data = combined_data[:, :4096]
        image_data = combined_data[:, 4096:]
        text_output = self.text_attention(text_data)
        image_output = self.image_attention(image_data)
        # combined_features = torch.cat((text_output, image_output), dim=1)
        combined_features, _ = self.cross_attention(text_output.unsqueeze(1), image_output.unsqueeze(1), image_output.unsqueeze(1))
        combined_features = combined_features.squeeze(1)
        x = self.fc1(combined_features)

        x = torch.relu(x)
 
        x = self.fc2(x)
        # x = self.output(x)
        return x
    
attention_fusion_model = AttentionFusionModel()

In [29]:
criterion = BinaryFocalLoss(alpha=1-p1, gamma=2)
optimizer = optim.Adam(attention_fusion_model.parameters(), lr=0.0001, weight_decay=1e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=5)
history = train(attention_fusion_model, combined_train_loader, combined_val_loader, criterion, optimizer, num_epochs=50, scheduler=scheduler)

Epoch 1/50, train loss: 0.0567, val loss: 0.0510, val auc: 0.8974, val accuracy: 0.9367, val f1: 0.0463, LR: 0.0001
Epoch 2/50, train loss: 0.0405, val loss: 0.0354, val auc: 0.9188, val accuracy: 0.9533, val f1: 0.5936, LR: 0.0001
Epoch 3/50, train loss: 0.0381, val loss: 0.0376, val auc: 0.9251, val accuracy: 0.9502, val f1: 0.4130, LR: 0.0001
Epoch 4/50, train loss: 0.0382, val loss: 0.0384, val auc: 0.9256, val accuracy: 0.9481, val f1: 0.3525, LR: 0.0001
Epoch 5/50, train loss: 0.0362, val loss: 0.0352, val auc: 0.9322, val accuracy: 0.9582, val f1: 0.6402, LR: 0.0001
Epoch 6/50, train loss: 0.0340, val loss: 0.0335, val auc: 0.9330, val accuracy: 0.9582, val f1: 0.5641, LR: 0.0001
Epoch 7/50, train loss: 0.0333, val loss: 0.0311, val auc: 0.9369, val accuracy: 0.9619, val f1: 0.6517, LR: 0.0001
Epoch 8/50, train loss: 0.0324, val loss: 0.0313, val auc: 0.9374, val accuracy: 0.9598, val f1: 0.6431, LR: 0.0001
Epoch 9/50, train loss: 0.0316, val loss: 0.0302, val auc: 0.9414, val a

In [30]:
y_probs = get_probs(attention_fusion_model, combined_test_loader)
y_preds = (np.array(y_probs) > 0.5).astype(int)
attention_fusion_roc = roc_auc_score(y_test.numpy(), y_probs)
attention_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
attention_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Attention Fusion ROC: {attention_fusion_roc}, Accuracy: {attention_fusion_accuracy}, F1: {attention_fusion_f1}')

threshold = get_optimal_f1_threshold(y_test.numpy(), y_probs)
y_preds = (np.array(y_probs) > threshold).astype(int)
attention_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
attention_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Attention Fusion Accuracy: {attention_fusion_accuracy}, F1: {attention_fusion_f1}')

np.save('probs/attention_fusion_test_probs.npy', y_probs)

Attention Fusion ROC: 0.9537507726656272, Accuracy: 0.9640332001229635, F1: 0.6507462686567165
Attention Fusion Accuracy: 0.9637257915770059, F1: 0.7293577981651376
