In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
brset_embed = pd.read_csv('embeddings.csv')

In [4]:
text_column_names = brset_embed.columns[brset_embed.columns.str.match('text_\d+')]
image_column_names = brset_embed.columns[brset_embed.columns.str.match('image_\d+')]
text_columns = brset_embed[text_column_names]
image_columns = brset_embed[image_column_names]

In [5]:
text_embed = torch.tensor(text_columns.values)
image_embed = torch.tensor(image_columns.values)
y = torch.tensor(brset_embed['DR_2'].values)

### Text Only Model

In [6]:
text_only_model = nn.Sequential(
    nn.Linear(4096, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    # nn.BatchNorm1d(256),
    nn.Linear(256, 1),
    nn.Sigmoid()
)

In [7]:
# split text_embed into train and test based on brset_embed['split']
train_idx = brset_embed[brset_embed['split'] == 'train'].index
test_idx = brset_embed[brset_embed['split'] == 'test'].index

text_train = text_embed[train_idx]
text_test = text_embed[test_idx]
y_train = y[train_idx]
y_test = y[test_idx]

In [8]:
# create a dataset and dataloader for text only
class SimpleDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

text_train_dataset = SimpleDataset(text_train, y_train)
text_test_dataset = SimpleDataset(text_test, y_test)

text_train_loader = DataLoader(text_train_dataset, batch_size=32, shuffle=True)
text_test_loader = DataLoader(text_test_dataset, batch_size=32, shuffle=False)

In [9]:
def train(model, train_loader, test_loader, criterion, optimizer, num_epochs=10):
    for epoch in range(num_epochs):
        model.train()
        model.to(device)
        epoch_loss = 0
        for X, y in train_loader:
            X = X.to(device).float()
            y = y.to(device).float()
            optimizer.zero_grad()
            y_hat = model(X)
            loss = criterion(y_hat, y.unsqueeze(1))
            loss.backward()
            epoch_loss += loss.item()
            optimizer.step()
        # Probably would be more meaningful to use a validation set
        model.eval()
        with torch.no_grad():
            y_hat = torch.tensor([]).to(device)
            y_true = torch.tensor([]).to(device)
            for X, y in test_loader:
                X = X.to(device).float()
                y = y.to(device).float()
                y_hat = torch.cat((y_hat, model(X)))
                y_true = torch.cat((y_true, y.unsqueeze(1)))
            auc = roc_auc_score(y_true.cpu().numpy(), y_hat.cpu().numpy())
            accuracy = accuracy_score(y_true.cpu().numpy(), y_hat.cpu().numpy() > 0.5)
            f1 = f1_score(y_true.cpu().numpy(), y_hat.cpu().numpy() > 0.5)
            print(f'Epoch {epoch} Loss: {epoch_loss}, AUC: {auc}, Accuracy: {accuracy}, F1: {f1}')

def get_predicted_probs(model, loader):
    model.eval()
    model.to(device)
    y_hat = torch.tensor([]).to(device)
    with torch.no_grad():
        for X,_ in loader:
            X = X.to(device).float()
            y_hat = torch.cat((y_hat, model(X)))
    return y_hat.cpu().numpy().flatten()


In [10]:
criterion = nn.BCELoss()
optimizer = optim.Adam(text_only_model.parameters(), lr=0.001)
train(text_only_model, text_train_loader, text_test_loader, criterion, optimizer, num_epochs=20)


  _torch_pytree._register_pytree_node(


Epoch 0 Loss: 2601.4479944936684, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 1 Loss: 2639.5737512111664, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 2 Loss: 2640.5594022274017, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 3 Loss: 2635.1970212459564, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 4 Loss: 2635.176661968231, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 5 Loss: 2639.4062683582306, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 6 Loss: 2640.1730177402496, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 7 Loss: 2638.8391065597534, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 8 Loss: 2641.0860862731934, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 9 Loss: 2636.728802919388, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 10 Loss: 2637.664505958557, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 11 Loss: 2635.28834605217, AUC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0
Epoch 12 Loss: 2638

In [11]:
# Evaluate text only model on test set
y_pred_probs = get_predicted_probs(text_only_model, text_test_loader)
y_preds = (np.array(y_pred_probs) > 0.5).astype(int)
text_only_roc = roc_auc_score(y_test.numpy(), y_pred_probs)
text_only_accuracy = accuracy_score(y_test.numpy(), y_preds)
text_only_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Text Only ROC: {text_only_roc}, Accuracy: {text_only_accuracy}, F1: {text_only_f1}')

Text Only ROC: 0.5, Accuracy: 0.9351567301782422, F1: 0.0


### Image Only model

In [12]:
# Train image only model
image_only_model = nn.Sequential(
    nn.Linear(1536, 256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 1),
    nn.Sigmoid()
)

image_train = image_embed[train_idx]
image_test = image_embed[test_idx]

image_train_dataset = SimpleDataset(image_train, y_train)
image_test_dataset = SimpleDataset(image_test, y_test)

image_train_loader = DataLoader(image_train_dataset, batch_size=32, shuffle=True)
image_test_loader = DataLoader(image_test_dataset, batch_size=32, shuffle=False)

In [13]:
criterion = nn.BCELoss()
optimizer = optim.Adam(image_only_model.parameters(), lr=0.001)
train(image_only_model, image_train_loader, image_test_loader, criterion, optimizer, num_epochs=20)

Epoch 0 Loss: 76.19346247427166, AUC: 0.926287197873139, Accuracy: 0.9520590043023971, F1: 0.472972972972973
Epoch 1 Loss: 60.208522813394666, AUC: 0.9332334485331107, Accuracy: 0.9557467732022127, F1: 0.5443037974683544
Epoch 2 Loss: 58.06787451263517, AUC: 0.9370445416642657, Accuracy: 0.9511370620774432, F1: 0.4341637010676156
Epoch 3 Loss: 56.007340430282056, AUC: 0.9394305631914128, Accuracy: 0.96220036877689, F1: 0.6434782608695652
Epoch 4 Loss: 51.650988567620516, AUC: 0.9434597000652573, Accuracy: 0.9600491702519975, F1: 0.6036585365853658
Epoch 5 Loss: 50.063370938878506, AUC: 0.9404351218630904, Accuracy: 0.9631223110018439, F1: 0.6511627906976745
Epoch 6 Loss: 48.70467131724581, AUC: 0.9451261772415287, Accuracy: 0.9631223110018439, F1: 0.6685082872928176
Epoch 7 Loss: 47.15485030040145, AUC: 0.9458644110560637, Accuracy: 0.9652735095267363, F1: 0.6869806094182825
Epoch 8 Loss: 48.07534423749894, AUC: 0.9480432910276557, Accuracy: 0.96220036877689, F1: 0.6328358208955224
Epo

In [14]:
# Evaluate image only model on test set
y_pred_probs = get_predicted_probs(image_only_model, image_test_loader)
y_preds = (np.array(y_pred_probs) > 0.5).astype(int)
image_only_roc = roc_auc_score(y_test.numpy(), y_pred_probs)
image_only_accuracy = accuracy_score(y_test.numpy(), y_preds)
image_only_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Image Only ROC: {image_only_roc}, Accuracy: {image_only_accuracy}, F1: {image_only_f1}')

Image Only ROC: 0.9517952008572232, Accuracy: 0.9569760295021512, F1: 0.6929824561403508


### Simple Early Fusion Model

In [15]:
simple_early_fusion_model = nn.Sequential(
    nn.Linear(4096+1536, 512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512, 1),
    nn.Sigmoid()
)

combined_train = torch.cat((text_train, image_train), dim=1)
combined_test = torch.cat((text_test, image_test), dim=1)

combined_train_dataset = SimpleDataset(combined_train, y_train)
combined_test_dataset = SimpleDataset(combined_test, y_test)

combined_train_loader = DataLoader(combined_train_dataset, batch_size=32, shuffle=True)
combined_test_loader = DataLoader(combined_test_dataset, batch_size=32, shuffle=False)

In [16]:
criterion = nn.BCELoss()
optimizer = optim.Adam(simple_early_fusion_model.parameters(), lr=0.0001)
train(simple_early_fusion_model, combined_train_loader, combined_test_loader, criterion, optimizer, num_epochs=40)

Epoch 0 Loss: 67.18348330259323, AUC: 0.9691468104094083, Accuracy: 0.9711124769514444, F1: 0.756476683937824
Epoch 1 Loss: 41.87031468027271, AUC: 0.9759248558964481, Accuracy: 0.9726490473263676, F1: 0.7464387464387465
Epoch 2 Loss: 34.91454142378643, AUC: 0.9787220456240957, Accuracy: 0.9708051628764598, F1: 0.7181008902077153
Epoch 3 Loss: 31.292649308452383, AUC: 0.9798963669240102, Accuracy: 0.9772587584511371, F1: 0.7944444444444444
Epoch 4 Loss: 30.941961539559998, AUC: 0.9806392731044601, Accuracy: 0.9800245851259988, F1: 0.8228882833787466
Epoch 5 Loss: 28.34704295359552, AUC: 0.9811205267936823, Accuracy: 0.9837123540258144, F1: 0.8651399491094148
Epoch 6 Loss: 28.188363395864144, AUC: 0.9806159112748862, Accuracy: 0.9830977258758451, F1: 0.8655256723716381
Epoch 7 Loss: 26.849562769173644, AUC: 0.9818307264127288, Accuracy: 0.9800245851259988, F1: 0.8209366391184573
Epoch 8 Loss: 27.01791507669259, AUC: 0.981466281871376, Accuracy: 0.984019668100799, F1: 0.8706467661691543


In [17]:
y_pred_probs = get_predicted_probs(simple_early_fusion_model, combined_test_loader)
y_preds = (np.array(y_pred_probs) > 0.5).astype(int)
early_fusion_roc = roc_auc_score(y_test.numpy(), y_pred_probs)
early_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
early_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Early Fusion ROC: {early_fusion_roc}, Accuracy: {early_fusion_accuracy}, F1: {early_fusion_f1}')

Early Fusion ROC: 0.985926833864685, Accuracy: 0.9852489244007375, F1: 0.8762886597938144


### Simple Late Fusion Model

In [18]:
# Define separate modules for text and image processing
class TextModule(nn.Module):
    def __init__(self):
        super(TextModule, self).__init__()
        self.fc1 = nn.Linear(4096, 256)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

class ImageModule(nn.Module):
    def __init__(self):
        super(ImageModule, self).__init__()
        self.fc1 = nn.Linear(1536, 256)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        return x

class SimpleLateFusionModel(nn.Module):
    def __init__(self):
        super(SimpleLateFusionModel, self).__init__()
        self.text_module = TextModule()
        self.image_module = ImageModule()
        self.fc1 = nn.Linear(512, 128)
        self.fc2 = nn.Linear(128, 1)
        self.output = nn.Sigmoid()

    def forward(self, combined_data):
        text_data = combined_data[:, :4096]
        image_data = combined_data[:, 4096:]
        text_features = self.text_module(text_data)
        image_features = self.image_module(image_data)
        combined_features = torch.cat((text_features, image_features), dim=1)
        x = self.fc1(combined_features)
        x = torch.relu(x)
        x = self.fc2(x)
        x = self.output(x)
        return x

# Instantiate and configure the model, criterion, and optimizer



In [19]:
late_fusion_model = SimpleLateFusionModel()
criterion = nn.BCELoss()
optimizer = optim.Adam(late_fusion_model.parameters(), lr=0.0001)
train(late_fusion_model, combined_train_loader, combined_test_loader, criterion, optimizer, num_epochs=40)

Epoch 0 Loss: 68.11527671851218, AUC: 0.9675286143475899, Accuracy: 0.9572833435771358, F1: 0.5223367697594502
Epoch 1 Loss: 40.441093879169784, AUC: 0.9777564233350413, Accuracy: 0.9711124769514444, F1: 0.7235294117647059
Epoch 2 Loss: 32.83470815431792, AUC: 0.9789307446349559, Accuracy: 0.9827904118008605, F1: 0.8541666666666667
Epoch 3 Loss: 27.80017889046576, AUC: 0.9812840596006995, Accuracy: 0.9846342962507683, F1: 0.8743718592964824
Epoch 4 Loss: 28.721639645773394, AUC: 0.9806221410961059, Accuracy: 0.9837123540258144, F1: 0.8637532133676094
Epoch 5 Loss: 26.776782686880324, AUC: 0.9816687510610163, Accuracy: 0.9843269821757836, F1: 0.8740740740740741
Epoch 6 Loss: 26.23835004295688, AUC: 0.9810706882239246, Accuracy: 0.9815611555009219, F1: 0.853658536585366
Epoch 7 Loss: 23.18979552714154, AUC: 0.9825876496909232, Accuracy: 0.9837123540258144, F1: 0.8608923884514436
Epoch 8 Loss: 24.209635002858704, AUC: 0.9825549431295195, Accuracy: 0.9824830977258758, F1: 0.863309352517985

In [20]:
y_pred_probs = get_predicted_probs(late_fusion_model, combined_test_loader)
y_preds = (np.array(y_pred_probs) > 0.5).astype(int)
late_fusion_roc = roc_auc_score(y_test.numpy(), y_pred_probs)
late_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
late_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Late Fusion ROC: {late_fusion_roc}, Accuracy: {late_fusion_accuracy}, F1: {late_fusion_f1}')

Late Fusion ROC: 0.9836607363960173, Accuracy: 0.9830977258758451, F1: 0.8705882352941177


### Attention Model

In [21]:
class ImageAttentionModule(nn.Module):
    def __init__(self):
        super(ImageAttentionModule, self).__init__()
        self.fc1 = nn.Linear(1536, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.attention = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)

    def forward(self, x):
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.dropout(x)

        # Reshape x from [batch_size, 256] to [batch_size, 16, 16] for attention
        x = x.view(-1, 16, 16)

        # Apply attention
        attn_output, _ = self.attention(x, x, x)

        # Flatten the output for the final fully connected layer
        x = attn_output.reshape(-1, 256)  # Reshape back to original shape after attention
        return x

class TextAttentionModule(nn.Module):
    def __init__(self):
        super(TextAttentionModule, self).__init__()
        self.fc1 = nn.Linear(4096, 256)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.attention = nn.MultiheadAttention(embed_dim=16, num_heads=4, batch_first=True)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)

        x = x.view(-1, 16, 16)

        # Apply attention
        attn_output, _ = self.attention(x, x, x)

        # Flatten the output for the final fully connected layer
        x = attn_output.reshape(-1, 256)  # Reshape back to original shape after attention
        return x

class AttentionFusionModel(nn.Module):
    def __init__(self):
        super(AttentionFusionModel, self).__init__()
        self.text_attention = TextAttentionModule()
        self.image_attention = ImageAttentionModule()
        self.cross_attention = nn.MultiheadAttention(embed_dim=256, num_heads=4, batch_first=True)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 1)
        self.output = nn.Sigmoid()

    def forward(self, combined_data):
        text_data = combined_data[:, :4096]
        image_data = combined_data[:, 4096:]
        text_output = self.text_attention(text_data)
        image_output = self.image_attention(image_data)
        # combined_features = torch.cat((text_output, image_output), dim=1)
        combined_features, _ = self.cross_attention(text_output.unsqueeze(1), image_output.unsqueeze(1), image_output.unsqueeze(1))
        combined_features = combined_features.squeeze(1)
        x = self.fc1(combined_features)

        x = torch.relu(x)
 
        x = self.fc2(x)
        x = self.output(x)
        return x

In [22]:
attention_fusion_model = AttentionFusionModel()
criterion = nn.BCELoss()
optimizer = optim.Adam(attention_fusion_model.parameters(), lr=0.0001)
train(attention_fusion_model, combined_train_loader, combined_test_loader, criterion, optimizer, num_epochs=40)

Epoch 0 Loss: 93.73704588413239, AUC: 0.8956224603744434, Accuracy: 0.9363859864781807, F1: 0.08810572687224669
Epoch 1 Loss: 65.28769809193909, AUC: 0.925480436025187, Accuracy: 0.9594345421020283, F1: 0.6271186440677966
Epoch 2 Loss: 59.690624251030385, AUC: 0.9321494596408819, Accuracy: 0.9557467732022127, F1: 0.5414012738853504
Epoch 3 Loss: 55.741092530079186, AUC: 0.9356490617110516, Accuracy: 0.958819913952059, F1: 0.5838509316770186
Epoch 4 Loss: 52.87391759082675, AUC: 0.9353500302925056, Accuracy: 0.9591272280270436, F1: 0.5777777777777778
Epoch 5 Loss: 50.79753237683326, AUC: 0.9400519878580785, Accuracy: 0.9532882606023356, F1: 0.45714285714285713
Epoch 6 Loss: 49.525770098902285, AUC: 0.9426825298680991, Accuracy: 0.9560540872771973, F1: 0.5051903114186852
Epoch 7 Loss: 46.95384145854041, AUC: 0.9418897851178916, Accuracy: 0.9532882606023356, F1: 0.4492753623188405
Epoch 8 Loss: 47.327771314419806, AUC: 0.9458270321287454, Accuracy: 0.9640442532267978, F1: 0.66666666666666

In [23]:
y_pred_probs = get_predicted_probs(attention_fusion_model, combined_test_loader)
y_preds = (np.array(y_pred_probs) > 0.5).astype(int)
attention_fusion_roc = roc_auc_score(y_test.numpy(), y_pred_probs)
attention_fusion_accuracy = accuracy_score(y_test.numpy(), y_preds)
attention_fusion_f1 = f1_score(y_test.numpy(), y_preds)
print(f'Attention Fusion ROC: {attention_fusion_roc}, Accuracy: {attention_fusion_accuracy}, F1: {attention_fusion_f1}')

Attention Fusion ROC: 0.953812105477103, Accuracy: 0.964658881376767, F1: 0.7132169576059849
