In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install torch



In [3]:
!pip install torchvision



In [4]:
!pip install transformers



In [5]:
!pip install scikit-learn



In [6]:
!pip install torchcrf



In [7]:
!pip install pytorch-crf



In [8]:
!pip install tqdm



In [9]:
import os
import torch
from torch import nn, optim
from torchvision.models import resnet152
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import os
import torchvision.transforms
import torch
from transformers import BertTokenizer, BertConfig, BertPreTrainedModel
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from sklearn.metrics import f1_score
from torch.optim import Adam
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
from torchcrf import CRF
from tqdm import tqdm

In [8]:
!cp -r /content/drive/MyDrive/twitter2015 /content/data/

In [9]:
!cp -r /content/drive/MyDrive/twitter2015_images /content/data/

In [None]:
!cp -r /content/drive/MyDrive/twitter2017 /content/data/

In [None]:
!cp -r /content/drive/MyDrive/twitter2017_images /content/data/

In [10]:
class TwitterDataset(Dataset):
    def __init__(self, data_folder, img_folder, tokenizer, transform, file_name):
        super().__init__()
        self.data_lines = []
        self.img_folder = img_folder
        self.tokenizer = tokenizer
        self.transform = transform

        # Load data from the specified file
        file_path = os.path.join(data_folder, file_name)
        with open(file_path, 'r', encoding="utf8") as file:
            img_id = None
            text = []
            labels = []

            counter = 0
            for line in file:
                if counter == 100:
                  break
                if line.strip() == '' and img_id is not None:  # save previous instance
                    self.data_lines.append((img_id, text, labels))
                    img_id = None  # Reset for the next image
                    text = []
                    labels = []
                elif line.startswith('IMGID:'):
                    img_id = line.strip().split(':')[1] + '.jpg'  # New image id
                else:
                    parts = line.strip().split('\t')
                    if len(parts) == 2:
                        text.append(parts[0])
                        labels.append(parts[1])

                counter+=1

            # Save last instance if not empty
            if img_id is not None:
                self.data_lines.append((img_id, text, labels))

    def __len__(self):
        return len(self.data_lines)

    def __getitem__(self, idx):
        img_id, text, labels = self.data_lines[idx]
        image_path = os.path.join(self.img_folder, img_id)
        image = Image.open(image_path).convert('RGB')
        text = ' '.join(text)
        labels = [self.label_to_idx(label) for label in labels]  # Convert labels to indices

        inputs = self.tokenizer(text, padding='max_length', max_length=128, truncation=True,return_tensors="pt")
        image = self.transform(image)
        labels = torch.tensor(labels, dtype=torch.long)

        return inputs.input_ids.squeeze(0), inputs.attention_mask.squeeze(0), image, labels

    @staticmethod
    def label_to_idx(label):
        # Define your label to index mapping based on your dataset's labels
        label_map = {
            'O': 0,
            'B-LOC': 1, 'I-LOC': 2,
            'B-PER': 3, 'I-PER': 4,
            'B-ORG': 5, 'I-ORG': 6
        }
        return label_map.get(label, 0)  # Convert unrecognized labels to 'O'


In [11]:
class BertCrossAttention(nn.Module):
    """Implements cross-attention between two different modalities using a decoder layer."""

    def __init__(self, config, num_layers):
        super().__init__()
        self.layers = nn.ModuleList([
            nn.TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.num_attention_heads)
            for _ in range(num_layers)
        ])

    def forward(self, query, key, mask=None):
        output = query
        for layer in self.layers:
            output = layer(output, key, tgt_key_padding_mask=mask)
        return output


class MTCCMBertForMMTokenClassificationCRF(BertPreTrainedModel):
    def __init__(self, config, num_labels, add_context_aware_gate=False, use_dynamic_cross_modal_fusion=False):
        super().__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config)
        self.resnet = resnet152(pretrained=True)
        self.resnet.fc = nn.Identity()  # Adapt ResNet to remove the final fully connected layer

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.vismap2text = nn.Linear(2048, config.hidden_size)

        self.txt2img_attention = BertCrossAttention(config, num_layers=1)
        self.img2txt_attention = BertCrossAttention(config, num_layers=1)

        self.add_context_aware_gate = add_context_aware_gate
        # Initialize the visual filter gate
        if add_context_aware_gate:
            self.visual_gate = ContextAwareGate(config.hidden_size, 768)

        if use_dynamic_cross_modal_fusion:
            self.dynamic_attention = DynamicAttentionModule(config.hidden_size)

        self.classifier = nn.Linear(config.hidden_size * 2, num_labels)
        self.crf = CRF(num_labels, batch_first=True)
        self.init_weights()

        self.use_dynamic_cross_modal_fusion = use_dynamic_cross_modal_fusion

    def forward(self, input_ids, attention_mask, visual_embeds, labels=None):
        # Text feature extraction
        text_outputs = self.bert(input_ids, attention_mask=attention_mask)
        text_features = self.dropout(text_outputs.last_hidden_state)

        # Image feature extraction with ResNet-152
        visual_features = self.resnet(visual_embeds)  # Assuming visual_embeds is [batch_size, 3, 224, 224]
        visual_features = visual_features.view(visual_features.size(0), -1)  # Flatten the output of the ResNet
        visual_features = self.vismap2text(visual_features)  # Transform to match BERT hidden size
        visual_features = visual_features.unsqueeze(1).expand(-1, text_features.size(1),
                                                              -1)  # Expand to match text sequence length

        if self.add_context_aware_gate:
            visual_features = self.visual_gate(text_features, visual_features)

        # Cross-modal attention
        if self.use_dynamic_cross_modal_fusion:
            attended_text, attended_visuals = self.dynamic_attention(text_features, visual_features)
            combined_features = torch.cat([attended_text, attended_visuals], dim=-1)
        else:
            txt_attended_visuals = self.txt2img_attention(text_features, visual_features)
            img_attended_text = self.img2txt_attention(visual_features, text_features)
            combined_features = torch.cat([txt_attended_visuals, img_attended_text], dim=-1)

        logits = self.classifier(combined_features)

        # crf processing
        if labels is not None:
            # Ensure labels and logits have the same sequence length
            labels = torch.where(labels == -100, torch.zeros_like(labels), labels)

            seq_length = logits.size(1)
            if labels.size(1) < seq_length:
                padding_size = seq_length - labels.size(1)
                # Use a valid label index for padding, e.g., 0
                labels_padded = torch.full((labels.size(0), padding_size), fill_value=0, dtype=torch.long,
                                           device=labels.device)
                labels = torch.cat([labels, labels_padded], dim=1)

                # Adjust attention_mask to cover only the non-padded areas
                attention_mask_padded = torch.zeros((attention_mask.size(0), seq_length), dtype=torch.uint8,
                                                    device=attention_mask.device)
                attention_mask_padded[:, :attention_mask.size(1)] = attention_mask
                attention_mask = attention_mask_padded

            # CRF loss calculation
            loss = -self.crf(logits, labels, mask=attention_mask.byte(), reduction='mean')
            return loss
        else:
            return self.crf.decode(logits, mask=attention_mask.byte())


class ContextAwareGate(nn.Module):
    def __init__(self, text_dim, visual_dim):
        super().__init__()
        # network to calculate threshold
        self.threshold_network = nn.Sequential(
            nn.Linear(text_dim, 1),  # Averaging text features to a single value
            nn.Sigmoid()  # Ensure the threshold is between 0 and 1
        )
        self.gate = nn.Sequential(
            nn.Linear(visual_dim * 2, visual_dim),  # Combine visual and transformed text features
            nn.Tanh(),
            nn.Linear(visual_dim, visual_dim),
            nn.Sigmoid()
        )

    def forward(self, text_features, visual_features):
        combined_features = torch.cat([text_features, visual_features], dim=-1)
        # Compute gating values
        gate_values = self.gate(combined_features)
        # Apply the gate to the visual features only
        text_mean = torch.mean(text_features, dim=1)
        update_threshold = self.threshold_network(text_mean).squeeze()  # Ensuring scalar output per batch item

        # if the threshold is larger than certain value apply combined features to the visual features
        update_threshold_expanded = update_threshold.unsqueeze(-1).unsqueeze(-1)
        update_mask = (gate_values > update_threshold_expanded).float()

        # Apply the gate to the visual features conditionally
        updated_visual_features = visual_features * (1 - update_mask) + (visual_features * gate_values) * update_mask

        return updated_visual_features


class DynamicAttentionModule(nn.Module):
    def __init__(self, feature_dim):
        super().__init__()
        self.text_weight_predictor = nn.Sequential(
            nn.Linear(feature_dim, 1),
            nn.Sigmoid()
        )
        self.visual_weight_predictor = nn.Sequential(
            nn.Linear(feature_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, text_features, visual_features):
        text_weights = self.text_weight_predictor(text_features).expand_as(text_features)
        visual_weights = self.visual_weight_predictor(visual_features).expand_as(visual_features)
        attended_text = text_features * text_weights
        attended_visuals = visual_features * visual_weights
        return attended_text, attended_visuals


In [15]:
def collate_fn(batch):
    input_ids, attention_masks, images, labels = zip(*batch)

    # Pad the sequences
    input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    attention_masks = pad_sequence(attention_masks, batch_first=True, padding_value=0)

    # Ensure the first timestep of each mask is on
    attention_masks[:, 0] = 1

    # Stack images and pad labels
    images = torch.stack(images)
    labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Assuming -100 is your ignore index for labels

    return input_ids, attention_masks, images, labels


def train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=10):
    best_f1 = 0.0

    for epoch in range(num_epochs):
        print("Training for epoch ", str(epoch))
        model.train()
        total_loss = 0
        train_progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)


        for inputs, masks, images, labels in train_progress_bar:
            optimizer.zero_grad()
            loss = model(inputs, masks, images, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        val_loss, val_f1 = evaluate_model(model, val_loader)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss}, Val Loss: {val_loss}, Val F1: {val_f1}")

        # Save the model if the validation F1 score is the best we've seen so far.
        save_path = "/content/output/epoch_" + str(epoch+1) + "_valf1_" + str(val_f1) + ".pth"
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), save_path)
            print("Saved best model")


def evaluate_model(model, data_loader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        validation_progress_bar = tqdm(data_loader, desc='Validating', leave=False)

        for inputs, masks, images, labels in validation_progress_bar:
            loss = model(inputs, masks, images, labels)
            total_loss += loss.item()
            predictions = model(inputs, masks, images)  # Decoding without labels returns predictions

            # Flatten the labels for evaluation
            labels_flattened = labels.view(-1).cpu().numpy()  # Flatten labels and move to CPU

            # Flatten predictions and convert to numpy array if predictions are a list
            if isinstance(predictions, list):
                # Convert list of lists to a flat list if necessary
                flat_predictions = [item for sublist in predictions for item in sublist]
                predictions = torch.tensor(flat_predictions)  # Convert to tensor
            else:
                predictions = predictions.view(-1)  # Flatten predictions

            predictions = predictions.cpu().numpy()  # Convert to numpy for f1 score calculation

            # Filter out the padding index (-100) from labels and corresponding predictions
            valid_indices = labels_flattened != -100
            valid_labels = labels_flattened[valid_indices]
            valid_predictions = predictions[valid_indices]

            all_preds.extend(valid_predictions)
            all_labels.extend(valid_labels)

    print(all_labels)
    print(all_preds)
    avg_loss = total_loss / len(data_loader)
    f1 = f1_score(all_labels, all_preds, average='macro')  # Calculate macro F1 Score
    return avg_loss, f1

In [13]:
data_folder = 'data/twitter2015'  # Update accordingly
img_folder = 'data/twitter2015_images'  # Update accordingly

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
transform = torchvision.transforms.Compose([
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
])

train_dataset = TwitterDataset(data_folder, img_folder, tokenizer, transform, 'train.txt')
val_dataset = TwitterDataset(data_folder, img_folder, tokenizer, transform, 'valid.txt')
test_dataset = TwitterDataset(data_folder, img_folder, tokenizer, transform, 'test.txt')

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [16]:
label2id = {
    'O': 0,
    'B-LOC': 1, 'I-LOC': 2,
    'B-PER': 3, 'I-PER': 4,
    'B-ORG': 5, 'I-ORG': 6
}
id2label = {v: k for k, v in label2id.items()}

config = BertConfig.from_pretrained('bert-base-uncased', num_labels=7)
config.label2id = label2id
config.id2label = id2label
model = MTCCMBertForMMTokenClassificationCRF(config=config, num_labels=7, add_context_aware_gate=True,
                                              use_dynamic_cross_modal_fusion=True)


optimizer = Adam(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0,
                                            num_training_steps=len(train_loader) * 10)

train_model(model, train_loader, val_loader, optimizer, scheduler, num_epochs=1)

Training for epoch  0




IndexError: boolean index did not match indexed array along dimension 0; dimension is 100 but corresponding boolean dimension is 38