In [None]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer, BertModel, AdamW
import torchvision.models as models
from torch import nn
import time
from tqdm import tqdm

# Dataset layout: every split has a CSV file and an image directory under one root.
# NOTE(review): 'test' is mapped to the *validation* CSV and images, so the
# "test" metrics are computed on the validation data — confirm this is intended.
_DATA_ROOT = '/kaggle/input/intent/Intent'
_SPLIT_SOURCES = {'train': 'train', 'test': 'validation', 'validation': 'validation'}

# split name -> CSV file holding that split's rows
csv_paths = {split: f'{_DATA_ROOT}/{source}.csv' for split, source in _SPLIT_SOURCES.items()}

# split name -> directory holding that split's images
image_dirs = {split: f'{_DATA_ROOT}/{source}' for split, source in _SPLIT_SOURCES.items()}

# Directory that receives the processed '<split>_matches.csv' files
output_dir = '/kaggle/working/'

# Function to check for matching Image_ID and image files, and add image paths
def check_matches(csv_path, image_dir):
    """Attach an ``Image_Path`` column and keep only rows whose image exists.

    Args:
        csv_path: Path of a CSV file that contains an ``Image_ID`` column.
        image_dir: Directory whose file names (without extension) are matched
            against ``Image_ID``.

    Returns:
        A new, independent DataFrame holding the rows of the CSV whose
        ``Image_ID`` equals the stem of a file in ``image_dir``, with the full
        file path stored in ``Image_Path``. The original row index is kept.
    """
    df = pd.read_csv(csv_path)

    # Map file stem -> full path. Sorting makes the winner deterministic when
    # two files share a stem (e.g. "a.jpg" and "a.png"); os.listdir order is arbitrary.
    image_names = {
        os.path.splitext(image_file)[0]: os.path.join(image_dir, image_file)
        for image_file in sorted(os.listdir(image_dir))
    }

    # Add Image_Path column to the dataframe (None when no file matches)
    df['Image_Path'] = df['Image_ID'].apply(lambda x: image_names.get(x, None))

    # Keep matched Image_IDs only. The explicit copy detaches the result from
    # `df`, so callers can add columns without a SettingWithCopyWarning.
    matched_df = df[df['Image_Path'].notna()].copy()

    return matched_df

# Function to encode Intent_Taxonomy classes into labels
def encode_labels(df, label_encoder=None):
    """Add an integer ``Intent_Taxonomy_Labels`` column derived from ``Intent_Taxonomy``.

    Args:
        df: DataFrame with an ``Intent_Taxonomy`` column. It is not modified.
        label_encoder: Optional, already fitted ``LabelEncoder``. When given,
            it is reused so that several DataFrames share one class -> id
            mapping. When omitted, a new encoder is fitted on ``df`` alone.

    Returns:
        Tuple ``(encoded_df, classes)``: a copy of ``df`` with the new column,
        and the encoder's ``classes_`` array (position == integer label).
    """
    df = df.copy()  # never write into the caller's frame (it may be a slice)
    if label_encoder is None:
        label_encoder = LabelEncoder()
        df['Intent_Taxonomy_Labels'] = label_encoder.fit_transform(df['Intent_Taxonomy'])
    else:
        df['Intent_Taxonomy_Labels'] = label_encoder.transform(df['Intent_Taxonomy'])
    return df, label_encoder.classes_

# Check matches for each set (Train, Test, Validation)
matched_frames = {key: check_matches(csv_paths[key], image_dirs[key]) for key in csv_paths}

# Fit ONE encoder on the labels of all splits. Fitting a separate encoder per
# split would assign different integers to the same class whenever a split
# lacks one of the classes, silently corrupting evaluation.
shared_label_encoder = LabelEncoder()
shared_label_encoder.fit(
    pd.concat([frame['Intent_Taxonomy'] for frame in matched_frames.values()], ignore_index=True)
)

for key, matched_df in matched_frames.items():
    # Encode Intent_Taxonomy labels with the shared mapping
    matched_df, classes = encode_labels(matched_df, label_encoder=shared_label_encoder)

    matches_output_path = os.path.join(output_dir, f'{key}_matches.csv')

    # Save the processed dataframe to CSV
    matched_df.to_csv(matches_output_path, index=False)

    print(f"{key} set:")
    print(f"Matched Meme_IDs with image paths and labels saved to {matches_output_path}")
    print(f"Classes and their corresponding labels:\n{dict(zip(classes, range(len(classes))))}\n")

In [None]:
# Reload the processed training split from disk and preview its first ten rows.
train_df = pd.read_csv(filepath_or_buffer='/kaggle/working/train_matches.csv')
train_df.head(n=10)

In [None]:
# Reload the processed test split from disk and preview its first ten rows.
test_df = pd.read_csv(filepath_or_buffer='/kaggle/working/test_matches.csv')
test_df.head(n=10)

In [None]:
# Reload the processed validation split from disk and preview its first ten rows.
validation_df = pd.read_csv(filepath_or_buffer='/kaggle/working/validation_matches.csv')
validation_df.head(n=10)

In [17]:
# Image preprocessing pipeline applied to every PIL image by the dataset.
_transform_steps = [
    transforms.Resize(224),        # int size: the shorter side becomes 224
    transforms.CenterCrop(224),    # Crop the center to 224x224
    transforms.ToTensor(),         # PIL image -> float tensor
    transforms.Normalize(          # per-channel mean / std normalisation
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_transform_steps)

class MyMultimodalDataset(Dataset):
    """Dataset yielding ``(image, caption, label)`` triples for image + text models.

    Args:
        image_paths: Sequence (list, array or pandas Series) of image file paths.
        image_captions: Sequence of caption strings, aligned with ``image_paths``.
        intent_taxonomy_labels: Sequence of integer class labels, aligned with
            ``image_paths``.
        transform: Optional callable applied to the RGB ``PIL.Image`` before it
            is returned.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, image_paths, image_captions, intent_taxonomy_labels, transform=None):
        # Materialise as lists so that ``[idx]`` is always positional. Indexing
        # a pandas Series with an int is label-based and raises KeyError as
        # soon as the index is not exactly 0..n-1 (e.g. after row filtering).
        self.image_paths = list(image_paths)
        self.image_captions = list(image_captions)
        self.intent_taxonomy = list(intent_taxonomy_labels)
        self.transform = transform

        if not (len(self.image_paths) == len(self.image_captions) == len(self.intent_taxonomy)):
            raise ValueError(
                "image_paths, image_captions and intent_taxonomy_labels must have the same length "
                f"(got {len(self.image_paths)}, {len(self.image_captions)}, {len(self.intent_taxonomy)})"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, text, label)`` for the sample at position ``idx``."""
        img_path = self.image_paths[idx]
        text = self.image_captions[idx]
        label = self.intent_taxonomy[idx]

        # A missing caption is read from CSV as NaN (a float); hand the
        # tokenizer a string in every case instead of failing downstream.
        if not isinstance(text, str):
            text = '' if (text is None or text != text) else str(text)

        # Load and preprocess image; the context manager closes the file
        # handle once the pixels have been decoded by convert().
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, text, label

In [18]:
# Wrap each split's columns (image path, caption, encoded label) in a dataset.
train_dataset = MyMultimodalDataset(
    image_paths=train_df['Image_Path'],
    image_captions=train_df['Image_Caption'],
    intent_taxonomy_labels=train_df['Intent_Taxonomy_Labels'],
    transform=transform,
)
val_dataset = MyMultimodalDataset(
    image_paths=validation_df['Image_Path'],
    image_captions=validation_df['Image_Caption'],
    intent_taxonomy_labels=validation_df['Intent_Taxonomy_Labels'],
    transform=transform,
)
test_dataset = MyMultimodalDataset(
    image_paths=test_df['Image_Path'],
    image_captions=test_df['Image_Caption'],
    intent_taxonomy_labels=test_df['Intent_Taxonomy_Labels'],
    transform=transform,
)

# Data loaders: one sample per batch; only the training split is shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [16]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer,AdamW
from tqdm import tqdm
import torchvision.models as models
import time

In [19]:
# Image encoder: ResNet-50 loaded with the IMAGENET1K_V1 weights, keeping every
# child module except the last one (the classification layer).
_pretrained_resnet = models.resnet50(weights='IMAGENET1K_V1', progress=True)
_feature_layers = list(_pretrained_resnet.children())[:-1]
resnet50 = torch.nn.Sequential(*_feature_layers)

# Text encoder: tokenizer and model are loaded from the same BERT checkpoint.
_bert_checkpoint = 'bert-base-multilingual-cased'
bert_tokenizer = BertTokenizer.from_pretrained(_bert_checkpoint)
bert_model = BertModel.from_pretrained(_bert_checkpoint)


In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print('Using device:', device)

In [None]:
# Move the image encoder's parameters and buffers to `device`. As the last
# expression of the cell, the returned module is echoed as the cell output.
resnet50.to(device)

In [None]:
# Move the text encoder's parameters and buffers to `device`. As the last
# expression of the cell, the returned module is echoed as the cell output.
bert_model.to(device)

In [25]:
import torch
import time
from torch.optim import AdamW
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import torch.nn as nn

In [None]:
# Hyperparameters
num_epochs = 35
num_classes = 6  # Assuming you have 6 classes
max_seq_length = 100  # Set your desired maximum sequence length
# NOTE(review): the same learning rate is applied to the pretrained encoders and
# to the freshly initialised classifier head — confirm 1e-3 is intended for both.
learning_rate = 0.001

# Define the combined classifier for early fusion
class EarlyFusionClassifier(nn.Module):
    """Two-layer MLP head over concatenated image and text features.

    Args:
        img_feature_dim: Size of the image feature vector.
        text_feature_dim: Size of the text feature vector.
        num_classes: Number of output classes (size of the logits vector).
    """

    def __init__(self, img_feature_dim, text_feature_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(img_feature_dim + text_feature_dim, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, img_features, text_features):
        """Return class logits of shape ``(batch, num_classes)``.

        ``img_features`` is ``(batch, img_feature_dim)`` and ``text_features``
        is ``(batch, text_feature_dim)``; they are joined along dim 1.
        """
        combined_features = torch.cat((img_features, text_features), dim=1)
        x = self.fc1(combined_features)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)
        return logits

combined_classifier = EarlyFusionClassifier(img_feature_dim=2048, text_feature_dim=768, num_classes=num_classes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# The optimizer must be created AFTER the classifier and must receive its
# parameters; otherwise the fusion head keeps its random initial weights and
# is never trained.
optimizer = AdamW(
    list(resnet50.parameters())
    + list(bert_model.parameters())
    + list(combined_classifier.parameters()),
    lr=learning_rate,
)

# Training and validation lists for losses and accuracies
train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []

# Reference point for the total execution time reported after training
start_time = time.time()

# Training loop
def _extract_features(images, texts):
    """Encode one batch with the image and text backbones.

    Args:
        images: Batch of already transformed image tensors from the DataLoader.
        texts: Batch of caption strings from the DataLoader.

    Returns:
        Tuple ``(img_feats, text_feats)``: the ResNet output flattened to
        ``(batch, -1)`` and the BERT hidden state of the first ([CLS]) token.
    """
    images = images.to(device)
    img_feats = resnet50(images)
    img_feats = img_feats.view(img_feats.size(0), -1)  # flatten to (batch_size, 2048)

    # Tokenize the whole batch in one call; every caption is padded/truncated
    # to exactly max_seq_length tokens.
    encoded = bert_tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt',
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    outputs = bert_model(input_ids, attention_mask=attention_mask)
    text_feats = outputs.last_hidden_state[:, 0, :]  # Extract the [CLS] token representation
    return img_feats, text_feats


for epoch in range(num_epochs):
    resnet50.train()
    bert_model.train()
    combined_classifier.train()

    running_train_loss = 0.0
    correct_train = 0
    total_train = 0

    for images, texts, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        # Flatten labels to (batch,) and make them int64 as CrossEntropyLoss expects
        labels = labels.to(device).view(-1).to(torch.long)

        # Skip degenerate batches before spending any compute on them
        if labels.numel() == 0:
            print("Skipping empty labels batch")
            continue

        optimizer.zero_grad()

        img_feats, text_feats = _extract_features(images, texts)

        # Get predictions from the combined classifier
        combined_logits = combined_classifier(img_feats, text_feats)

        # Calculate loss
        loss = criterion(combined_logits, labels)

        # Backpropagation
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()
        _, predicted = combined_logits.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    # Mean of the per-batch losses; guarded so an empty loader cannot divide by zero
    epoch_train_loss = running_train_loss / max(len(train_loader), 1)
    epoch_train_accuracy = correct_train / total_train if total_train else 0.0

    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_accuracy)

    # Validation loop
    resnet50.eval()
    bert_model.eval()
    combined_classifier.eval()

    running_val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for val_images, val_texts, val_labels in val_loader:
            # Same label preparation as in training
            val_labels = val_labels.to(device).view(-1).to(torch.long)

            val_img_feats, val_text_feats = _extract_features(val_images, val_texts)

            # Get predictions from the combined classifier
            val_combined_logits = combined_classifier(val_img_feats, val_text_feats)

            val_loss = criterion(val_combined_logits, val_labels)

            running_val_loss += val_loss.item()
            _, val_predicted = val_combined_logits.max(1)
            total_val += val_labels.size(0)
            correct_val += val_predicted.eq(val_labels).sum().item()

    epoch_val_loss = running_val_loss / max(len(val_loader), 1)
    epoch_val_accuracy = correct_val / total_val if total_val else 0.0

    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}] - "
          f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, "
          f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")

In [None]:
import torch
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from tqdm import tqdm

# Evaluate the trained encoders and fusion head on test_loader

combined_classifier.eval()  # Set the model to evaluation mode
resnet50.eval()  # Set the ResNet model to evaluation mode
bert_model.eval()  # Set the BERT model to evaluation mode

test_losses = []
predictions = []
true_labels = []

with torch.no_grad():
    for images, texts, labels in tqdm(test_loader, desc='Testing', leave=False):
        images = images.to(device)
        # Flatten to (batch,) and cast to int64, exactly as done during training
        labels = labels.to(device).view(-1).to(torch.long)

        # Tokenize and prepare text inputs (whole batch in one call, fixed length)
        encoded = bert_tokenizer(
            list(texts),
            padding='max_length',
            truncation=True,
            max_length=max_seq_length,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        # Extract features using ResNet and BERT
        img_feats = resnet50(images)
        img_feats = img_feats.view(img_feats.size(0), -1)

        outputs = bert_model(input_ids, attention_mask=attention_mask)
        text_feats = outputs.last_hidden_state[:, 0, :]

        # Get combined logits from the classifier
        logits = combined_classifier(img_feats, text_feats)

        # Calculate loss
        test_loss = criterion(logits, labels)
        test_losses.append(test_loss.item())

        # Get predictions
        _, predicted = logits.max(1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Calculate overall test metrics
average_test_loss = sum(test_losses) / len(test_losses)
test_accuracy = accuracy_score(true_labels, predictions)
# zero_division=0 yields the same values as the default but without warnings
# for classes that are never predicted.
precision, recall, f1_score, _ = precision_recall_fscore_support(
    true_labels, predictions, average='weighted', zero_division=0
)
# Pin the label set so the matrix is always num_classes x num_classes, even
# when a class is absent from both the test labels and the predictions;
# otherwise the matrix shrinks and no longer lines up with the class names.
conf_matrix = confusion_matrix(true_labels, predictions, labels=list(range(num_classes)))

# Print results
print(f"Test Loss: {average_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# Class names according to the label encoding mapping
class_names = ['Advocative', 'Controversial', 'ExhIbitionist', 'Expressive', 'Informative', 'Promotive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()