In [None]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer, BertModel, AdamW
import torchvision.models as models
from torch import nn
import time
from tqdm import tqdm

# Locations of the per-split annotation CSVs and of the matching image folders.
# NOTE(review): the 'test' entries point at the validation CSV / image folder, so
# test metrics are computed on the same data as validation — confirm this is intended.
csv_paths = dict(
    train='/kaggle/input/intent/Intent/train.csv',
    test='/kaggle/input/intent/Intent/validation.csv',
    validation='/kaggle/input/intent/Intent/validation.csv',
)

image_dirs = dict(
    train='/kaggle/input/intent/Intent/train',
    test='/kaggle/input/intent/Intent/validation',
    validation='/kaggle/input/intent/Intent/validation',
)

# Directory that receives the processed '<split>_matches.csv' files
output_dir = '/kaggle/working/'

# Function to check for matching Image_ID and image files, and add image paths
def check_matches(csv_path, image_dir):
    """Load a split CSV and keep only the rows whose image exists on disk.

    Args:
        csv_path: Path to a CSV file that contains an ``Image_ID`` column.
        image_dir: Directory holding the image files. A file matches a row when
            its name without extension equals the row's ``Image_ID``.

    Returns:
        A new DataFrame (an independent copy, original row index preserved)
        with an extra ``Image_Path`` column, restricted to the matched rows.
    """
    df = pd.read_csv(csv_path)

    # Sort the listing so that, if two files share a stem (e.g. a.jpg / a.png),
    # the one that wins is the same on every run instead of depending on
    # the arbitrary order returned by os.listdir.
    image_names = {
        os.path.splitext(image_file)[0]: os.path.join(image_dir, image_file)
        for image_file in sorted(os.listdir(image_dir))
    }

    # Unmatched ids map to None and are dropped by the notna() filter below
    df['Image_Path'] = df['Image_ID'].map(image_names.get)

    # .copy() detaches the result from `df`, so callers can add columns to it
    # without triggering pandas' SettingWithCopyWarning.
    matched_df = df[df['Image_Path'].notna()].copy()

    return matched_df

# Function to encode Intent_Taxonomy classes into labels
def encode_labels(df):
    """Add an integer ``Intent_Taxonomy_Labels`` column derived from ``Intent_Taxonomy``.

    A fresh ``LabelEncoder`` is fitted on the ``Intent_Taxonomy`` column of
    ``df``. The input frame is left untouched; the encoded column is attached
    to a new frame, which avoids chained-assignment warnings when ``df`` is a
    filtered slice of another frame.

    Args:
        df: DataFrame with an ``Intent_Taxonomy`` column.

    Returns:
        Tuple ``(encoded_df, classes)`` where ``encoded_df`` is a copy of ``df``
        with the extra label column and ``classes`` is the encoder's
        ``classes_`` array (position in the array == integer label).
    """
    label_encoder = LabelEncoder()
    encoded = label_encoder.fit_transform(df['Intent_Taxonomy'])
    # assign() returns a new DataFrame instead of writing into the caller's object
    encoded_df = df.assign(Intent_Taxonomy_Labels=encoded)
    return encoded_df, label_encoder.classes_

# Check matches for each set (Train, Test, Validation)
# The class -> integer mapping of the first split processed (train, given the
# order of csv_paths) is used as the reference for every later split. Without
# this, a split that lacks one class would get shifted label ids from its own
# freshly fitted encoder and its labels would not agree with the training labels.
reference_classes = None
for key in csv_paths:
    matched_df = check_matches(csv_paths[key], image_dirs[key])

    # Encode Intent_Taxonomy labels
    matched_df, classes = encode_labels(matched_df)

    if reference_classes is None:
        reference_classes = list(classes)
    elif list(classes) != reference_classes:
        label_of = {name: idx for idx, name in enumerate(reference_classes)}
        unseen = sorted(set(classes) - set(label_of))
        if unseen:
            # A class the model was never trained on cannot be given a valid id
            raise ValueError(f"{key} set contains classes missing from the first split: {unseen}")
        # Re-encode with the reference mapping so ids agree across splits
        matched_df = matched_df.assign(
            Intent_Taxonomy_Labels=matched_df['Intent_Taxonomy'].map(label_of).astype(int)
        )
        classes = reference_classes

    matches_output_path = os.path.join(output_dir, f'{key}_matches.csv')

    # Save the processed dataframe to CSV
    matched_df.to_csv(matches_output_path, index=False)

    print(f"{key} set:")
    print(f"Matched Meme_IDs with image paths and labels saved to {matches_output_path}")
    print(f"Classes and their corresponding labels:\n{dict(zip(classes, range(len(classes))))}\n")

In [None]:
# Reload the processed training split from disk and preview its first ten rows
train_df = pd.read_csv(filepath_or_buffer='/kaggle/working/train_matches.csv')
train_df.head(n=10)

In [None]:
# Reload the processed test split from disk and preview its first ten rows
test_df = pd.read_csv(filepath_or_buffer='/kaggle/working/test_matches.csv')
test_df.head(n=10)

In [None]:
# Reload the processed validation split from disk and preview its first ten rows
validation_df = pd.read_csv(filepath_or_buffer='/kaggle/working/validation_matches.csv')
validation_df.head(n=10)

In [54]:
# Image preprocessing pipeline shared by all splits
transform = transforms.Compose(
    [
        transforms.Resize(size=224),
        # Crop the center to 224x224
        transforms.CenterCrop(size=224),
        # PIL image -> float tensor
        transforms.ToTensor(),
        # Per-channel normalization constants
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

class MyMultimodalDataset(Dataset):
    """Dataset yielding ``(image, caption, label)`` triples for image + text samples.

    The three input sequences are materialized as plain lists so that ``idx``
    is always a position. Indexing a pandas Series with ``series[idx]`` is
    label-based and fails (or returns the wrong row) when the Series does not
    carry a default 0..n-1 index, e.g. after filtering.

    Args:
        image_paths: Iterable of image file paths.
        image_captions: Iterable of caption texts, aligned with ``image_paths``.
        intent_taxonomy_labels: Iterable of encoded class labels, aligned with
            ``image_paths``.
        transform: Optional callable applied to the loaded RGB PIL image.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """

    def __init__(self, image_paths, image_captions, intent_taxonomy_labels, transform=None):
        self.image_paths = list(image_paths)
        self.image_captions = list(image_captions)
        self.intent_taxonomy = list(intent_taxonomy_labels)
        if not (len(self.image_paths) == len(self.image_captions) == len(self.intent_taxonomy)):
            raise ValueError(
                "image_paths, image_captions and intent_taxonomy_labels must have the same length"
            )
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Load the sample at position ``idx`` and return ``(image, text, label)``."""
        img_path = self.image_paths[idx]
        text = self.image_captions[idx]
        label = self.intent_taxonomy[idx]

        # Load and preprocess image. The context manager closes the underlying
        # file right after convert() has produced an in-memory RGB copy, so
        # long runs do not accumulate open file handles.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, text, label

In [55]:
# Wrap each processed split (paths, captions, encoded labels) in a dataset
train_dataset = MyMultimodalDataset(
    image_paths=train_df['Image_Path'],
    image_captions=train_df['Image_Caption'],
    intent_taxonomy_labels=train_df['Intent_Taxonomy_Labels'],
    transform=transform,
)
val_dataset = MyMultimodalDataset(
    image_paths=validation_df['Image_Path'],
    image_captions=validation_df['Image_Caption'],
    intent_taxonomy_labels=validation_df['Intent_Taxonomy_Labels'],
    transform=transform,
)
test_dataset = MyMultimodalDataset(
    image_paths=test_df['Image_Path'],
    image_captions=test_df['Image_Caption'],
    intent_taxonomy_labels=test_df['Intent_Taxonomy_Labels'],
    transform=transform,
)

# One sample per batch; only the training loader reshuffles
train_loader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

In [56]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer,AdamW
from tqdm import tqdm
import torchvision.models as models
import time
from torchvision.models import densenet169

In [None]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

In [72]:
import torch
import time
from torch.optim import AdamW
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
import torch.nn as nn

In [73]:
# Define the loss function.
# The optimizer is deliberately NOT built in this cell: at this point of a fresh
# top-to-bottom run `densenet169` is still the torchvision factory function (it
# has no .parameters()) and `bert_model` does not exist yet, so constructing the
# optimizer here fails under "Restart Kernel -> Run All". The optimizer is
# created from model.parameters() right after the fusion model is instantiated.
criterion = torch.nn.CrossEntropyLoss()

In [None]:
# Feature-extractor wrapper around a model exposing a `.features` sub-module
class densenet169Features(nn.Module):
    """Run ``original_model.features`` and pool the result into one vector per sample.

    The forward pass applies the wrapped feature stack, averages each channel
    down to a 1x1 map and flattens everything after the batch dimension.
    """

    def __init__(self, original_model):
        super().__init__()
        self.features = original_model.features
        self.pooling = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return the pooled, flattened features for the batch ``x``."""
        feature_maps = self.features(x)
        pooled = self.pooling(feature_maps)
        return torch.flatten(pooled, 1)

# Build the image backbone: DenseNet-169 with IMAGENET1K_V1 weights, wrapped so
# that it returns pooled feature vectors.
# Note: this rebinds the name `densenet169` (imported earlier as the torchvision
# factory function) to the wrapped model instance.
densenet169 = densenet169Features(
    models.densenet169(weights='IMAGENET1K_V1', progress=True)
)

# Build the text backbone: multilingual cased BERT encoder plus its tokenizer
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Fusion classifier over concatenated image and text features
class EarlyFusionModel(nn.Module):
    """Classify a sample from concatenated image and text feature vectors.

    Args:
        densenet: Module mapping an image batch to 1664-wide feature vectors.
        bert_model: Module called as ``bert_model(input_ids, attention_mask=...)``
            whose output exposes ``last_hidden_state`` with a hidden size of 768.
        num_classes: Number of output logits.
    """

    def __init__(self, densenet, bert_model, num_classes):
        super().__init__()
        self.densenet = densenet
        self.bert_model = bert_model
        # 1664 image features + 768 text features feed the classifier head
        self.fc = nn.Sequential(
            nn.Linear(2432, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        )

    def forward(self, images, input_ids, attention_mask):
        """Return class logits for a batch of images and tokenized captions."""
        # Image branch
        visual = self.densenet(images)
        assert visual.shape[1] == 1664, f"Unexpected img_features shape: {visual.shape}"

        # Text branch: keep the hidden state of the first token of every sequence
        encoder_out = self.bert_model(input_ids, attention_mask=attention_mask)
        textual = encoder_out.last_hidden_state[:, 0, :]
        assert textual.shape[1] == 768, f"Unexpected text_features shape: {textual.shape}"

        # Fuse both modalities along the feature axis
        fused = torch.cat((visual, textual), dim=1)
        assert fused.shape[1] == 2432, f"Unexpected combined_features shape: {fused.shape}"

        # Shared classifier head
        return self.fc(fused)

# Select the compute device and place both backbones on it
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model.to(device)
densenet169.to(device)

# Assemble the fusion classifier on top of the two backbones
num_classes = 6  # Number of classes
model = EarlyFusionModel(densenet=densenet169, bert_model=bert_model, num_classes=num_classes)
model.to(device)

# Loss and optimizer; the optimizer covers every parameter of the fusion model
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Training hyperparameters
max_seq_length = 100  # Captions are padded/truncated to this many tokens
num_epochs = 35

# Per-epoch history, filled by the training loop
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

# Wall-clock reference for the total training time
start_time = time.time()

# Assuming train_loader and val_loader are already defined
# Training loop: for every epoch, one optimization pass over train_loader followed
# by one gradient-free pass over val_loader. Per-epoch loss/accuracy are appended
# to train_losses / train_accuracies / val_losses / val_accuracies.
for epoch in range(num_epochs):
    model.train()
    running_train_loss = 0.0
    correct_train = 0
    total_train = 0
    train_batches = 0  # batches that actually contributed to the running loss

    for images, texts, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        # Flatten labels to match batch size and cast to long (class indices)
        labels = labels.view(-1).to(device=device, dtype=torch.long)

        # Skip empty batches before any tokenization / forward work is done
        if labels.numel() == 0:
            print("Skipping empty labels batch")
            continue

        # Move tensors to the device
        images = images.to(device)

        # Tokenize the whole batch in one call; every caption is padded or
        # truncated to max_seq_length, so the result is already (batch, max_seq_length)
        encoded = bert_tokenizer(list(texts), padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt')
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        optimizer.zero_grad()

        # Forward pass
        logits = model(images, input_ids, attention_mask)

        # Calculate loss
        loss = criterion(logits, labels)

        # Backpropagation
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()
        train_batches += 1

        # Calculate accuracy
        _, predicted = logits.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    # Average over the batches that were actually processed; NaN (instead of a
    # ZeroDivisionError) signals that the loader produced no usable batch
    epoch_train_loss = running_train_loss / train_batches if train_batches else float('nan')
    epoch_train_accuracy = correct_train / total_train if total_train else float('nan')

    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_accuracy)

    # Validation loop
    model.eval()
    running_val_loss = 0.0
    correct_val = 0
    total_val = 0
    val_batches = 0  # batches that actually contributed to the running loss

    with torch.no_grad():
        for val_images, val_texts, val_labels in val_loader:
            # Flatten val_labels to match batch size and cast to long
            val_labels = val_labels.view(-1).to(device=device, dtype=torch.long)

            # Skip empty batches before any tokenization / forward work is done
            if val_labels.numel() == 0:
                print("Skipping empty validation labels batch")
                continue

            val_images = val_images.to(device)

            val_encoded = bert_tokenizer(list(val_texts), padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt')
            val_input_ids = val_encoded['input_ids'].to(device)
            val_attention_mask = val_encoded['attention_mask'].to(device)

            # Forward pass
            val_logits = model(val_images, val_input_ids, val_attention_mask)

            # Calculate validation loss
            val_loss = criterion(val_logits, val_labels)

            running_val_loss += val_loss.item()
            val_batches += 1

            # Calculate validation accuracy
            _, val_predicted = val_logits.max(1)
            total_val += val_labels.size(0)
            correct_val += val_predicted.eq(val_labels).sum().item()

    epoch_val_loss = running_val_loss / val_batches if val_batches else float('nan')
    epoch_val_accuracy = correct_val / total_val if total_val else float('nan')

    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}] - "
          f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, "
          f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Evaluation function for the test dataset
def evaluate_model(model, test_loader, criterion):
    """Evaluate ``model`` on ``test_loader`` without computing gradients.

    Relies on the notebook-level names ``device``, ``bert_tokenizer`` and
    ``max_seq_length``.

    Args:
        model: Callable as ``model(images, input_ids, attention_mask)``
            returning logits; it is switched to eval mode.
        test_loader: Iterable of ``(images, texts, labels)`` batches.
        criterion: Loss callable taking ``(logits, labels)``.

    Returns:
        Tuple ``(mean_loss, accuracy, all_labels, all_predictions)``.
        ``mean_loss`` is the average per-batch loss and ``accuracy`` the
        fraction of correct predictions; both are NaN when the loader yields
        no samples. The two lists hold the true and predicted class of every
        evaluated sample, in loader order.
    """
    model.eval()
    running_test_loss = 0.0
    correct_test = 0
    total_test = 0
    batches_seen = 0  # counted explicitly so loaders without __len__ also work

    all_labels = []
    all_predictions = []

    with torch.no_grad():
        for test_images, test_texts, test_labels in test_loader:
            test_images = test_images.to(device)
            # Flatten test_labels to match batch size and cast to long (class indices)
            test_labels = test_labels.view(-1).to(device=device, dtype=torch.long)

            # Tokenize the whole batch in one call; every caption is padded or
            # truncated to max_seq_length, so the result is (batch, max_seq_length)
            encoded = bert_tokenizer(list(test_texts), padding='max_length', truncation=True, max_length=max_seq_length, return_tensors='pt')
            test_input_ids = encoded['input_ids'].to(device)
            test_attention_mask = encoded['attention_mask'].to(device)

            # Forward pass
            test_logits = model(test_images, test_input_ids, test_attention_mask)

            # Calculate test loss
            test_loss = criterion(test_logits, test_labels)
            running_test_loss += test_loss.item()
            batches_seen += 1

            # Calculate test accuracy
            _, test_predicted = test_logits.max(1)
            total_test += test_labels.size(0)
            correct_test += test_predicted.eq(test_labels).sum().item()

            all_labels.extend(test_labels.cpu().numpy())
            all_predictions.extend(test_predicted.cpu().numpy())

    # NaN instead of ZeroDivisionError when there was nothing to evaluate
    epoch_test_loss = running_test_loss / batches_seen if batches_seen else float('nan')
    epoch_test_accuracy = correct_test / total_test if total_test else float('nan')

    return epoch_test_loss, epoch_test_accuracy, all_labels, all_predictions

# Score the trained model on the test loader
test_loss, test_accuracy, test_labels, test_predictions = evaluate_model(
    model=model,
    test_loader=test_loader,
    criterion=criterion,
)

# Summary metrics; precision / recall / F1 are support-weighted averages over classes
conf_matrix = confusion_matrix(y_true=test_labels, y_pred=test_predictions)
f1 = f1_score(y_true=test_labels, y_pred=test_predictions, average='weighted')
recall = recall_score(y_true=test_labels, y_pred=test_predictions, average='weighted')
precision = precision_score(y_true=test_labels, y_pred=test_predictions, average='weighted')
accuracy = accuracy_score(y_true=test_labels, y_pred=test_predictions)

# Report the results
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {precision:.4f}")
print(f"Test Recall: {recall:.4f}")
print(f"Test F1 Score: {f1:.4f}")

In [None]:
# Draw the confusion matrix as an annotated heatmap
# Class names according to the label encoding mapping
# NOTE(review): these are used as tick labels only; the spelling 'ExhIbitionist'
# presumably mirrors the dataset's label text — confirm.
class_names = ['Advocative', 'Controversial', 'ExhIbitionist', 'Expressive', 'Informative', 'Promotive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()