In [None]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer, BertModel, AdamW
import torchvision.models as models
from torch import nn
import time
from tqdm import tqdm

# Input locations for every split: the annotation CSV and the folder holding its images.
# NOTE(review): the 'test' entries point at the same CSV and image folder as
# 'validation', so any "test" metric is computed on the validation data —
# confirm this is intended or substitute the real test paths.
csv_paths = dict(
    train='/kaggle/input/intent/Intent/train.csv',
    test='/kaggle/input/intent/Intent/validation.csv',
    validation='/kaggle/input/intent/Intent/validation.csv',
)

image_dirs = dict(
    train='/kaggle/input/intent/Intent/train',
    test='/kaggle/input/intent/Intent/validation',
    validation='/kaggle/input/intent/Intent/validation',
)

# The processed per-split CSV files are written into this directory.
output_dir = '/kaggle/working/'

# Function to match each Image_ID in a split's CSV with its image file and add image paths
def check_matches(csv_path, image_dir):
    """Load a split's CSV and keep only the rows whose image exists in `image_dir`.

    Parameters
    ----------
    csv_path : str
        Path to a CSV file that has an ``Image_ID`` column. Each ID is compared
        with the image file names in `image_dir` with their extension removed.
    image_dir : str
        Directory that contains the image files.

    Returns
    -------
    pandas.DataFrame
        A new dataframe (not a view of the loaded CSV) containing only the
        matched rows, with an extra ``Image_Path`` column and a fresh
        0..n-1 index.
    """
    df = pd.read_csv(csv_path)

    # Map file stem -> full path. Sorting the listing makes the result
    # deterministic when two files share a stem (e.g. "a.jpg" and "a.png"):
    # the one that sorts last wins.
    image_names = {
        os.path.splitext(image_file)[0]: os.path.join(image_dir, image_file)
        for image_file in sorted(os.listdir(image_dir))
    }

    # Add Image_Path column to the dataframe. IDs are compared as strings so
    # that numeric IDs (which pandas parses as integers) still match file stems.
    df['Image_Path'] = df['Image_ID'].apply(lambda x: image_names.get(str(x), None))

    # Keep rows where Image_Path is not None (i.e., matched Image_IDs).
    # reset_index returns a fresh frame, so callers can add columns without a
    # SettingWithCopyWarning and can rely on a gap-free positional index.
    matched_df = df[df['Image_Path'].notna()].reset_index(drop=True)

    return matched_df

# Function to encode Intent_Taxonomy classes into labels
def encode_labels(df, label_encoder=None):
    """Add an integer ``Intent_Taxonomy_Labels`` column derived from ``Intent_Taxonomy``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Intent_Taxonomy`` column. It is not modified; the
        encoded column is added to a copy.
    label_encoder : sklearn.preprocessing.LabelEncoder, optional
        An already fitted encoder to reuse. When omitted, a new encoder is
        fitted on `df` itself (the original behaviour).

    Returns
    -------
    tuple
        ``(encoded_df, classes)`` where ``classes`` is the encoder's
        ``classes_`` array; the position of a class in it is its integer label.

    Raises
    ------
    ValueError
        Raised by ``LabelEncoder.transform`` when `df` contains a class the
        supplied encoder was not fitted on.
    """
    if label_encoder is None:
        label_encoder = LabelEncoder()
        label_encoder.fit(df['Intent_Taxonomy'])

    # Work on a copy so the caller's frame (possibly a filtered slice) is untouched.
    df = df.copy()
    df['Intent_Taxonomy_Labels'] = label_encoder.transform(df['Intent_Taxonomy'])
    return df, label_encoder.classes_

# Check matches for each set (Train, Test, Validation)
matched_splits = {key: check_matches(csv_paths[key], image_dirs[key]) for key in csv_paths}

# Fit ONE encoder on the training split and reuse it for every split. Fitting a
# separate encoder per split would give a class different integer labels in
# different splits whenever a split lacks one of the classes.
shared_label_encoder = LabelEncoder().fit(matched_splits['train']['Intent_Taxonomy'])

for key, matched_df in matched_splits.items():
    # Encode Intent_Taxonomy labels with the shared mapping
    matched_df, classes = encode_labels(matched_df, shared_label_encoder)

    matches_output_path = os.path.join(output_dir, f'{key}_matches.csv')

    # Save the processed dataframe to CSV
    matched_df.to_csv(matches_output_path, index=False)

    print(f"{key} set:")
    print(f"Matched Meme_IDs with image paths and labels saved to {matches_output_path}")
    print(f"Classes and their corresponding labels:\n{dict(zip(classes, range(len(classes))))}\n")

In [None]:
# Reload the processed training split that the matching step wrote to disk
# and preview its first rows.
train_matches_csv = '/kaggle/working/train_matches.csv'
train_df = pd.read_csv(train_matches_csv)
train_df.head()

In [None]:
# Reload the processed test split that the matching step wrote to disk
# and preview its first rows.
test_matches_csv = '/kaggle/working/test_matches.csv'
test_df = pd.read_csv(test_matches_csv)
test_df.head()

In [None]:
# Reload the processed validation split that the matching step wrote to disk
# and preview its first rows.
validation_matches_csv = '/kaggle/working/validation_matches.csv'
validation_df = pd.read_csv(validation_matches_csv)
validation_df.head()

In [50]:
# Image preprocessing pipeline applied to every sample: resize, crop the centre
# to 224x224, convert to a tensor, then normalise each RGB channel with the
# per-channel mean / std below.
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

_preprocessing_steps = [
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=_channel_mean, std=_channel_std),
]
transform = transforms.Compose(_preprocessing_steps)

class MyMultimodalDataset(Dataset):
    """Dataset yielding ``(image, caption, label)`` triples for image + text samples.

    Parameters
    ----------
    image_paths : iterable
        Paths of the image files, one per sample.
    image_captions : iterable
        Caption text for each sample, aligned with `image_paths`.
    intent_taxonomy_labels : iterable
        Label for each sample, aligned with `image_paths`.
    transform : callable, optional
        Applied to the RGB PIL image before it is returned.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of elements.
    """

    def __init__(self, image_paths, image_captions, intent_taxonomy_labels, transform=None):
        # Materialise the inputs as lists so __getitem__ indexes by POSITION.
        # A pandas Series indexed with an int does a label lookup, which fails
        # for any frame whose index is not exactly 0..n-1 (e.g. after filtering).
        self.image_paths = list(image_paths)
        self.image_captions = list(image_captions)
        self.intent_taxonomy = list(intent_taxonomy_labels)
        self.transform = transform

        if not (len(self.image_paths) == len(self.image_captions) == len(self.intent_taxonomy)):
            raise ValueError(
                "image_paths, image_captions and intent_taxonomy_labels must have the same length "
                f"(got {len(self.image_paths)}, {len(self.image_captions)}, {len(self.intent_taxonomy)})"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, text, label)`` for the sample at position `idx`."""
        img_path = self.image_paths[idx]
        text = self.image_captions[idx]
        label = self.intent_taxonomy[idx]

        # An empty caption cell is read from CSV as float NaN; hand the caller
        # an empty string instead so every caption is a str.
        if text is None or (isinstance(text, float) and text != text):
            text = ''
        elif not isinstance(text, str):
            text = str(text)

        # Load and preprocess image. The context manager closes the file handle
        # right away instead of leaving it to the garbage collector; convert()
        # returns an independent in-memory image.
        with Image.open(img_path) as img:
            image = img.convert('RGB')
        if self.transform:
            image = self.transform(image)

        return image, text, label

In [51]:
# Build one dataset per split from the processed dataframes. Each dataset reads
# the image path, caption and encoded label columns and shares the same
# image preprocessing pipeline.
def _dataset_from_frame(frame):
    """Wrap one split's dataframe in a MyMultimodalDataset."""
    return MyMultimodalDataset(
        frame['Image_Path'],
        frame['Image_Caption'],
        frame['Intent_Taxonomy_Labels'],
        transform=transform,
    )

train_dataset = _dataset_from_frame(train_df)
val_dataset = _dataset_from_frame(validation_df)
test_dataset = _dataset_from_frame(test_df)

# Define data loaders: one sample per batch; only the training data is shuffled.
_batch_size = 1
train_loader = DataLoader(train_dataset, batch_size=_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=_batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=_batch_size, shuffle=False)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.models import mobilenet_v2
from transformers import BertModel, BertTokenizer
from tqdm import tqdm
import time

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Text encoders tried earlier and since replaced by the one loaded below:
#   - "bert-base-multilingual-cased" (BertTokenizer / BertModel)
#   - "distilbert-base-uncased"      (AutoTokenizer / DistilBertModel)

from transformers import AutoTokenizer, XLMRobertaModel, AdamW
# Text branch: XLM-RoBERTa tokenizer and encoder. The variables keep their
# `bert_` prefix because the following cells refer to them by these names.
_text_checkpoint = "xlm-roberta-base"
bert_tokenizer = AutoTokenizer.from_pretrained(_text_checkpoint)
bert_model = XLMRobertaModel.from_pretrained(_text_checkpoint)
bert_model = bert_model.to(device)

# Image branch: pretrained MobileNetV2 used as the image feature extractor.
# Its classification head is swapped for an identity layer so the model
# returns features instead of class scores.
mobilenet_v2_model = mobilenet_v2(pretrained=True)
mobilenet_v2_model.classifier = nn.Identity()
mobilenet_v2_model = mobilenet_v2_model.to(device)
mobilenet_v2_model.eval()  # Ensure in evaluation mode

In [None]:
# Define the combined classifier
class CombinedClassifier(nn.Module):
    """Late-fusion classifier over image and text feature vectors.

    Each modality has its own head (Linear -> ReLU -> Dropout(0.5) -> Linear)
    that maps its features to ``num_classes`` logits; the two logit vectors
    are averaged to produce the final prediction.

    Parameters
    ----------
    img_feature_dim : int
        Size of the image feature vector.
    text_feature_dim : int
        Size of the text feature vector.
    num_classes : int
        Number of output classes.
    """

    def __init__(self, img_feature_dim, text_feature_dim, num_classes):
        super().__init__()
        # The image head is created before the text head, so parameters are
        # initialised in that order.
        self.img_fc = self._make_head(img_feature_dim, num_classes)
        self.text_fc = self._make_head(text_feature_dim, num_classes)

    @staticmethod
    def _make_head(in_features, num_classes):
        """Build one modality head ending in `num_classes` logits."""
        layers = [
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes),
        ]
        return nn.Sequential(*layers)

    def forward(self, img_features, text_features):
        """Return the mean of the image-head and text-head logits."""
        summed_logits = self.img_fc(img_features) + self.text_fc(text_features)
        return summed_logits * 0.5  # Simple averaging


# Fusion head over 1280-dimensional image features and 768-dimensional text
# features, predicting 6 classes.
combined_classifier = CombinedClassifier(img_feature_dim=1280, text_feature_dim=768, num_classes=6).to(device)

# Define optimizer and criterion.
# MobileNetV2 only ever runs under torch.no_grad() below, so its parameters
# never receive gradients; it is a frozen feature extractor and is therefore
# left out of the optimizer.
optimizer = torch.optim.AdamW(
    list(bert_model.parameters()) + list(combined_classifier.parameters()),
    lr=1e-5,
)
criterion = nn.CrossEntropyLoss()

# Set number of epochs and other parameters
num_epochs = 40
max_seq_length = 100  # every caption is padded / truncated to this many tokens

train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []


def _encode_texts(texts):
    """Tokenise a batch of captions into fixed-length id / mask tensors on `device`."""
    encoded = bert_tokenizer(
        list(texts),
        padding='max_length',
        truncation=True,
        max_length=max_seq_length,
        return_tensors='pt',
    )
    return encoded['input_ids'].to(device), encoded['attention_mask'].to(device)


def _forward_batch(images, texts):
    """Return the fused logits for one batch of images and captions."""
    # Image features come from the frozen backbone, so no graph is built for it.
    with torch.no_grad():
        img_feats = mobilenet_v2_model(images.to(device))
    img_feats = img_feats.view(img_feats.size(0), -1)

    input_ids, attention_mask = _encode_texts(texts)
    outputs = bert_model(input_ids, attention_mask=attention_mask)
    # The hidden state of the first token is used as the caption representation.
    text_feats = outputs.last_hidden_state[:, 0, :]

    return combined_classifier(img_feats, text_feats)


start_time = time.time()

# Training loop
for epoch in range(num_epochs):
    # The image backbone stays in eval mode: in train mode its BatchNorm layers
    # would overwrite their pretrained running statistics on every forward
    # pass (torch.no_grad() does not prevent that), while its weights are
    # never updated.
    mobilenet_v2_model.eval()
    bert_model.train()
    combined_classifier.train()

    running_train_loss = 0.0
    correct_train = 0
    total_train = 0

    for images, texts, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        # Flatten the labels so they match the batch dimension of the logits
        labels = labels.to(device).view(-1)

        optimizer.zero_grad()

        combined_logits = _forward_batch(images, texts)
        loss = criterion(combined_logits, labels)

        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()
        _, predicted = combined_logits.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    epoch_train_loss = running_train_loss / len(train_loader)
    epoch_train_accuracy = correct_train / total_train

    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_accuracy)

    # Validation loop
    mobilenet_v2_model.eval()
    bert_model.eval()
    combined_classifier.eval()

    running_val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for val_images, val_texts, val_labels in val_loader:
            val_labels = val_labels.to(device).view(-1)

            val_combined_logits = _forward_batch(val_images, val_texts)
            val_loss = criterion(val_combined_logits, val_labels)

            running_val_loss += val_loss.item()
            _, val_predicted = val_combined_logits.max(1)
            total_val += val_labels.size(0)
            correct_val += val_predicted.eq(val_labels).sum().item()

    epoch_val_loss = running_val_loss / len(val_loader)
    epoch_val_accuracy = correct_val / total_val

    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}] - "
          f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, "
          f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")


In [None]:
import torch
import torch.nn as nn
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Evaluate the trained models on test_loader.
# Every sub-model is switched to evaluation mode here, so the result does not
# depend on the mode an earlier cell happened to leave the encoders in.
mobilenet_v2_model.eval()
bert_model.eval()
combined_classifier.eval()

test_losses = []
test_accuracies = []
predictions = []
true_labels = []

with torch.no_grad():
    for images, texts, labels in tqdm(test_loader, desc='Testing', leave=False):
        images = images.to(device)
        # Flatten the labels to one entry per sample, as the training and
        # validation loops do, so they match the logits' batch dimension.
        labels = labels.to(device).view(-1)

        encoded = bert_tokenizer(
            list(texts),
            padding='max_length',
            truncation=True,
            max_length=max_seq_length,
            return_tensors='pt',
        )
        input_ids = encoded['input_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)

        img_feats = mobilenet_v2_model(images)
        img_feats = img_feats.view(img_feats.size(0), -1)

        outputs = bert_model(input_ids, attention_mask=attention_mask)
        # The hidden state of the first token is used as the caption representation.
        text_feats = outputs.last_hidden_state[:, 0, :]

        logits = combined_classifier(img_feats, text_feats)

        # Per-batch loss
        test_loss = criterion(logits, labels)
        test_losses.append(test_loss.item())

        # Collect predictions and per-batch accuracy
        _, predicted = logits.max(1)
        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        correct = predicted.eq(labels).sum().item()
        total = labels.size(0)
        test_accuracies.append(correct / total)

# Calculate overall test metrics
average_test_loss = sum(test_losses) / len(test_losses)
test_accuracy = accuracy_score(true_labels, predictions)
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted')

# Pin the confusion matrix to one row/column per class the classifier can
# output; otherwise a class absent from both the labels and the predictions
# would be dropped and the matrix would shrink.
num_output_classes = combined_classifier.img_fc[-1].out_features
conf_matrix = confusion_matrix(true_labels, predictions, labels=list(range(num_output_classes)))

# Print and plot results
print(f"Test Loss: {average_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1_score:.4f}")


In [None]:
# Plot confusion matrix
# Class names according to the label encoding mapping; they label both axes.
class_names = ['Advocative', 'Controversial', 'ExhIbitionist', 'Expressive', 'Informative', 'Promotive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='d',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
plt.show()