In [None]:
!pip install --upgrade scipy

In [None]:
!pip install pandas

In [None]:
!pip install numpy

In [None]:
!pip install matplotlib

In [None]:
!pip install seaborn

In [None]:
!pip install nltk

In [None]:
import warnings

# Ignore warnings
warnings.filterwarnings("ignore", message="is_categorical_dtype is deprecated")

# Ignore the FutureWarning related to use_inf_as_na in seaborn
warnings.filterwarnings("ignore", message="use_inf_as_na is deprecated")

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

# **Dataset Manipulation**

In [None]:
import json
import os

# Path to the dataset directory. Defaults to the Kaggle input mount; set the
# HATEFUL_MEMES_DIR environment variable to point the notebook at a copy of the
# dataset stored somewhere else.
dataset_dir = os.environ.get('HATEFUL_MEMES_DIR', '/kaggle/input/facebook-hateful-meme-dataset/data')

# Function to read JSONL files
def read_jsonl(file_path):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # An explicit encoding keeps non-ASCII caption text independent of the
    # platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (for example a trailing newline at the end of the file)
        # are skipped because json.loads rejects empty input.
        return [json.loads(line) for line in file if line.strip()]

# Read the three annotation files that sit directly in dataset_dir.
# They are loaded in the order train, test, dev.
train_data, test_data, dev_data = [
    read_jsonl(os.path.join(dataset_dir, annotation_file))
    for annotation_file in ('train.jsonl', 'test.jsonl', 'dev.jsonl')
]



# **Images mapping to its text and label**

In [None]:
def map_images_to_data(data, base_dir=None):
    """Index dataset records by their id.

    Args:
        data: Iterable of record dicts. Every record must provide 'id' and
            'img'; 'label' and 'text' are optional.
        base_dir: Directory that each record's relative 'img' path is joined
            onto. Defaults to the module-level ``dataset_dir``.

    Returns:
        dict: ``{id: {'path': ..., 'label': ..., 'text': ...}}``. 'label' and
        'text' are None for records that lack them. When an id occurs more
        than once, the last record wins.

    Raises:
        KeyError: If a record has no 'id' or no 'img' key.
    """
    root = dataset_dir if base_dir is None else base_dir
    return {
        item['id']: {
            'path': os.path.join(root, item['img']),
            'label': item.get('label'),  # None when the record has no label
            'text': item.get('text'),    # None when the record has no text
        }
        for item in data
    }

# Build the lookup for the training split: each record's 'id' maps to a dict holding
# its image 'path', its 'label' and its 'text'.
train_image_map = map_images_to_data(train_data)


# **First five data entries in train dataset**

In [None]:
# Show the first five training records, one field per line
for idx, item in enumerate(train_data[:5]):
    print(
        f"Entry {idx + 1}:",
        f"ID: {item['id']}",
        f"Image: {item['img']}",
        f"Label: {item.get('label', None)}",  # None when the record has no 'label'
        f"Text: {item.get('text', None)}",  # None when the record has no 'text'
        "------------",
        sep="\n",
    )


# **First five data entries in test dataset**

In [None]:
# Show the first five test records, one field per line
for idx, item in enumerate(test_data[:5]):
    print(
        f"Entry {idx + 1}:",
        f"ID: {item['id']}",
        f"Image: {item['img']}",
        f"Label: {item.get('label', None)}",  # None when the record has no 'label'
        f"Text: {item.get('text', None)}",  # None when the record has no 'text'
        "------------",
        sep="\n",
    )


# **First five data entries in dev dataset**

In [None]:
# Show the first five dev records, one field per line
for idx, item in enumerate(dev_data[:5]):
    print(
        f"Entry {idx + 1}:",
        f"ID: {item['id']}",
        f"Image: {item['img']}",
        f"Label: {item.get('label', None)}",  # None when the record has no 'label'
        f"Text: {item.get('text', None)}",  # None when the record has no 'text'
        "------------",
        sep="\n",
    )


# I will take only train dataset and split it into 3 parts:
* 1st 5000 - Train 
* 2nd 1000 - Test
* 3rd 500 - Validation

In [None]:
# Carve three consecutive, non-overlapping slices out of the training annotations:
# positions [0, 5000) for training, [5000, 6000) for testing, [6000, 6500) for validation.
train_subset, test_subset, validation_subset = (
    train_data[:5000],
    train_data[5000:6000],
    train_data[6000:6500],
)

# One DataFrame per slice. Every row is that record's entry in train_image_map,
# so the columns are 'path', 'label' and 'text'.
train_df, test_df, validation_df = (
    pd.DataFrame([train_image_map[record['id']] for record in subset])
    for subset in (train_subset, test_subset, validation_subset)
)

# **Data Preprocessing**

In [None]:
import nltk
# Fetch the NLTK resources needed for text preprocessing: the stop-word lists and
# the Punkt tokenizer data. Recent NLTK releases load Punkt from the separate
# 'punkt_tab' package, so it is requested as well; nltk.download reports a failed
# or unknown package through its return value instead of raising.
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def preprocess_text(text):
    """Normalise a meme caption for the text model.

    Steps, in order: lowercase, strip URLs, collapse whitespace, strip tokens
    containing '@' (e-mail addresses), strip punctuation, tokenise with NLTK
    and drop English stop words.

    Args:
        text: The caption. ``None`` or any other non-string value (a missing
            caption) is treated as an empty caption.

    Returns:
        str: The remaining tokens joined by single spaces; ``''`` when nothing
        is left.
    """
    # Missing captions arrive as None (or NaN once inside a DataFrame).
    if not isinstance(text, str):
        return ''

    # Convert text to lowercase
    text = text.lower()

    # Remove website links
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove extra spaces
    text = ' '.join(text.split())

    # Remove emails (any whitespace-free run containing '@')
    text = re.sub(r'\S*@\S*\s?', '', text)

    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Nothing left to tokenise: the result would be empty anyway.
    if not text.strip():
        return ''

    # Remove stopwords; the set is cached instead of rebuilt for every caption.
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word.lower() not in stop_words)


# Clean the captions of all three splits; each frame's 'text' column is overwritten
# with its preprocessed version.
for split_frame in (train_df, test_df, validation_df):
    split_frame['text'] = split_frame['text'].apply(preprocess_text)

# **Train Dataset**

In [None]:
# Preview the first rows of the training frame (columns: path, label, text)
train_df.head()

In [None]:
# Summarise columns, data types and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nInformation about the train dataset:")
train_df.info()

# **Image size in train dataset**

In [None]:
from PIL import Image

# Open the first image listed in train_df and read its dimensions.
# The context manager closes the underlying file once the size has been read.
image_path = train_df['path'].iloc[0]
with Image.open(image_path) as image:
    # PIL reports size as (width, height) in pixels
    image_size = image.size
print("Image size:", image_size)

# **Visualization of Label Distribution in Train Dataset**
* # 0 - Not Hateful
* # 1 - Hateful

In [None]:
# Count the memes in each class. Sorting by label (0, then 1) ties every bar's
# position and colour to its class instead of to whichever class happens to be
# more frequent in this split.
label_counts = train_df['label'].value_counts().sort_index()

# Bar colours, applied in label order: 0 (not hateful) first, 1 (hateful) second
custom_colors = ['#73aeea', '#2595b0']

# Font settings shared by the title, axis labels, x ticks and bar annotations
font = {'family': 'Serif', 'weight': 'bold', 'size': 12}

plt.figure(figsize=(6, 5))

# One bar per class, with dashed horizontal grid lines
bars = plt.bar(label_counts.index, label_counts.values, color=custom_colors)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Title and axis labels
plt.title('Hateful or Not Hateful meme Distribution', fontdict=font)
plt.xlabel('Labels', fontdict=font)
plt.ylabel('Number of Labels', fontdict=font)

# Put one x tick under each bar, labelled with the class value itself
plt.xticks(label_counts.index, label_counts.index, fontdict=font)
plt.yticks(fontname='Serif', fontsize=10)

# Write each class count just above its bar
for bar, count in zip(bars, label_counts.values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
             ha='center', va='bottom', fontdict=font)

plt.show()


# **Visualization of Text Length Distribution in Train Dataset**

In [None]:
# Caption length = number of whitespace-separated tokens in the 'text' column
text_lengths = train_df['text'].map(lambda caption: len(caption.split()))

font = {'family': 'Serif', 'weight': 'bold', 'size': 12}  # title and axis labels
custom_colors = ['#e34861']  # single histogram colour

plt.figure(figsize=(8, 4))
plt.hist(text_lengths, bins=15, color=custom_colors, alpha=0.7)

# Title and axis labels
plt.title('Text Length Distribution', fontdict=font)
plt.xlabel('Length of Text', fontdict=font)
plt.ylabel('Number of Texts', fontdict=font)

# Same tick font on both axes
for style_ticks in (plt.xticks, plt.yticks):
    style_ticks(fontname='Serif', fontsize=10)
plt.grid(True)
plt.show()

# **Checking if there is any null values in the train dataset**

In [None]:
# Count missing entries per column of train_df
# (the 'path' column is reported under the name 'image')
null_image, null_label, null_text = (
    train_df[column].isnull().sum() for column in ('path', 'label', 'text')
)

print(
    f"Null values in 'image': {null_image}",
    f"Null values in 'label': {null_label}",
    f"Null values in 'text': {null_text}",
    sep="\n",
)

# **Test Dataset**

In [None]:
# Preview the first rows of the test frame (columns: path, label, text)
test_df.head()

In [None]:
# Summarise columns, data types and non-null counts of the test frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nInformation about the test dataset:")
test_df.info()

# **Image size in test dataset**

In [None]:
from PIL import Image

# Open the first image listed in test_df and read its dimensions.
# The context manager closes the underlying file once the size has been read.
image_path = test_df['path'].iloc[0]
with Image.open(image_path) as image:
    # PIL reports size as (width, height) in pixels
    image_size = image.size
print("Image size:", image_size)

# **Visualization of Label Distribution in Test Dataset**
* # 0 - Not Hateful
* # 1 - Hateful

In [None]:
# Count the memes in each class. Sorting by label (0, then 1) ties every bar's
# position and colour to its class instead of to whichever class happens to be
# more frequent in this split.
label_counts = test_df['label'].value_counts().sort_index()

# Bar colours, applied in label order: 0 (not hateful) first, 1 (hateful) second
custom_colors = ['#73aeea', '#2595b0']

# Font settings shared by the title, axis labels, x ticks and bar annotations
font = {'family': 'Serif', 'weight': 'bold', 'size': 12}

plt.figure(figsize=(6, 5))

# One bar per class, with dashed horizontal grid lines
bars = plt.bar(label_counts.index, label_counts.values, color=custom_colors)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Title and axis labels
plt.title('Hateful or Not Hateful meme Distribution', fontdict=font)
plt.xlabel('Labels', fontdict=font)
plt.ylabel('Number of Labels', fontdict=font)

# Put one x tick under each bar, labelled with the class value itself
plt.xticks(label_counts.index, label_counts.index, fontdict=font)
plt.yticks(fontname='Serif', fontsize=10)

# Write each class count just above its bar
for bar, count in zip(bars, label_counts.values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
             ha='center', va='bottom', fontdict=font)

plt.show()


# **Visualization of Text Length Distribution in Test Dataset**

In [None]:
# Caption length = number of whitespace-separated tokens in the 'text' column
text_lengths = test_df['text'].map(lambda caption: len(caption.split()))

font = {'family': 'Serif', 'weight': 'bold', 'size': 12}  # title and axis labels
custom_colors = ['#e34861']  # single histogram colour

plt.figure(figsize=(8, 4))
plt.hist(text_lengths, bins=15, color=custom_colors, alpha=0.7)

# Title and axis labels
plt.title('Text Length Distribution', fontdict=font)
plt.xlabel('Length of Text', fontdict=font)
plt.ylabel('Number of Texts', fontdict=font)

# Same tick font on both axes
for style_ticks in (plt.xticks, plt.yticks):
    style_ticks(fontname='Serif', fontsize=10)
plt.grid(True)
plt.show()

# **Checking if there is any null values in the test dataset**

In [None]:
# Count missing entries per column of test_df
# (the 'path' column is reported under the name 'image')
null_image, null_label, null_text = (
    test_df[column].isnull().sum() for column in ('path', 'label', 'text')
)

print(
    f"Null values in 'image': {null_image}",
    f"Null values in 'label': {null_label}",
    f"Null values in 'text': {null_text}",
    sep="\n",
)

# **Validation Dataset**

In [None]:
# Preview the first rows of the validation frame (columns: path, label, text)
validation_df.head()

In [None]:
# Summarise columns, data types and non-null counts of the validation frame.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\nInformation about the validation dataset:")
validation_df.info()

# **Image size in validation dataset**

In [None]:
from PIL import Image

# Open the first image listed in validation_df and read its dimensions.
# The context manager closes the underlying file once the size has been read.
image_path = validation_df['path'].iloc[0]
with Image.open(image_path) as image:
    # PIL reports size as (width, height) in pixels
    image_size = image.size
print("Image size:", image_size)

# **Visualization of Label Distribution in Validation Dataset**
* # 0 - Not Hateful
* # 1 - Hateful

In [None]:
# Count the memes in each class. Sorting by label (0, then 1) ties every bar's
# position and colour to its class instead of to whichever class happens to be
# more frequent in this split.
label_counts = validation_df['label'].value_counts().sort_index()

# Bar colours, applied in label order: 0 (not hateful) first, 1 (hateful) second
custom_colors = ['#73aeea', '#2595b0']

# Font settings shared by the title, axis labels, x ticks and bar annotations
font = {'family': 'Serif', 'weight': 'bold', 'size': 12}

plt.figure(figsize=(6, 5))

# One bar per class, with dashed horizontal grid lines
bars = plt.bar(label_counts.index, label_counts.values, color=custom_colors)
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Title and axis labels
plt.title('Hateful or Not Hateful meme Distribution', fontdict=font)
plt.xlabel('Labels', fontdict=font)
plt.ylabel('Number of Labels', fontdict=font)

# Put one x tick under each bar, labelled with the class value itself
plt.xticks(label_counts.index, label_counts.index, fontdict=font)
plt.yticks(fontname='Serif', fontsize=10)

# Write each class count just above its bar
for bar, count in zip(bars, label_counts.values):
    plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(count),
             ha='center', va='bottom', fontdict=font)

plt.show()


# **Visualization of Text Length Distribution in Validation Dataset**

In [None]:
# Caption length = number of whitespace-separated tokens in the 'text' column
text_lengths = validation_df['text'].map(lambda caption: len(caption.split()))

font = {'family': 'Serif', 'weight': 'bold', 'size': 12}  # title and axis labels
custom_colors = ['#e34861']  # single histogram colour

plt.figure(figsize=(8, 4))
plt.hist(text_lengths, bins=15, color=custom_colors, alpha=0.7)

# Title and axis labels
plt.title('Text Length Distribution', fontdict=font)
plt.xlabel('Length of Text', fontdict=font)
plt.ylabel('Number of Texts', fontdict=font)

# Same tick font on both axes
for style_ticks in (plt.xticks, plt.yticks):
    style_ticks(fontname='Serif', fontsize=10)
plt.grid(True)
plt.show()

# **Checking if there is any null values in the validation dataset**

In [None]:
# Count missing entries per column of the validation frame.
# (This cell previously inspected test_df, so the validation split was never checked.)
null_image = validation_df['path'].isnull().sum()
null_label = validation_df['label'].isnull().sum()
null_text = validation_df['text'].isnull().sum()

# The 'path' column is reported under the name 'image'
print(f"Null values in 'image': {null_image}")
print(f"Null values in 'label': {null_label}")
print(f"Null values in 'text': {null_text}")

In [None]:
!pip install torch torchvision transformers

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from transformers import BertTokenizer, BertModel
import torchvision.models as models

# **Custom Multimodal Dataset**

In [None]:
from torchvision import transforms

# Image preprocessing applied to every meme before it reaches the image model
preprocessing_steps = [
    transforms.Resize(256),      # resize with target size 256
    transforms.CenterCrop(256),  # keep the central 256x256 region
    transforms.ToTensor(),       # PIL image -> float tensor
    # Per-channel normalisation constants
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
]
transform = transforms.Compose(preprocessing_steps)

class MyMultimodalDataset(Dataset):
    """Map-style dataset yielding ``(image, text, label)`` triples.

    Args:
        image_paths: Sequence (list, pandas Series, ...) of image file paths.
        texts: Sequence of captions, aligned with ``image_paths``.
        labels: Sequence of class labels, aligned with ``image_paths``.
        transform: Optional callable applied to the RGB PIL image.

    Raises:
        ValueError: If the three sequences differ in length.
    """

    def __init__(self, image_paths, texts, labels, transform=None):
        # Materialise as lists so that __getitem__ indexes by position. A pandas
        # Series indexes by label with [], which raises KeyError or returns the
        # wrong row once the source frame has been filtered, shuffled or re-indexed.
        self.image_paths = list(image_paths)
        self.texts = list(texts)
        self.labels = list(labels)
        if not (len(self.image_paths) == len(self.texts) == len(self.labels)):
            raise ValueError(
                "image_paths, texts and labels must have the same length, got "
                f"{len(self.image_paths)}, {len(self.texts)} and {len(self.labels)}"
            )
        self.transform = transform

    def __len__(self):
        """Return the number of samples."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(image, text, label)``.

        The image is an RGB PIL image, or whatever ``transform`` returns for it
        when a transform was supplied.
        """
        img_path = self.image_paths[idx]
        text = self.texts[idx]
        label = self.labels[idx]

        # The context manager releases the file handle promptly; convert()
        # returns a loaded copy that stays usable after the file is closed.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert('RGB')
        if self.transform is not None:
            image = self.transform(image)  # Apply the composed transformation

        return image, text, label


# **Dataset and Dataloader**

In [None]:
# Wrap each split's path/text/label columns in a MyMultimodalDataset that shares
# the same image transform
train_dataset, test_dataset, val_dataset = (
    MyMultimodalDataset(frame['path'], frame['text'], frame['label'], transform=transform)
    for frame in (train_df, test_df, validation_df)
)

# Batches of 10 everywhere; only the training loader shuffles
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=10)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=10)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=10)

# **MobileNet V3 for image feature extractor**
* https://pytorch.org/vision/main/models/mobilenetv3.html

In [None]:
import torch
import torchvision.models as models

# Load MobileNetV3-Large with the IMAGENET1K_V1 weights and keep every top-level
# child module except the last one (the classification layer), so the network
# outputs image features instead of class scores.
mobilenet_v3_large = torch.nn.Sequential(
    *list(models.mobilenet_v3_large(weights='IMAGENET1K_V1', progress=True).children())[:-1]
)

# **mBERT for text feature extractor**

In [None]:
from transformers import BertTokenizer, BertModel
try:
    # transformers' own AdamW has been deprecated and is missing from recent
    # releases; importing it unconditionally would abort this cell there.
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW
# Initialize the multilingual BERT tokenizer and encoder from the same checkpoint
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained("bert-base-multilingual-cased")

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

In [None]:
# Move the image feature extractor to the selected device. Module.to() works in
# place and returns the module, so the cell also displays its structure.
mobilenet_v3_large.to(device)

In [None]:
# Move the text encoder to the selected device. Module.to() works in place and
# returns the module, so the cell also displays its structure.
bert_model.to(device)

In [None]:
import torch
import time
from torch.optim import AdamW
from torchvision import transforms
from PIL import Image
from tqdm import tqdm

# **Optimizer and Loss Function**

In [None]:
# Loss for the classification task
criterion = torch.nn.CrossEntropyLoss()

# A single AdamW instance updates both encoders: the image model's parameters
# first, followed by BERT's
encoder_parameters = [*mobilenet_v3_large.parameters(), *bert_model.parameters()]
optimizer = AdamW(encoder_parameters, lr=2e-5)

# **Training starts here**

In [None]:
# Fine-tune both encoders together with one classification head.
#
# Fixes over the earlier version of this cell:
#   * the head is created once and registered with the optimizer; it used to be
#     re-created with random weights for every batch and was never updated;
#   * validation uses that same head instead of a fresh random one;
#   * the encoders run in train() mode while training and eval() mode while validating;
#   * image features are flattened from dim 1, so a batch holding a single sample
#     keeps its batch axis.

num_epochs = 1
num_classes = 2
max_seq_length = 100  # Captions are padded or truncated to this many tokens

train_losses = []
train_accuracies = []
val_losses = []
val_accuracies = []


def _extract_combined_feats(batch_images, batch_texts):
    """Return the fused features of one batch as a [batch, features] tensor.

    The image features (flattened output of mobilenet_v3_large) are concatenated
    with the text features (BERT hidden state at sequence position 0).
    """
    batch_images = batch_images.to(device)

    # Tokenise the whole batch at once; every caption is padded to max_seq_length
    encoded = bert_tokenizer(list(batch_texts), padding='max_length', truncation=True,
                             max_length=max_seq_length, return_tensors='pt')
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # flatten(start_dim=1) always keeps the batch axis, unlike squeeze()
    img_feats = mobilenet_v3_large(batch_images).flatten(start_dim=1)

    outputs = bert_model(input_ids, attention_mask=attention_mask)
    text_feats = outputs.last_hidden_state[:, 0, :]

    return torch.cat((img_feats, text_feats), dim=1)


# The head is built when the first batch reveals the fused feature width
classifier = None

start_time = time.time()

# Training loop
for epoch in range(num_epochs):
    # Training mode for everything that is being optimised
    mobilenet_v3_large.train()
    bert_model.train()
    if classifier is not None:
        classifier.train()

    running_train_loss = 0.0
    correct_train = 0
    total_train = 0

    for images, texts, labels in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}', leave=False):
        labels = labels.to(device)

        optimizer.zero_grad()

        # `combined_feats` stays a top-level name: the model-loading cell reads its shape
        combined_feats = _extract_combined_feats(images, texts)

        if classifier is None:
            classifier = torch.nn.Sequential(
                torch.nn.Linear(combined_feats.shape[1], 512),
                torch.nn.ReLU(),
                torch.nn.Dropout(0.5),
                torch.nn.Linear(512, num_classes),
            ).to(device)
            # Register the head so optimizer.step() updates it as well; the new
            # group inherits the optimizer's default hyper-parameters.
            optimizer.add_param_group({'params': list(classifier.parameters())})

        logits = classifier(combined_feats)

        loss = criterion(logits, labels)

        loss.backward()
        optimizer.step()

        running_train_loss += loss.item()
        _, predicted = logits.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()

    # Loss is averaged over batches, accuracy over samples
    epoch_train_loss = running_train_loss / len(train_loader)
    epoch_train_accuracy = correct_train / total_train

    train_losses.append(epoch_train_loss)
    train_accuracies.append(epoch_train_accuracy)

    # Validation loop: inference mode, no gradients
    mobilenet_v3_large.eval()
    bert_model.eval()
    classifier.eval()

    running_val_loss = 0.0
    correct_val = 0
    total_val = 0

    with torch.no_grad():
        for val_images, val_texts, val_labels in val_loader:
            val_labels = val_labels.to(device)

            val_combined_feats = _extract_combined_feats(val_images, val_texts)

            val_logits = classifier(val_combined_feats)
            val_loss = criterion(val_logits, val_labels)

            running_val_loss += val_loss.item()
            _, val_predicted = val_logits.max(1)
            total_val += val_labels.size(0)
            correct_val += val_predicted.eq(val_labels).sum().item()

    epoch_val_loss = running_val_loss / len(val_loader)
    epoch_val_accuracy = correct_val / total_val

    val_losses.append(epoch_val_loss)
    val_accuracies.append(epoch_val_accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}] - "
          f"Train Loss: {epoch_train_loss:.4f}, Train Acc: {epoch_train_accuracy:.4f}, "
          f"Val Loss: {epoch_val_loss:.4f}, Val Acc: {epoch_val_accuracy:.4f}")

end_time = time.time()
execution_time = end_time - start_time
print(f"Total execution time: {execution_time:.2f} seconds")


# **Testing starts here**

In [None]:
import time

# Run the trained models over the test split and collect the predictions.

# Predicted and true class ids, accumulated as plain Python ints
predicted_labels = []
true_labels = []

# Inference mode for all three modules; the classifier must be included so
# that its Dropout layer is disabled
mobilenet_v3_large.eval()
bert_model.eval()
classifier.eval()

# Start the timer
start_time = time.time()

# Iterate over the test_loader without tracking gradients
with torch.no_grad():
    for test_images, test_texts, test_labels in test_loader:
        test_images = test_images.to(device)

        # Tokenise the whole batch at once; every caption is padded to max_seq_length
        test_encoded = bert_tokenizer(list(test_texts), padding='max_length', truncation=True,
                                      max_length=max_seq_length, return_tensors='pt')
        test_input_ids = test_encoded['input_ids'].to(device)
        test_attention_mask = test_encoded['attention_mask'].to(device)

        # Image features; flatten(start_dim=1) keeps the batch axis even for a
        # batch of one sample, which squeeze() would drop
        test_img_feats = mobilenet_v3_large(test_images).flatten(start_dim=1)

        # Text features: BERT hidden state at sequence position 0
        test_outputs = bert_model(test_input_ids, attention_mask=test_attention_mask)
        test_text_feats = test_outputs.last_hidden_state[:, 0, :]

        # Fuse both modalities and classify
        test_combined_feats = torch.cat((test_img_feats, test_text_feats), dim=1)
        test_logits = classifier(test_combined_feats)

        # tolist() yields Python ints, so both lists hold the same element type
        predicted_labels.extend(torch.argmax(test_logits, dim=1).cpu().tolist())
        true_labels.extend(test_labels.tolist())

# Stop the timer
end_time = time.time()

# Calculate the execution time
execution_time = end_time - start_time

# Print predicted and true labels
print("Predicted Labels:", predicted_labels)
print("True Labels:", true_labels)
print(f"Total execution time for testing: {execution_time:.2f} seconds")


# **Evaluation Metrics**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, jaccard_score, log_loss, roc_auc_score, confusion_matrix,classification_report



# Accuracy plus macro-averaged precision, recall, F1 and Jaccard score
macro_average = {'average': 'macro'}
test_accuracy = accuracy_score(true_labels, predicted_labels)
test_precision = precision_score(true_labels, predicted_labels, **macro_average)
test_recall = recall_score(true_labels, predicted_labels, **macro_average)
test_f1 = f1_score(true_labels, predicted_labels, **macro_average)
test_jaccard_score = jaccard_score(true_labels, predicted_labels, **macro_average)

# NOTE(review): log_loss and roc_auc_score are given hard 0/1 predictions here
# rather than class probabilities or scores - confirm that this is intended.
test_log_loss = log_loss(true_labels, predicted_labels)
test_roc_auc_score = roc_auc_score(true_labels, predicted_labels)

# Report every metric on its own line
for metric_line in (
    f'Test Accuracy: {test_accuracy}',
    f'Test Precision: {test_precision}',
    f'Test Recall: {test_recall}',
    f'Test F1 Score: {test_f1}',
    f'Test Jaccard Score: {test_jaccard_score}',
    f'Test Log Loss: {test_log_loss}',
):
    print(metric_line)
print("Test ROC AUC Score:", test_roc_auc_score)

# **Classification Report**

In [None]:
# Per-class precision / recall / F1 table, printed under a heading line
report = classification_report(true_labels, predicted_labels)
print("Classification Report:", report, sep="\n")

# **Confusion Matrix**

In [None]:
# Confusion matrix of the test predictions
conf_matrix = confusion_matrix(true_labels, predicted_labels)

# Shared styling: font for title/labels, "mako" colormap, one tick label per class
font = {'family': 'Serif', 'weight': 'bold', 'size': 12}
custom_palette = sns.color_palette("mako", as_cmap=True)
class_tick_labels = ['0', '1']

plt.figure(figsize=(6, 4))

# Annotated heatmap with integer cell values, separated by white lines
heatmap = sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap=custom_palette,
    linewidths=2,
    linecolor='white',
    xticklabels=class_tick_labels,
    yticklabels=class_tick_labels,
    annot_kws={"family": "Serif", 'weight': 'bold', 'size': 12},
)

# Title and axis labels
heatmap.set_title('Multimodal Hateful Meme Classification', fontdict=font)
heatmap.set_xlabel('Predicted Labels', fontdict=font)
heatmap.set_ylabel('True Labels', fontdict=font)

# Tick label font on both axes
heatmap.set_xticklabels(heatmap.get_xticklabels(), fontname='Serif', fontsize=12)
heatmap.set_yticklabels(heatmap.get_yticklabels(), fontname='Serif', fontsize=12)

# Style the colour bar that the heatmap created
cbar = heatmap.collections[0].colorbar
cbar.ax.tick_params(labelsize=10)
cbar.set_label('Count', fontdict=font)

plt.show()


# **Prediction on random (3) images from test dataset**

In [None]:
import matplotlib.pyplot as plt

# Show a few random test memes together with their true and predicted classes.
#
# The earlier version compared the complete true_labels / predicted_labels lists
# with 0 (always False, so every title said "Hateful"), overwrote the
# predicted_labels list with a string, and never ran the model on the sample.

# Inference mode for all three modules
mobilenet_v3_large.eval()
bert_model.eval()
classifier.eval()

# Define the number of samples to display
num_samples = 3

class_names = {0: 'Not Hateful', 1: 'Hateful'}

# Inverse of the Normalize step in `transform` (keep the constants in sync), so
# that imshow receives pixel values in [0, 1]
norm_mean = torch.tensor([0.485, 0.456, 0.406]).view(3, 1, 1)
norm_std = torch.tensor([0.229, 0.224, 0.225]).view(3, 1, 1)

fig, axes = plt.subplots(1, num_samples, figsize=(13, 5))

for i in range(num_samples):
    # Choose a random index
    idx = np.random.randint(len(test_loader.dataset))

    # Get the sample using the index
    image, text, label = test_loader.dataset[idx]

    # Predict the class of this single sample (a batch of size one)
    with torch.no_grad():
        sample_encoded = bert_tokenizer(text, padding='max_length', truncation=True,
                                        max_length=max_seq_length, return_tensors='pt')
        sample_img_feats = mobilenet_v3_large(image.unsqueeze(0).to(device)).flatten(start_dim=1)
        sample_outputs = bert_model(sample_encoded['input_ids'].to(device),
                                    attention_mask=sample_encoded['attention_mask'].to(device))
        sample_text_feats = sample_outputs.last_hidden_state[:, 0, :]
        sample_logits = classifier(torch.cat((sample_img_feats, sample_text_feats), dim=1))

    predicted_name = class_names[int(sample_logits.argmax(dim=1).item())]
    true_name = class_names[int(label)]

    # Undo the normalisation and move channels last (H, W, C) for Matplotlib
    display_image = (image * norm_std + norm_mean).clamp(0, 1).permute(1, 2, 0).numpy()

    # Display the image using Matplotlib
    axes[i].imshow(display_image)
    axes[i].set_title(f"Sample {i + 1}\nTrue Label: {true_name}\nPredicted Label: {predicted_name}")
    axes[i].axis('off')

    # Print statements for true and predicted labels
    print(f"Sample {i + 1}: True Label - {true_name}, Predicted Label - {predicted_name}")

plt.tight_layout()
plt.show()


# **Saving the model, tokenizer and classifier**

In [None]:
# Write the weights of both encoders (image model first, then mBERT)
for module_to_save, checkpoint_path in (
    (mobilenet_v3_large, '/kaggle/working/multimodal_Meme_mobilenet_v3_large_Model.pth'),
    (bert_model, '/kaggle/working/multimodal_Meme_mBERT_Model.pth'),
):
    torch.save(module_to_save.state_dict(), checkpoint_path)

# Save the tokenizer files. save_pretrained() treats its argument as a directory,
# so despite the .json suffix this path becomes a folder.
bert_tokenizer.save_pretrained('/kaggle/working/multimodal_Meme_mBERT_Tokenizer.json')

# The classification head lives outside both encoders and gets its own checkpoint
torch.save(classifier.state_dict(), "/kaggle/working/multimodal_Meme_classifier.pth")


# **Loading the model, tokenizer and classifier**

In [None]:
# Restore the encoder weights. map_location remaps the stored tensors onto the
# current device, so a checkpoint written on a GPU also loads in a CPU-only session.
mobilenet_v3_large.load_state_dict(
    torch.load('/kaggle/working/multimodal_Meme_mobilenet_v3_large_Model.pth', map_location=device))
bert_model.load_state_dict(
    torch.load('/kaggle/working/multimodal_Meme_mBERT_Model.pth', map_location=device))

# Load the tokenizer. from_pretrained() returns a new tokenizer object, so the
# result has to be assigned; calling it on the old instance and discarding the
# return value left the old tokenizer in use.
bert_tokenizer = BertTokenizer.from_pretrained('/kaggle/working/multimodal_Meme_mBERT_Tokenizer.json')

num_classes = 2

# The classifier must be rebuilt with the saved architecture before its weights
# can be loaded. The input width is read from the checkpoint itself ('0.weight'
# is the first Linear layer's [out_features, in_features] matrix), so this cell
# no longer depends on a variable left over from the training loop.
classifier_state = torch.load("/kaggle/working/multimodal_Meme_classifier.pth", map_location=device)
classifier = torch.nn.Sequential(
    torch.nn.Linear(classifier_state['0.weight'].shape[1], 512),
    torch.nn.ReLU(),
    torch.nn.Dropout(0.5),
    torch.nn.Linear(512, num_classes),
).to(device)
# load the classifier separately
classifier.load_state_dict(classifier_state)