In [22]:
# @title CELL 1.1: Environment Setup
from google.colab import drive
import os
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive', force_remount=True)
else:
    print("Drive already mounted at /content/drive")
!pip install -q transformers==4.36.0 sentence-transformers==2.2.2 pyswarm scikit-learn pandas numpy tqdm easyocr
import torch, transformers, pandas as pd, numpy as np, json
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from tqdm import tqdm
torch.manual_seed(42)
device = torch.device('cuda')
torch.backends.cudnn.benchmark = True
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # Debug CUDA errors
os.environ['TORCH_USE_CUDA_DSA'] = '1'   # Enable device-side assertions
if torch.cuda.is_available():
    torch.cuda.init()  # Initialize CUDA context
    torch.cuda.reset_peak_memory_stats()  # Reset GPU memory stats
    torch.cuda.synchronize()  # Ensure GPU operations are complete
    try:
        torch.cuda.empty_cache()
        print("GPU cache cleared successfully")
    except Exception as e:
        print(f"Warning: Could not clear GPU cache: {e}")
print(f"VRAM allocated: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(f"VRAM reserved: {torch.cuda.memory_reserved() / 1024**3:.2f} GB")
os.makedirs('/content/drive/MyDrive/Thesis/output', exist_ok=True)
# Setup environment for GPU-accelerated training with CUDA debugging.

Drive already mounted at /content/drive
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
GPU cache cleared successfully
VRAM allocated: 0.00 GB
VRAM reserved: 0.00 GB


In [23]:
# @title CELL 2.1: Load Data
df = pd.read_excel('/content/drive/MyDrive/Thesis/dataset/categorized_lexicons.xlsx')
df['label'] = df['Label'].map({'Hate': 1, 'Non-hate': 0})
df['language'] = df['language'].map({'English': 0, 'Igbo': 1, 'Yoruba': 2, 'Hausa': 3})
print(df.shape, df['label'].value_counts(), df['language'].value_counts())
# Load and encode dataset (1500 samples).

(1500, 6) label
0    900
1    600
Name: count, dtype: int64 language
0    500
2    336
1    333
3    331
Name: count, dtype: int64


In [24]:
# @title CELL 2.2: Preprocess Images and Text
from PIL import Image
from torchvision import transforms
from transformers import AutoTokenizer, DistilBertTokenizer
import easyocr
import unicodedata
import os
clip_transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize((0.481, 0.457, 0.408), (0.269, 0.271, 0.282))])
resnet_transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor(), transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
print("Initializing EasyOCR reader...")
reader = easyocr.Reader(['en'], gpu=False)  # Run on CPU to avoid CUDA error
print("Loading llava_tokenizer...")
llava_tokenizer = AutoTokenizer.from_pretrained('Davlan/afro-xlmr-base', force_download=True)
print("Loading baseline_tokenizer...")
baseline_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-multilingual-cased', force_download=True)
df['caption'] = df['caption'].fillna('').astype(str).apply(lambda x: unicodedata.normalize('NFKD', x))  # Normalize captions
# Debug: Check for invalid captions and image paths
invalid_captions = df['caption'].apply(lambda x: not isinstance(x, str) or len(x.strip()) == 0)
invalid_paths = df['image_path'].apply(lambda x: not os.path.exists('/content/drive/MyDrive/Thesis/dataset/images/' + str(x)))
if invalid_captions.any():
    print(f"Warning: {invalid_captions.sum()} invalid captions found")
if invalid_paths.any():
    print(f"Warning: {invalid_paths.sum()} invalid image paths found")
print(df[['image_path', 'caption', 'language']].head(10))
# Preprocess images and use captions directly.



Initializing EasyOCR reader...
Loading llava_tokenizer...




tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/398 [00:00<?, ?B/s]

Loading baseline_tokenizer...


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

    image_path                        caption  language
0   frame1.png                     Inyamụrị         1
1   frame2.png                       Ndi ocha         1
2   frame3.png                     Umu Aboki          1
3   frame4.png      Ndi Yoruba bụ aghụghọ          1
4   frame5.png                            Zoo         1
5   frame6.png                       Ndị ara         1
6   frame7.png              I lụ ụmụ Hausa         1
7   frame8.png   Kwụsị ịna-eme ofe mmanụ          1
8   frame9.png          Ndi Yoruba egbu anyị         1
9  frame10.png    Hapụ ndị na-enweghị isi          1


In [25]:
# @title CELL 2.3: Cross-Validation Splits
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
df['stratify'] = df['label'].astype(str) + '_' + df['language'].astype(str)
folds = [(train_idx, test_idx) for train_idx, test_idx in skf.split(df, df['stratify'])]
print([len(fold[0]) for fold in folds], [len(fold[1]) for fold in folds])
# Generate balanced CV splits.

[1200, 1200, 1200, 1200, 1200] [300, 300, 300, 300, 300]


In [26]:
# @title CELL 3.0: Evaluation Function and Models
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import CLIPVisionModel, AutoModel
def evaluate(model, loader):
    model.eval()
    preds, labels, probs = [], [], []
    with torch.no_grad():
        for batch in loader:
            logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
            preds.extend(logits.argmax(dim=1).cpu().numpy())
            labels.extend(batch['label'].cpu().numpy())
            probs.extend(torch.softmax(logits, dim=1)[:, 1].cpu().numpy())
    acc = accuracy_score(labels, preds)
    prec, rec, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    try:
        auc = roc_auc_score(labels, probs)
    except:
        auc = float('nan')
    return acc, prec, rec, f1, auc
class MemeDataset(Dataset):
    def __init__(self, df, transform, tokenizer):
        self.df = df
        self.transform = transform
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open('/content/drive/MyDrive/Thesis/dataset/images/' + row['image_path']).convert('RGB')
        img = self.transform(img)
        txt = self.tokenizer(row['caption'], padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        return {'image': img, 'input_ids': txt['input_ids'].squeeze(), 'attention_mask': txt['attention_mask'].squeeze(), 'label': torch.tensor(row['label'], dtype=torch.long)}
class LLaVAModel(nn.Module):
    def __init__(self, params):
        super().__init__()
        self.clip = CLIPVisionModel.from_pretrained('openai/clip-vit-base-patch32')
        self.afriberta = AutoModel.from_pretrained('Davlan/afro-xlmr-base')
        self.proj = nn.Linear(768, 768)
        self.fc = nn.Sequential(
            nn.Linear(768, int(params[5])),
            nn.ReLU(),
            nn.Dropout(params[4]),
            nn.Linear(int(params[5]), 2)
        )
        self.alpha = nn.Parameter(torch.tensor(0.5))
        for p in self.clip.parameters():
            p.requires_grad = False
        for p in self.afriberta.parameters():
            p.requires_grad = False
    def forward(self, img, txt, mask):
        img_feat = self.clip(img).last_hidden_state[:, 0, :]
        img_feat = self.proj(img_feat)
        txt_feat = self.afriberta(input_ids=txt, attention_mask=mask).last_hidden_state[:, 0, :]
        return self.alpha * self.fc(img_feat) + (1 - self.alpha) * self.fc(txt_feat)
# Define evaluation function and LLaVA model.

  _torch_pytree._register_pytree_node(


In [32]:
# @title CELL 3.1: PSO Setup
from pyswarm import pso
from torch.utils.data import DataLoader # Import DataLoader
from torch.cuda.amp import GradScaler, autocast # Import GradScaler and autocast

def objective(params):
    lr, flr, clr, wd, dr, hd, bs = params
    # Ensure device is defined within the function's scope or is globally accessible
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = LLaVAModel(params).to(device)
    optimizer = torch.optim.Adam([{'params': model.proj.parameters(), 'lr': lr*flr}, {'params': model.fc.parameters(), 'lr': lr*clr}], weight_decay=wd)
    train_df, val_df = df.iloc[folds[0][0]], df.iloc[folds[0][1]]

    # Use llava_tokenizer which is defined in cell fdaAnB_o-1wg
    train_loader = DataLoader(MemeDataset(train_df, clip_transform, llava_tokenizer), batch_size=int(bs))
    val_loader = DataLoader(MemeDataset(val_df, clip_transform, llava_tokenizer), batch_size=int(bs))

    model.train()
    scaler = GradScaler() # Use imported GradScaler
    for epoch in range(2): # Iterate for 2 epochs
        for batch in train_loader:
            optimizer.zero_grad()
            with autocast(): # Use imported autocast
                logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
                loss = torch.nn.CrossEntropyLoss()(logits, batch['label'].to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

    acc, _, _, _, _ = evaluate(model, val_loader)
    return -acc

# Ensure necessary variables like df, folds, clip_transform, llava_tokenizer, MemeDataset, LLaVAModel, evaluate are defined in previous cells and accessible.

lb = [1e-5, 0.1, 0.1, 1e-6, 0.1, 256, 8]
ub = [1e-3, 1.0, 1.0, 1e-4, 0.5, 1024, 16]

# Check if required variables are defined before calling pso
if 'df' in globals() and 'folds' in globals() and 'clip_transform' in globals() and 'llava_tokenizer' in globals() and 'MemeDataset' in globals() and 'LLaVAModel' in globals() and 'evaluate' in globals() and 'device' in globals():
    best_params, _ = pso(objective, lb, ub, swarmsize=10, maxiter=12)
    print('Best params:', dict(zip(['lr', 'flr', 'clr', 'wd', 'dr', 'hd', 'bs'], best_params)))
else:
    print("Error: Necessary variables for PSO are not defined. Please run previous cells.")

# Optimize LLaVA hyperparameters with 12 iterations.

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler() # Use imported GradScaler
  with autocast(): # Use imported autocast
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler() # Use imported GradScaler
  with autocast(): # Use imported autocast
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should proba

Stopping search: maximum iterations reached --> 12
Best params: {'lr': np.float64(0.0007524736625526406), 'flr': np.float64(0.22314081292537946), 'clr': np.float64(0.586249349428616), 'wd': np.float64(8.19675420640813e-05), 'dr': np.float64(0.34723101655948285), 'hd': np.float64(417.04264795201635), 'bs': np.float64(11.006986850559457)}


In [34]:
# @title CELL 3.1: DISPLAY PSO Result (best_params)
best_params = [0.0007620625071926831, 0.5315936642420828, 0.14532729200890485, 6.254705018275373e-05, 0.11767602911626884, 528, 14]
print('Best params:', dict(zip(['lr', 'flr', 'clr', 'wd', 'dr', 'hd', 'bs'], best_params)))
# Use saved PSO-optimized hyperparameters (lr, flr, clr, wd, dr, hd, bs).

Best params: {'lr': 0.0007620625071926831, 'flr': 0.5315936642420828, 'clr': 0.14532729200890485, 'wd': 6.254705018275373e-05, 'dr': 0.11767602911626884, 'hd': 528, 'bs': 14}


In [35]:
# @title CELL 4.1: Model Definition (Baseline)
from torchvision.models import resnet50, ResNet50_Weights
from transformers import DistilBertModel
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

class BaselineModel(nn.Module):
    def __init__(self):
        super().__init__()
        try:
            print("Clearing GPU cache before ResNet50...")
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
            print("Loading ResNet50...")
            # Removed torch_dtype=torch.float16
            self.resnet = nn.Sequential(*list(resnet50(weights=ResNet50_Weights.IMAGENET1K_V2).children())[:-1])
            self.resnet = self.resnet.to(device)
            print("ResNet50 loaded successfully")
        except Exception as e:
            print(f"Error loading ResNet50: {e}")
            raise
        try:
            print("Clearing GPU cache before DistilBERT...")
            torch.cuda.synchronize()
            torch.cuda.empty_cache()
            print("Loading DistilBERT...")
            # Removed torch_dtype=torch.float16
            self.distilbert = DistilBertModel.from_pretrained('distilbert-base-multilingual-cased')
            self.distilbert = self.distilbert.to(device)
            print("DistilBERT loaded successfully")
        except Exception as e:
            print(f"Error loading DistilBERT: {e}")
            raise
        self.img_fc = nn.Sequential(nn.Linear(2048, 512), nn.ReLU(), nn.Dropout(0.4), nn.Linear(512, 2)).to(device)
        self.txt_fc = nn.Sequential(nn.Linear(768, 512), nn.ReLU(), nn.Dropout(0.4), nn.Linear(512, 2)).to(device)
        self.alpha = nn.Parameter(torch.tensor(0.5, device=device))
        for p in self.resnet.parameters():
            p.requires_grad = False
        for p in self.distilbert.parameters():
            p.requires_grad = False
    def forward(self, img, txt, mask):
        img_feat = self.resnet(img).squeeze()
        txt_feat = self.distilbert(input_ids=txt, attention_mask=mask).last_hidden_state[:, 0, :]
        return self.alpha * self.img_fc(img_feat) + (1 - self.alpha) * self.txt_fc(txt_feat)
print("Initializing BaselineModel...")
with torch.no_grad():
    model = BaselineModel()
print(f"VRAM after model initialization: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")
print(sum(p.numel() for p in model.parameters()))
# Define baseline model with Late Fusion.

Initializing BaselineModel...
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
VRAM after model initialization: 2.58 GB
159686981


In [36]:
# @title CELL 4.2: Training Loop (Baseline)
from torch.cuda.amp import GradScaler, autocast
scaler = torch.amp.GradScaler('cuda')  # Updated for PyTorch 2.x
for fold, (train_idx, test_idx) in enumerate(folds):
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]
    train_dataset = MemeDataset(train_df, resnet_transform, baseline_tokenizer)
    test_dataset = MemeDataset(test_df, resnet_transform, baseline_tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)
    model = BaselineModel().to(device)
    optimizer = torch.optim.Adam([{'params': model.parameters(), 'lr': 2e-4}], weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    for epoch in range(10):
        model.train()
        if epoch == 6:
            for p in model.resnet.parameters():
                p.requires_grad = True
            # Access distilbert attribute of the model instance
            for p in model.distilbert.parameters():
                p.requires_grad = True
        for batch in tqdm(train_loader, desc=f'Fold {fold+1}, Epoch {epoch+1}'):
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):  # Updated for PyTorch 2.x
                logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
                loss = nn.CrossEntropyLoss()(logits, batch['label'].to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
    torch.save(model.state_dict(), f'/content/drive/MyDrive/Thesis/output/model_fold{fold}.pt')
# Train baseline with CV.

Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully


Fold 1, Epoch 1: 100%|██████████| 75/75 [00:26<00:00,  2.79it/s]
Fold 1, Epoch 2: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 1, Epoch 3: 100%|██████████| 75/75 [00:25<00:00,  2.92it/s]
Fold 1, Epoch 4: 100%|██████████| 75/75 [00:25<00:00,  2.92it/s]
Fold 1, Epoch 5: 100%|██████████| 75/75 [00:25<00:00,  2.92it/s]
Fold 1, Epoch 6: 100%|██████████| 75/75 [00:25<00:00,  2.91it/s]
Fold 1, Epoch 7: 100%|██████████| 75/75 [01:21<00:00,  1.08s/it]
Fold 1, Epoch 8: 100%|██████████| 75/75 [00:36<00:00,  2.08it/s]
Fold 1, Epoch 9: 100%|██████████| 75/75 [00:35<00:00,  2.10it/s]
Fold 1, Epoch 10: 100%|██████████| 75/75 [00:36<00:00,  2.07it/s]


Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully


Fold 2, Epoch 1: 100%|██████████| 75/75 [00:28<00:00,  2.66it/s]
Fold 2, Epoch 2: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 2, Epoch 3: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 2, Epoch 4: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 2, Epoch 5: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 2, Epoch 6: 100%|██████████| 75/75 [00:25<00:00,  2.92it/s]
Fold 2, Epoch 7: 100%|██████████| 75/75 [00:35<00:00,  2.10it/s]
Fold 2, Epoch 8: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]
Fold 2, Epoch 9: 100%|██████████| 75/75 [00:35<00:00,  2.10it/s]
Fold 2, Epoch 10: 100%|██████████| 75/75 [00:35<00:00,  2.10it/s]


Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully


Fold 3, Epoch 1: 100%|██████████| 75/75 [00:26<00:00,  2.85it/s]
Fold 3, Epoch 2: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 3, Epoch 3: 100%|██████████| 75/75 [00:26<00:00,  2.87it/s]
Fold 3, Epoch 4: 100%|██████████| 75/75 [00:26<00:00,  2.86it/s]
Fold 3, Epoch 5: 100%|██████████| 75/75 [00:26<00:00,  2.87it/s]
Fold 3, Epoch 6: 100%|██████████| 75/75 [00:26<00:00,  2.87it/s]
Fold 3, Epoch 7: 100%|██████████| 75/75 [00:36<00:00,  2.08it/s]
Fold 3, Epoch 8: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]
Fold 3, Epoch 9: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]
Fold 3, Epoch 10: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]


Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully


Fold 4, Epoch 1: 100%|██████████| 75/75 [00:26<00:00,  2.82it/s]
Fold 4, Epoch 2: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 4, Epoch 3: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 4, Epoch 4: 100%|██████████| 75/75 [00:26<00:00,  2.87it/s]
Fold 4, Epoch 5: 100%|██████████| 75/75 [00:25<00:00,  2.91it/s]
Fold 4, Epoch 6: 100%|██████████| 75/75 [00:25<00:00,  2.90it/s]
Fold 4, Epoch 7: 100%|██████████| 75/75 [00:35<00:00,  2.08it/s]
Fold 4, Epoch 8: 100%|██████████| 75/75 [00:36<00:00,  2.08it/s]
Fold 4, Epoch 9: 100%|██████████| 75/75 [00:36<00:00,  2.07it/s]
Fold 4, Epoch 10: 100%|██████████| 75/75 [00:36<00:00,  2.07it/s]


Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully


Fold 5, Epoch 1: 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 5, Epoch 2: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 5, Epoch 3: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 5, Epoch 4: 100%|██████████| 75/75 [00:26<00:00,  2.88it/s]
Fold 5, Epoch 5: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 5, Epoch 6: 100%|██████████| 75/75 [00:25<00:00,  2.89it/s]
Fold 5, Epoch 7: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]
Fold 5, Epoch 8: 100%|██████████| 75/75 [00:35<00:00,  2.09it/s]
Fold 5, Epoch 9: 100%|██████████| 75/75 [00:36<00:00,  2.08it/s]
Fold 5, Epoch 10: 100%|██████████| 75/75 [00:36<00:00,  2.08it/s]


In [38]:
# @title Cell 4.3: Evaluation (Baseline)
results = []
for fold, (_, test_idx) in enumerate(folds):
    test_df = df.iloc[test_idx]
    print(f"Fold {fold+1} test set language distribution:\n{test_df['language'].value_counts()}")
    test_dataset = MemeDataset(test_df, resnet_transform, baseline_tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=8)
    model = BaselineModel()
    try:
        model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/model_fold{fold}.pt'))
    except Exception as e:
        print(f"Error loading model for fold {fold+1}: {e}")
        continue
    acc, prec, rec, f1, auc = evaluate(model, test_loader)
    lang_results = {}
    for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
        lang_df = test_df[test_df['language'] == i]
        if len(lang_df) > 0:
            lang_dataset = MemeDataset(lang_df, resnet_transform, baseline_tokenizer)
            lang_loader = DataLoader(lang_dataset, batch_size=min(8, len(lang_df)))
            try:
                metrics = evaluate(model, lang_loader)
                lang_results[lang] = metrics
                print(f"Fold {fold+1}, {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
            except Exception as e:
                print(f"Error evaluating {lang} in fold {fold+1}: {e}")
                lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
        else:
            print(f"No samples for {lang} in fold {fold+1}")
            lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    results.append({
        'fold': fold+1,
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'lang': lang_results,
        'f1_std': np.std([lang_results[l][3] for l in lang_results if lang_results[l][3] != 0.0]) if any(lang_results[l][3] != 0.0 for l in lang_results) else 0.0
    })
# Fallback: Evaluate each language globally
global_lang_results = {}
for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
    lang_df = df[df['language'] == i]
    if len(lang_df) > 0:
        lang_dataset = MemeDataset(lang_df, resnet_transform, baseline_tokenizer)
        lang_loader = DataLoader(lang_dataset, batch_size=min(8, len(lang_df)))
        model = BaselineModel()
        try:
            model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/model_fold0.pt'))
            metrics = evaluate(model, lang_loader)
            global_lang_results[lang] = metrics
            print(f"Global {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
        except Exception as e:
            print(f"Error evaluating global {lang}: {e}")
            global_lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    else:
        print(f"No global samples for {lang}")
        global_lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
print("Global language results:", {k: f"acc={v[0]:.4f}, prec={v[1]:.4f}, rec={v[2]:.4f}, f1={v[3]:.4f}, auc={v[4]:.4f}" for k, v in global_lang_results.items()})
print(pd.DataFrame(results))
# Save results
import pickle
with open('/content/drive/MyDrive/Thesis/output/results.pkl', 'wb') as f:
    pickle.dump(results, f)
with open('/content/drive/MyDrive/Thesis/output/global_lang_results.pkl', 'wb') as f:
    pickle.dump(global_lang_results, f)
# Evaluate baseline performance.

Fold 1 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Fold 1, English: acc=0.2800, prec=0.2786, rec=0.2800, f1=0.2788, auc=0.2468, samples=100
Fold 1, Igbo: acc=0.9104, prec=0.9104, rec=0.9104, f1=0.9095, auc=0.9477, samples=67
Fold 1, Yoruba: acc=0.9254, prec=0.9269, rec=0.9254, f1=0.9240, auc=0.9733, samples=67
Fold 1, Hausa: acc=0.9394, prec=0.9401, rec=0.9394, f1=0.9388, auc=0.9871, samples=66
Fold 2 test set language distribution:
language
0    100
1     67
3     67
2     66
Name: count, dtype: int64
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Fold 2, English: acc=0.4000, prec=0.4000, rec=0.4000, f1=0.4000, auc=0.3200, samples=100
Fold 2, Igbo: acc=0.9254, prec=0.9330, rec=0.9254, f1=0.9228, auc=0.9773, samples=67
Fold 2, Yoruba: acc=0.9242, prec=0.9239, rec=0.9242, f1=0.9238, auc=0.9666, samples=66
Fold 2, Hausa: acc=0.9104, prec=0.9104, rec=0.9104, f1=0.9095, auc=0.9806, samples=67
Fold 3 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Fold 3, English: acc=0.4100, prec=0.4097, rec=0.4100, f1=0.4095, auc=0.3640, samples=100
Fold 3, Igbo: acc=0.9552, prec=0.9552, rec=0.9552, f1=0.9550, auc=0.9951, samples=67
Fold 3, Yoruba: acc=0.9552, prec=0.9561, rec=0.9552, f1=0.9554, auc=0.9911, samples=67
Fold 3, Hausa: acc=0.9394, prec=0.9394, rec=0.9394, f1=0.9394, auc=0.9939, samples=66
Fold 4 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Fold 4, English: acc=0.3600, prec=0.3514, rec=0.3600, f1=0.3506, auc=0.2804, samples=100
Fold 4, Igbo: acc=0.9394, prec=0.9423, rec=0.9394, f1=0.9399, auc=0.9889, samples=66
Fold 4, Yoruba: acc=0.9559, prec=0.9567, rec=0.9559, f1=0.9561, auc=0.9981, samples=68
Fold 4, Hausa: acc=0.9394, prec=0.9423, rec=0.9394, f1=0.9399, auc=0.9960, samples=66
Fold 5 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Fold 5, English: acc=0.4600, prec=0.4322, rec=0.4600, f1=0.3984, auc=0.2864, samples=100
Fold 5, Igbo: acc=0.8788, prec=0.8891, rec=0.8788, f1=0.8807, auc=0.9353, samples=66
Fold 5, Yoruba: acc=0.8971, prec=0.8964, rec=0.8971, f1=0.8965, auc=0.9801, samples=68
Fold 5, Hausa: acc=0.9394, prec=0.9423, rec=0.9394, f1=0.9399, auc=0.9727, samples=66
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Global English: acc=0.8460, prec=0.8463, rec=0.8460, f1=0.8460, auc=0.9145, samples=500
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Global Igbo: acc=0.9790, prec=0.9790, rec=0.9790, f1=0.9789, auc=0.9938, samples=333
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Global Yoruba: acc=0.9851, prec=0.9852, rec=0.9851, f1=0.9851, auc=0.9975, samples=336
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Global Hausa: acc=0.9879, prec=0.9879, rec=0.9879, f1=0.9879, auc=0.9991, samples=331
Global language results: {'English': 'acc=0.8460, prec=0.8463, rec=0.8460, f1=0.8460, auc=0.9145', 'Igbo': 'acc=0.9790, prec=0.9790, rec=0.9790, f1=0.9789, auc=0.9938', 'Yoruba': 'acc=0.9851, prec=0.9852, rec=0.9851, f1=0.9851, auc=0.9975', 'Hausa': 'acc=0.9879, prec=0.9879, rec=0.9879, f1=0.9879, auc=0.9991'}
   fold       acc      prec       rec        f1       auc  \
0     1  0.710000  0.706789  0.710000  0.707418  0.833880   
1     2  0.746667  0.744529  0.746667  0.745052  0.813750   
2     3  0.770000  0.771097  0.770000  0.770471  0.873392   
3     4  0.750000  0.758945  0.750000  0.752073  0.849491   
4     5  0.756667  0.784348  0.756667  0.758981  0.831944   

                                                lang    f1_std  
0  {'English': (0.28, 0.2785829307568438, 0.28, 0...  0.279595  
1  {'English': (0.4, 0.4, 0.4, 0.4, 0.32000000000...  0.224687  
2  {'Engl

In [37]:
# @title Cell 4.3: Evaluation (Baseline)
results = []
for fold, (_, test_idx) in enumerate(folds):
    test_df = df.iloc[test_idx]
    test_dataset = MemeDataset(test_df, resnet_transform, baseline_tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=12)
    model = BaselineModel()
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/model_fold{fold}.pt'))
    acc, prec, rec, f1, auc = evaluate(model, test_loader)
    lang_results = {lang: evaluate(model, DataLoader(MemeDataset(test_df[test_df['language'] == i], resnet_transform, baseline_tokenizer), batch_size=12)) for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']) if len(test_df[test_df['language'] == i]) > 0}
    results.append({'fold': fold+1, 'acc': acc, 'prec': prec, 'rec': rec, 'f1': f1, 'auc': auc, 'lang': lang_results, 'f1_std': np.std([lang_results[l][3] for l in lang_results] if lang_results else 0)})
print(pd.DataFrame(results))
# Evaluate baseline performance.

Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...
DistilBERT loaded successfully
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully
   fold       acc      prec       rec        f1       auc  \
0     1  0.710000  0.706789  0.710000  0.707418  0.833926   
1     2  0.746667  0.744529  0.746667  0.745052  0.813657   
2     3  0.770000  0.771097  0.770000  0.770471  0.873346   
3     4  0.750000  0.758945  0.750000  0.752073  0.849444   
4     5  0.756667  0.784348  0.756667  0.758981  0.832037   

                                                lang    f1_std  
0  {'English': (0.28, 0.2785829307568438, 0.28, 0...  0.279595  
1  {'English': (0.4, 0.4, 0.4, 0.4, 0.31960000000...  0.224687  
2  {'English': (0.41, 0.4096748293857888, 0.41, 0...  0.234119  
3  {'English': (0.36, 0.35144312393887944, 0.36, ...  0.257583  
4  {'English': (0.46, 0.4322493224932249, 0.46, 0...  0.220746  


In [39]:
# @title Cell 5.1: Model Definition (LLaVA)
model = LLaVAModel(best_params).to(device)
print(sum(p.numel() for p in model.parameters()))
# Define LLaVA-inspired model for classification.

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


366497331


In [40]:
# @title Cell 5.2: Training Loop (LLaVA with PSO)
scaler = torch.amp.GradScaler('cuda')
for fold, (train_idx, test_idx) in enumerate(folds):
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]
    train_dataset = MemeDataset(train_df, clip_transform, llava_tokenizer)
    test_dataset = MemeDataset(test_df, clip_transform, llava_tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=min(int(best_params[6]), len(train_dataset)), shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=min(int(best_params[6]), len(test_dataset)))
    model = LLaVAModel(best_params).to(device)
    optimizer = torch.optim.Adam([{'params': model.proj.parameters(), 'lr': best_params[0]*best_params[1]}, {'params': model.fc.parameters(), 'lr': best_params[0]*best_params[2]}], weight_decay=best_params[3])
    for epoch in range(8):
        model.train()
        if epoch == 5:
            for p in model.proj.parameters():
                p.requires_grad = True
            for p in model.fc.parameters():
                p.requires_grad = True
        for batch in tqdm(train_loader, desc=f'Fold {fold+1}, Epoch {epoch+1} (PSO)'):
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
                loss = nn.CrossEntropyLoss()(logits, batch['label'].to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    torch.save(model.state_dict(), f'/content/drive/MyDrive/Thesis/output/llava_pso_fold{fold}.pt')
# Train LLaVA with PSO-optimized hyperparameters.

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1, Epoch 1 (PSO): 100%|██████████| 86/86 [00:28<00:00,  3.04it/s]
Fold 1, Epoch 2 (PSO): 100%|██████████| 86/86 [00:28<00:00,  3.05it/s]
Fold 1, Epoch 3 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.07it/s]
Fold 1, Epoch 4 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.09it/s]
Fold 1, Epoch 5 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.09it/s]
Fold 1, Epoch 6 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.09it/s]
Fold 1, Epoch 7 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.09it/s]
Fold 1, Epoch 8 (PSO): 100%|██████████| 86/86 [00:27<00:00,  3.08it/s]
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized

In [41]:
# @title Cell 5.3: Evaluation (LLaVA with PSO)
llava_pso_results = []
for fold, (_, test_idx) in enumerate(folds):
    test_df = df.iloc[test_idx]
    print(f"Fold {fold+1} test set language distribution:\n{test_df['language'].value_counts()}")
    test_dataset = MemeDataset(test_df, clip_transform, llava_tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=min(int(best_params[6]), len(test_dataset)))
    model = LLaVAModel(best_params).to(device)
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/llava_pso_fold{fold}.pt'))
    acc, prec, rec, f1, auc = evaluate(model, test_loader)
    lang_results = {}
    for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
        lang_df = test_df[test_df['language'] == i]
        if len(lang_df) > 0:
            lang_dataset = MemeDataset(lang_df, clip_transform, llava_tokenizer)
            lang_loader = DataLoader(lang_dataset, batch_size=min(int(best_params[6]), len(lang_df)))
            try:
                metrics = evaluate(model, lang_loader)
                lang_results[lang] = metrics
                print(f"Fold {fold+1}, {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
            except Exception as e:
                print(f"Error evaluating {lang} in fold {fold+1}: {e}")
                lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
        else:
            print(f"No samples for {lang} in fold {fold+1}")
            lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    llava_pso_results.append({
        'fold': fold+1,
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'lang': lang_results,
        'f1_std': np.std([lang_results[l][3] for l in lang_results if lang_results[l][3] != 0.0]) if any(lang_results[l][3] != 0.0 for l in lang_results) else 0.0
    })
# Fallback: Global evaluation for LLaVA
global_llava_results = {}
for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
    lang_df = df[df['language'] == i]
    if len(lang_df) > 0:
        lang_dataset = MemeDataset(lang_df, clip_transform, llava_tokenizer)
        lang_loader = DataLoader(lang_dataset, batch_size=min(int(best_params[6]), len(lang_df)))
        model = LLaVAModel(best_params).to(device)
        try:
            model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/llava_pso_fold0.pt'))
            metrics = evaluate(model, lang_loader)
            global_llava_results[lang] = metrics
            print(f"Global {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
        except Exception as e:
            print(f"Error evaluating global {lang}: {e}")
            global_llava_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    else:
        print(f"No global samples for {lang}")
        global_llava_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
print("Global LLaVA results:", {k: f"acc={v[0]:.4f}, prec={v[1]:.4f}, rec={v[2]:.4f}, f1={v[3]:.4f}, auc={v[4]:.4f}" for k, v in global_llava_results.items()})
print(pd.DataFrame(llava_pso_results))
# Save results
with open('/content/drive/MyDrive/Thesis/output/llava_pso_results.pkl', 'wb') as f:
    pickle.dump(llava_pso_results, f)
with open('/content/drive/MyDrive/Thesis/output/global_llava_results.pkl', 'wb') as f:
    pickle.dump(global_llava_results, f)
# Evaluate LLaVA with PSO performance.

Fold 1 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1, English: acc=0.4700, prec=0.4690, rec=0.4700, f1=0.4657, auc=0.4976, samples=100
Fold 1, Igbo: acc=0.8806, prec=0.8822, rec=0.8806, f1=0.8778, auc=0.9380, samples=67
Fold 1, Yoruba: acc=0.9552, prec=0.9581, rec=0.9552, f1=0.9544, auc=0.9970, samples=67
Fold 1, Hausa: acc=0.8485, prec=0.8486, rec=0.8485, f1=0.8450, auc=0.9563, samples=66
Fold 2 test set language distribution:
language
0    100
1     67
3     67
2     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2, English: acc=0.4200, prec=0.4151, rec=0.4200, f1=0.4115, auc=0.4572, samples=100
Fold 2, Igbo: acc=0.8358, prec=0.8356, rec=0.8358, f1=0.8302, auc=0.9229, samples=67
Fold 2, Yoruba: acc=0.8939, prec=0.8933, rec=0.8939, f1=0.8934, auc=0.9737, samples=66
Fold 2, Hausa: acc=0.9851, prec=0.9857, rec=0.9851, f1=0.9851, auc=0.9961, samples=67
Fold 3 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 3, English: acc=0.5700, prec=0.5703, rec=0.5700, f1=0.5696, auc=0.5788, samples=100
Fold 3, Igbo: acc=0.9104, prec=0.9138, rec=0.9104, f1=0.9113, auc=0.9832, samples=67
Fold 3, Yoruba: acc=0.9104, prec=0.9200, rec=0.9104, f1=0.9119, auc=0.9862, samples=67
Fold 3, Hausa: acc=0.9242, prec=0.9254, rec=0.9242, f1=0.9246, auc=0.9697, samples=66
Fold 4 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 4, English: acc=0.4500, prec=0.4475, rec=0.4500, f1=0.4433, auc=0.4488, samples=100
Fold 4, Igbo: acc=0.8485, prec=0.8479, rec=0.8485, f1=0.8446, auc=0.9596, samples=66
Fold 4, Yoruba: acc=0.9559, prec=0.9559, rec=0.9559, f1=0.9557, auc=0.9896, samples=68
Fold 4, Hausa: acc=0.8485, prec=0.8618, rec=0.8485, f1=0.8390, auc=0.9333, samples=66
Fold 5 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 5, English: acc=0.5900, prec=0.5989, rec=0.5900, f1=0.5806, auc=0.5664, samples=100
Fold 5, Igbo: acc=0.8788, prec=0.8864, rec=0.8788, f1=0.8736, auc=0.9585, samples=66
Fold 5, Yoruba: acc=0.9118, prec=0.9118, rec=0.9118, f1=0.9118, auc=0.9830, samples=68
Fold 5, Hausa: acc=0.7879, prec=0.8205, rec=0.7879, f1=0.7927, auc=0.9191, samples=66


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global English: acc=0.8080, prec=0.8087, rec=0.8080, f1=0.8079, auc=0.8705, samples=500


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Igbo: acc=0.9309, prec=0.9337, rec=0.9309, f1=0.9295, auc=0.9840, samples=333


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Yoruba: acc=0.9821, prec=0.9826, rec=0.9821, f1=0.9820, auc=0.9995, samples=336


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Hausa: acc=0.9305, prec=0.9317, rec=0.9305, f1=0.9295, auc=0.9913, samples=331
Global LLaVA results: {'English': 'acc=0.8080, prec=0.8087, rec=0.8080, f1=0.8079, auc=0.8705', 'Igbo': 'acc=0.9309, prec=0.9337, rec=0.9309, f1=0.9295, auc=0.9840', 'Yoruba': 'acc=0.9821, prec=0.9826, rec=0.9821, f1=0.9820, auc=0.9995', 'Hausa': 'acc=0.9305, prec=0.9317, rec=0.9305, f1=0.9295, auc=0.9913'}
   fold       acc      prec       rec        f1       auc  \
0     1  0.753333  0.751257  0.753333  0.748748  0.859873   
1     2  0.743333  0.746329  0.743333  0.744425  0.847778   
2     3  0.800000  0.804171  0.800000  0.801191  0.901063   
3     4  0.740000  0.739318  0.740000  0.739626  0.845509   
4     5  0.770000  0.778747  0.770000  0.771907  0.855139   

                                                lang    f1_std  
0  {'English': (0.47, 0.4689954526663911, 0.47, 0...  0.189001  
1  {'English': (0.42, 0.4151103565365025, 0.42, 0...  0.219790  
2  {'English': (0.57, 0.5702529104777199, 0

In [42]:
# @title Cell 5.4: Training Loop (LLaVA without PSO)
fixed_params = [1e-4, 0.5, 0.5, 1e-5, 0.3, 512, 16]
scaler = torch.amp.GradScaler('cuda')
for fold, (train_idx, test_idx) in enumerate(folds):
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]
    train_dataset = MemeDataset(train_df, clip_transform, llava_tokenizer)
    test_dataset = MemeDataset(test_df, clip_transform, llava_tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=min(int(fixed_params[6]), len(train_dataset)), shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=min(int(fixed_params[6]), len(test_dataset)))
    model = LLaVAModel(fixed_params).to(device)
    optimizer = torch.optim.Adam([{'params': model.proj.parameters(), 'lr': fixed_params[0]*fixed_params[1]}, {'params': model.fc.parameters(), 'lr': fixed_params[0]*fixed_params[2]}], weight_decay=fixed_params[3])
    for epoch in range(8):
        model.train()
        if epoch == 5:
            for p in model.proj.parameters():
                p.requires_grad = True
            for p in model.fc.parameters():
                p.requires_grad = True
        for batch in tqdm(train_loader, desc=f'Fold {fold+1}, Epoch {epoch+1} (No PSO)'):
            optimizer.zero_grad()
            with torch.amp.autocast('cuda'):
                logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
                loss = nn.CrossEntropyLoss()(logits, batch['label'].to(device))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    torch.save(model.state_dict(), f'/content/drive/MyDrive/Thesis/output/llava_no_pso_fold{fold}.pt')
# Train LLaVA with fixed hyperparameters.

Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1, Epoch 1 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.71it/s]
Fold 1, Epoch 2 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 1, Epoch 3 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.73it/s]
Fold 1, Epoch 4 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 1, Epoch 5 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 1, Epoch 6 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 1, Epoch 7 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.72it/s]
Fold 1, Epoch 8 (No PSO): 100%|██████████| 75/75 [00:27<00:00,  2.70it/s]
Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base a

In [43]:
# @title Cell 5.5: Evaluation (LLaVA without PSO)
llava_no_pso_results = []
for fold, (_, test_idx) in enumerate(folds):
    test_df = df.iloc[test_idx]
    print(f"Fold {fold+1} test set language distribution:\n{test_df['language'].value_counts()}")
    test_dataset = MemeDataset(test_df, clip_transform, llava_tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=min(int(fixed_params[6]), len(test_dataset)))
    model = LLaVAModel(fixed_params).to(device)
    model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/llava_no_pso_fold{fold}.pt'))
    acc, prec, rec, f1, auc = evaluate(model, test_loader)
    lang_results = {}
    for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
        lang_df = test_df[test_df['language'] == i]
        if len(lang_df) > 0:
            lang_dataset = MemeDataset(lang_df, clip_transform, llava_tokenizer)
            lang_loader = DataLoader(lang_dataset, batch_size=min(int(fixed_params[6]), len(lang_df)))
            try:
                metrics = evaluate(model, lang_loader)
                lang_results[lang] = metrics
                print(f"Fold {fold+1}, {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
            except Exception as e:
                print(f"Error evaluating {lang} in fold {fold+1}: {e}")
                lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
        else:
            print(f"No samples for {lang} in fold {fold+1}")
            lang_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    llava_no_pso_results.append({
        'fold': fold+1,
        'acc': acc,
        'prec': prec,
        'rec': rec,
        'f1': f1,
        'auc': auc,
        'lang': lang_results,
        'f1_std': np.std([lang_results[l][3] for l in lang_results if lang_results[l][3] != 0.0]) if any(lang_results[l][3] != 0.0 for l in lang_results) else 0.0
    })
# Fallback: Global evaluation for LLaVA without PSO
global_no_pso_results = {}
for i, lang in enumerate(['English', 'Igbo', 'Yoruba', 'Hausa']):
    lang_df = df[df['language'] == i]
    if len(lang_df) > 0:
        lang_dataset = MemeDataset(lang_df, clip_transform, llava_tokenizer)
        lang_loader = DataLoader(lang_dataset, batch_size=min(int(fixed_params[6]), len(lang_df)))
        model = LLaVAModel(fixed_params).to(device)
        try:
            model.load_state_dict(torch.load(f'/content/drive/MyDrive/Thesis/output/llava_no_pso_fold0.pt'))
            metrics = evaluate(model, lang_loader)
            global_no_pso_results[lang] = metrics
            print(f"Global {lang}: acc={metrics[0]:.4f}, prec={metrics[1]:.4f}, rec={metrics[2]:.4f}, f1={metrics[3]:.4f}, auc={metrics[4]:.4f}, samples={len(lang_df)}")
        except Exception as e:
            print(f"Error evaluating global {lang}: {e}")
            global_no_pso_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
    else:
        print(f"No global samples for {lang}")
        global_no_pso_results[lang] = (0.0, 0.0, 0.0, 0.0, 0.0)
print("Global LLaVA no PSO results:", {k: f"acc={v[0]:.4f}, prec={v[1]:.4f}, rec={v[2]:.4f}, f1={v[3]:.4f}, auc={v[4]:.4f}" for k, v in global_no_pso_results.items()})
print(pd.DataFrame(llava_no_pso_results))
# Save results
with open('/content/drive/MyDrive/Thesis/output/llava_no_pso_results.pkl', 'wb') as f:
    pickle.dump(llava_no_pso_results, f)
with open('/content/drive/MyDrive/Thesis/output/global_no_pso_results.pkl', 'wb') as f:
    pickle.dump(global_no_pso_results, f)
# Evaluate LLaVA without PSO performance.

Fold 1 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1, English: acc=0.5200, prec=0.5201, rec=0.5200, f1=0.5192, auc=0.5160, samples=100
Fold 1, Igbo: acc=0.9104, prec=0.9104, rec=0.9104, f1=0.9095, auc=0.9535, samples=67
Fold 1, Yoruba: acc=0.8955, prec=0.9019, rec=0.8955, f1=0.8969, auc=0.9713, samples=67
Fold 1, Hausa: acc=0.8485, prec=0.8471, rec=0.8485, f1=0.8470, auc=0.9256, samples=66
Fold 2 test set language distribution:
language
0    100
1     67
3     67
2     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2, English: acc=0.5100, prec=0.5100, rec=0.5100, f1=0.5096, auc=0.4812, samples=100
Fold 2, Igbo: acc=0.8507, prec=0.8635, rec=0.8507, f1=0.8413, auc=0.9180, samples=67
Fold 2, Yoruba: acc=0.9091, prec=0.9091, rec=0.9091, f1=0.9091, auc=0.9697, samples=66
Fold 2, Hausa: acc=0.9552, prec=0.9581, rec=0.9552, f1=0.9545, auc=0.9981, samples=67
Fold 3 test set language distribution:
language
0    100
1     67
2     67
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 3, English: acc=0.5700, prec=0.5707, rec=0.5700, f1=0.5689, auc=0.6052, samples=100
Fold 3, Igbo: acc=0.8955, prec=0.8970, rec=0.8955, f1=0.8960, auc=0.9674, samples=67
Fold 3, Yoruba: acc=0.9254, prec=0.9265, rec=0.9254, f1=0.9257, auc=0.9684, samples=67
Fold 3, Hausa: acc=0.9394, prec=0.9394, rec=0.9394, f1=0.9394, auc=0.9434, samples=66
Fold 4 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 4, English: acc=0.5000, prec=0.5000, rec=0.5000, f1=0.4833, auc=0.4360, samples=100
Fold 4, Igbo: acc=0.8485, prec=0.8479, rec=0.8485, f1=0.8446, auc=0.9393, samples=66
Fold 4, Yoruba: acc=0.9118, prec=0.9118, rec=0.9118, f1=0.9118, auc=0.9697, samples=68
Fold 4, Hausa: acc=0.8636, prec=0.8663, rec=0.8636, f1=0.8591, auc=0.9191, samples=66
Fold 5 test set language distribution:
language
0    100
2     68
1     66
3     66
Name: count, dtype: int64


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 5, English: acc=0.5800, prec=0.5812, rec=0.5800, f1=0.5785, auc=0.5960, samples=100
Fold 5, Igbo: acc=0.7727, prec=0.7851, rec=0.7727, f1=0.7499, auc=0.9252, samples=66
Fold 5, Yoruba: acc=0.9265, prec=0.9340, rec=0.9265, f1=0.9242, auc=0.9905, samples=68
Fold 5, Hausa: acc=0.7576, prec=0.7535, rec=0.7576, f1=0.7548, auc=0.8716, samples=66


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global English: acc=0.6860, prec=0.6869, rec=0.6860, f1=0.6856, auc=0.7444, samples=500


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Igbo: acc=0.9009, prec=0.9015, rec=0.9009, f1=0.8991, auc=0.9656, samples=333


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Yoruba: acc=0.9226, prec=0.9226, rec=0.9226, f1=0.9226, auc=0.9844, samples=336


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Global Hausa: acc=0.9063, prec=0.9059, rec=0.9063, f1=0.9057, auc=0.9632, samples=331
Global LLaVA no PSO results: {'English': 'acc=0.6860, prec=0.6869, rec=0.6860, f1=0.6856, auc=0.7444', 'Igbo': 'acc=0.9009, prec=0.9015, rec=0.9009, f1=0.8991, auc=0.9656', 'Yoruba': 'acc=0.9226, prec=0.9226, rec=0.9226, f1=0.9226, auc=0.9844', 'Hausa': 'acc=0.9063, prec=0.9059, rec=0.9063, f1=0.9057, auc=0.9632'}
   fold       acc      prec       rec        f1       auc  \
0     1  0.763333  0.764385  0.763333  0.763782  0.853871   
1     2  0.773333  0.771581  0.773333  0.771889  0.849676   
2     3  0.803333  0.805958  0.803333  0.804204  0.889828   
3     4  0.750000  0.748948  0.750000  0.742876  0.834815   
4     5  0.740000  0.737500  0.740000  0.733681  0.845046   

                                                lang    f1_std  
0  {'English': (0.52, 0.5201288244766505, 0.52, 0...  0.159865  
1  {'English': (0.51, 0.5100361300682457, 0.51, 0...  0.174480  
2  {'English': (0.57, 0.570707070707

In [54]:
# Cell 6.1: Model Comparison
# Load results
with open('/content/drive/MyDrive/Thesis/output/results.pkl', 'rb') as f:
    results = pickle.load(f)
with open('/content/drive/MyDrive/Thesis/output/llava_pso_results.pkl', 'rb') as f:
    llava_pso_results = pickle.load(f)
with open('/content/drive/MyDrive/Thesis/output/llava_no_pso_results.pkl', 'rb') as f:
    llava_no_pso_results = pickle.load(f)

# Compute mean metrics across folds
baseline_metrics = {
    'Model': 'Baseline',
    'Acc': np.mean([r['acc'] for r in results]),
    'Prec': np.mean([r['prec'] for r in results]),
    'Rec': np.mean([r['rec'] for r in results]),
    'F1': np.mean([r['f1'] for r in results]),
    'AUC': np.mean([r['auc'] for r in results]),
    'F1_Std': np.mean([r['f1_std'] for r in results])
}
llava_pso_metrics = {
    'Model': 'LLaVA (PSO)',
    'Acc': np.mean([r['acc'] for r in llava_pso_results]),
    'Prec': np.mean([r['prec'] for r in llava_pso_results]),
    'Rec': np.mean([r['rec'] for r in llava_pso_results]),
    'F1': np.mean([r['f1'] for r in llava_pso_results]),
    'AUC': np.mean([r['auc'] for r in llava_pso_results]),
    'F1_Std': np.mean([r['f1_std'] for r in llava_pso_results])
}
llava_no_pso_metrics = {
    'Model': 'LLaVA (No PSO)',
    'Acc': np.mean([r['acc'] for r in llava_no_pso_results]),
    'Prec': np.mean([r['prec'] for r in llava_no_pso_results]),
    'Rec': np.mean([r['rec'] for r in llava_no_pso_results]),
    'F1': np.mean([r['f1'] for r in llava_no_pso_results]),
    'AUC': np.mean([r['auc'] for r in llava_no_pso_results]),
    'F1_Std': np.mean([r['f1_std'] for r in llava_no_pso_results])
}

# Create comparison DataFrame
comparison_df = pd.DataFrame([baseline_metrics, llava_no_pso_metrics, llava_pso_metrics])
print("\nModel Comparison Table:")
print(comparison_df)

# Perform paired t-tests
baseline_f1 = [r['f1'] for r in results]
llava_pso_f1 = [r['f1'] for r in llava_pso_results]
llava_no_pso_f1 = [r['f1'] for r in llava_no_pso_results]

# T-test: Baseline vs. LLaVA (PSO)
t_stat1, p_val1 = ttest_rel(baseline_f1, llava_pso_f1)
print(f"\nT-test (Baseline vs. LLaVA PSO): t={t_stat1:.4f}, p={p_val1:.4f}")
if p_val1 < 0.05:
    print("Significant difference between Baseline and LLaVA (PSO) F1 scores (p < 0.05)")
else:
    print("No significant difference between Baseline and LLaVA (PSO) F1 scores (p >= 0.05)")

# T-test: LLaVA (PSO) vs. LLaVA (No PSO)
t_stat2, p_val2 = ttest_rel(llava_pso_f1, llava_no_pso_f1)
print(f"T-test (LLaVA PSO vs. LLaVA No PSO): t={t_stat2:.4f}, p={p_val2:.4f}")
if p_val2 < 0.05:
    print("Significant difference between LLaVA (PSO) and LLaVA (No PSO) F1 scores (p < 0.05)")
else:
    print("No significant difference between LLaVA (PSO) and LLaVA (No PSO) F1 scores (p >= 0.05)")

# Save comparison
with open('/content/drive/MyDrive/Thesis/output/model_comparison.pkl', 'wb') as f:
    pickle.dump(comparison_df, f)
# Compare model performance with statistical significance.


Model Comparison Table:
            Model       Acc      Prec       Rec        F1       AUC    F1_Std
0        Baseline  0.746667  0.753142  0.746667  0.746799  0.840491  0.243346
1  LLaVA (No PSO)  0.766000  0.765674  0.766000  0.763286  0.854647  0.155934
2     LLaVA (PSO)  0.761333  0.763964  0.761333  0.761180  0.861873  0.176330

T-test (Baseline vs. LLaVA PSO): t=-1.4600, p=0.2181
No significant difference between Baseline and LLaVA (PSO) F1 scores (p >= 0.05)
T-test (LLaVA PSO vs. LLaVA No PSO): t=-0.1908, p=0.8580
No significant difference between LLaVA (PSO) and LLaVA (No PSO) F1 scores (p >= 0.05)


In [55]:
# Cell 6.2: Cross-Lingual and Fairness Analysis
# Load global results
import pickle
import numpy as np

with open('/content/drive/MyDrive/Thesis/output/global_lang_results.pkl', 'rb') as f:
    global_lang_results = pickle.load(f)
with open('/content/drive/MyDrive/Thesis/output/global_llava_results.pkl', 'rb') as f:
    global_llava_results = pickle.load(f)
with open('/content/drive/MyDrive/Thesis/output/global_no_pso_results.pkl', 'rb') as f:
    global_no_pso_results = pickle.load(f)

# Extract F1 scores for each language
languages = ['English', 'Igbo', 'Yoruba', 'Hausa']
cross_lingual_f1 = {
    'Language': languages,
    'Baseline F1': [global_lang_results[lang][3] for lang in languages],
    'LLaVA (No PSO) F1': [global_no_pso_results[lang][3] for lang in languages],
    'LLaVA (PSO) F1': [global_llava_results[lang][3] for lang in languages]
}
cross_lingual_df = pd.DataFrame(cross_lingual_f1)
print("\nCross-Lingual F1 Scores:")
print(cross_lingual_df)

# Compute fairness (F1 standard deviation across languages)
fairness_metrics = {
    'Model': ['Baseline', 'LLaVA (No PSO)', 'LLaVA (PSO)'],
    'F1_Std Across Languages': [
        np.std([global_lang_results[lang][3] for lang in languages]),
        np.std([global_no_pso_results[lang][3] for lang in languages]),
        np.std([global_llava_results[lang][3] for lang in languages])
    ]
}
fairness_df = pd.DataFrame(fairness_metrics)
print("\nFairness (F1 Std Across Languages):")
print(fairness_df)

# Bar chart for cross-lingual F1 scores
# The chartjs syntax is not supported directly in Colab code cells.
# To generate a chart, you would typically use a library like matplotlib or seaborn.


Cross-Lingual F1 Scores:
  Language  Baseline F1  LLaVA (No PSO) F1  LLaVA (PSO) F1
0  English     0.845970           0.685637        0.807889
1     Igbo     0.978913           0.899131        0.929502
2   Yoruba     0.985073           0.922619        0.982029
3    Hausa     0.987891           0.905654        0.929494

Fairness (F1 Std Across Languages):
            Model  F1_Std Across Languages
0        Baseline                 0.059839
1  LLaVA (No PSO)                 0.097157
2     LLaVA (PSO)                 0.063944


In [64]:
# Cell 6.3: Global Confusion Matrices and Visualizations
from sklearn.metrics import confusion_matrix
import pickle
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Ensure output directory exists
os.makedirs('/content/drive/MyDrive/Thesis/output', exist_ok=True)

# Load results
try:
    with open('/content/drive/MyDrive/Thesis/output/results.pkl', 'rb') as f:
        results = pickle.load(f)
    with open('/content/drive/MyDrive/Thesis/output/llava_pso_results.pkl', 'rb') as f:
        llava_pso_results = pickle.load(f)
    with open('/content/drive/MyDrive/Thesis/output/llava_no_pso_results.pkl', 'rb') as f:
        llava_no_pso_results = pickle.load(f)
    with open('/content/drive/MyDrive/Thesis/output/global_lang_results.pkl', 'rb') as f:
        global_lang_results = pickle.load(f)
    with open('/content/drive/MyDrive/Thesis/output/global_llava_results.pkl', 'rb') as f:
        global_llava_results = pickle.load(f)
    with open('/content/drive/MyDrive/Thesis/output/global_no_pso_results.pkl', 'rb') as f:
        global_no_pso_results = pickle.load(f)
except FileNotFoundError as e:
    print(f"Error: Missing result file - {e}")
    raise

# Function to compute global confusion matrix
def compute_global_cm(model, df, transform, tokenizer, device):
    model.eval()
    preds, labels = [], []
    dataset = MemeDataset(df, transform, tokenizer)
    loader = DataLoader(dataset, batch_size=8)
    try:
        with torch.no_grad():
            for batch in loader:
                logits = model(batch['image'].to(device), batch['input_ids'].to(device), batch['attention_mask'].to(device))
                preds.extend(logits.argmax(dim=1).cpu().numpy())
                labels.extend(batch['label'].cpu().numpy())
        return confusion_matrix(labels, preds, labels=[0, 1])
    except RuntimeError as e:
        print(f"Error during global confusion matrix computation: {e}")
        return np.zeros((2, 2))

# Compute global confusion matrices
models = [
    ('Baseline', BaselineModel, resnet_transform, baseline_tokenizer, '/content/drive/MyDrive/Thesis/output/model_fold0.pt'),
    ('LLaVA (PSO)', lambda: LLaVAModel(best_params), clip_transform, llava_tokenizer, '/content/drive/MyDrive/Thesis/output/llava_pso_fold0.pt'),
    ('LLaVA (No PSO)', lambda: LLaVAModel(fixed_params), clip_transform, llava_tokenizer, '/content/drive/MyDrive/Thesis/output/llava_no_pso_fold0.pt')
]
global_cms = {}

for model_name, model_fn, transform, tokenizer, model_path in models:
    try:
        model = model_fn().to(device)
        model.load_state_dict(torch.load(model_path, map_location=device))
    except Exception as e:
        print(f"Error loading {model_name} model: {e}")
        continue
    global_cms[model_name] = compute_global_cm(model, df, transform, tokenizer, device)
    print(f"\n{model_name} Global Confusion Matrix:\n{global_cms[model_name]}")

    # Plot global confusion matrix as heatmap
    plt.figure(figsize=(6, 4))
    sns.heatmap(global_cms[model_name], annot=True, fmt='d', cmap='Blues', cbar=False,
                xticklabels=['Non-Hate', 'Hate'], yticklabels=['Non-Hate', 'Hate'])
    plt.title(f'{model_name} Global Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.savefig(f'/content/drive/MyDrive/Thesis/output/cm_{model_name.replace(" ", "_").lower()}_global.png')
    plt.close()

# AUC Bar Chart (Matplotlib)
languages = ['English', 'Igbo', 'Yoruba', 'Hausa']
bar_width = 0.25
x = np.arange(len(languages))
plt.figure(figsize=(10, 6))
plt.bar(x - bar_width, [global_lang_results[lang][4] for lang in languages], bar_width, label='Baseline', color='#1f77b4')
plt.bar(x, [global_no_pso_results[lang][4] for lang in languages], bar_width, label='LLaVA (No PSO)', color='#ff7f0e')
plt.bar(x + bar_width, [global_llava_results[lang][4] for lang in languages], bar_width, label='LLaVA (PSO)', color='#2ca02c')
plt.xlabel('Language')
plt.ylabel('AUC Score')
plt.title('Cross-Lingual AUC Scores by Model')
plt.xticks(x, languages)
plt.legend()
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Thesis/output/auc_comparison.png')
plt.close()

# F1 Std Line Chart (Matplotlib)
f1_std_data = {
    'Baseline': [r['f1_std'] for r in results],
    'LLaVA (No PSO)': [r['f1_std'] for r in llava_no_pso_results],
    'LLaVA (PSO)': [r['f1_std'] for r in llava_pso_results]
}
plt.figure(figsize=(10, 6))
plt.plot(range(1, 6), f1_std_data['Baseline'], marker='o', label='Baseline', color='#1f77b4')
plt.plot(range(1, 6), f1_std_data['LLaVA (No PSO)'], marker='s', label='LLaVA (No PSO)', color='#ff7f0e')
plt.plot(range(1, 6), f1_std_data['LLaVA (PSO)'], marker='^', label='LLaVA (PSO)', color='#2ca02c')
plt.xlabel('Fold')
plt.ylabel('F1 Standard Deviation')
plt.title('F1 Standard Deviation Across Folds')
plt.xticks(range(1, 6))
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig('/content/drive/MyDrive/Thesis/output/f1_std_comparison.png')
plt.close()

# Save global confusion matrices
with open('/content/drive/MyDrive/Thesis/output/global_confusion_matrices.pkl', 'wb') as f:
    pickle.dump(global_cms, f)
# Compute global confusion matrices and visualize with Matplotlib.

Clearing GPU cache before ResNet50...
Loading ResNet50...
ResNet50 loaded successfully
Clearing GPU cache before DistilBERT...
Loading DistilBERT...




DistilBERT loaded successfully

Baseline Global Confusion Matrix:
[[861  39]
 [ 54 546]]


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



LLaVA (PSO) Global Confusion Matrix:
[[850  50]
 [ 98 502]]


Some weights of XLMRobertaModel were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



LLaVA (No PSO) Global Confusion Matrix:
[[779 121]
 [126 474]]
