# Multimodal Meme Classification (Self-harm Detection)
## Late Fusion Approach
Notebook ini memisahkan modelling untuk tiap modalitas (Image & Text) sehingga masing-masing model menghasilkan probabilitas klasifikasi sendiri. Di bagian akhir dilakukan *late fusion* (penggabungan prediksi) untuk memperoleh prediksi final yang lebih kuat.

**Strategi:**
1. Train model gambar (CLIP Vision) -> output probabilitas.
2. Train model teks (sentinet/suicidality) -> output probabilitas.
3. Evaluasi masing-masing.
4. Late Fusion: 
   - Simple Average 
   - Weighted Average (grid search bobot terbaik) 
   - Stacking (Logistic Regression meta-classifier).
5. Inferensi akhir dengan bobot terbaik (atau meta-classifier).

## 1. Setup & Install (Jika Perlu di Kaggle)

In [None]:
# !pip install -q torch torchvision transformers pandas numpy pillow scikit-learn matplotlib seaborn tqdm

In [None]:
import os, random, warnings
from datetime import datetime
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from PIL import Image
from transformers import CLIPModel, CLIPProcessor, AutoModel, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
warnings.filterwarnings('ignore')

# Reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU + all CUDA devices) RNGs.

    Also sets the cuDNN flags ``deterministic=True`` and ``benchmark=False``.

    Args:
        seed: Integer seed passed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
set_seed(42)
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print('Device:', device)

## 2. Configuration

In [None]:
# Central configuration shared by the data, training and fusion cells.
CONFIG = dict(
    csv_path='/kaggle/input/your-dataset/labels.csv',  # change to match your dataset
    image_dir='/kaggle/input/your-dataset/images',      # change to match your dataset
    num_classes=2,
    batch_size=16,
    num_epochs_image=8,
    num_epochs_text=8,
    lr_image=2e-5,
    lr_text=2e-5,
    weight_decay=0.01,
    val_split=0.2,
    early_patience=4,
    max_text_length=128,
    checkpoint_dir='/kaggle/working/checkpoints_late',
    results_dir='/kaggle/working/results_late',
    random_state=42,
)

# Make sure both output directories exist before anything is written to them.
for _dir_key in ('checkpoint_dir', 'results_dir'):
    os.makedirs(CONFIG[_dir_key], exist_ok=True)

print('CONFIG:')
for k, v in CONFIG.items():
    print(f'  {k}: {v}')

## 3. Dataset & DataLoader

In [None]:
class MemeDataset(Dataset):
    """Dataset yielding CLIP pixel values, tokenized text and a label per row.

    Each row of ``df`` is read through the columns ``File_Name`` (image file
    name relative to ``image_dir``), ``Teks_Terlihat`` (text; a missing or
    NaN value becomes the empty string) and ``Label``.  ``'Self-harm'`` maps
    to 1 and ``'Non Self-harm'`` to 0; any other label value also maps to 0.

    When ``augment`` is true, a random horizontal flip, a random rotation of
    up to 15 degrees and a resize to 224x224 are applied before the CLIP
    processor.  When it is false, ``img_aug`` holds a plain resize that
    ``__getitem__`` does not apply; the image goes straight to the processor.
    """

    def __init__(self, df, image_dir, clip_processor, text_tokenizer, max_length=128, augment=False):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.clip_processor = clip_processor
        self.text_tokenizer = text_tokenizer
        self.max_length = max_length
        self.augment = augment
        self.label_map = {'Self-harm': 1, 'Non Self-harm': 0}

        resize = transforms.Resize((224, 224))
        if augment:
            self.img_aug = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.5),
                transforms.RandomRotation(15),
                resize,
            ])
        else:
            self.img_aug = resize

    def __len__(self):
        return len(self.df)

    def _load_image(self, img_path):
        """Open ``img_path`` as RGB; on any failure report it and return a black 224x224 image."""
        try:
            return Image.open(img_path).convert('RGB')
        except Exception as e:
            print('Img error:', img_path, e)
            return Image.new('RGB', (224, 224), 'black')

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(self.image_dir, row['File_Name'])
        image = self._load_image(img_path)

        # Augmentation only runs for the training-style pipeline (a Compose).
        if self.augment and isinstance(self.img_aug, transforms.Compose):
            image = self.img_aug(image)

        pixel_batch = self.clip_processor(images=image, return_tensors='pt')
        image_tensor = pixel_batch['pixel_values'].squeeze(0)

        raw_text = row.get('Teks_Terlihat')
        text = str(raw_text) if pd.notna(raw_text) else ''
        encoded = self.text_tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        label = self.label_map.get(row['Label'], 0)
        return {
            'image': image_tensor,
            'text_ids': encoded['input_ids'].squeeze(0),
            'text_mask': encoded['attention_mask'].squeeze(0),
            'label': torch.tensor(label),
        }

In [None]:
def load_data(config):
    """Read the label CSV, split it, and build train/validation loaders.

    Only rows whose ``Label`` is ``'Self-harm'`` or ``'Non Self-harm'`` are
    kept.  The split is stratified on ``Label`` using ``config['val_split']``
    and ``config['random_state']``.

    Args:
        config: Mapping providing ``csv_path``, ``image_dir``, ``val_split``,
            ``random_state``, ``max_text_length`` and ``batch_size``.

    Returns:
        Tuple ``(train_loader, val_loader, clip_processor, text_tokenizer)``.
    """
    frame = pd.read_csv(config['csv_path'])
    keep = frame['Label'].isin(['Self-harm','Non Self-harm'])
    df = frame[keep].copy()

    from sklearn.model_selection import train_test_split
    train_df, val_df = train_test_split(
        df,
        test_size=config['val_split'],
        stratify=df['Label'],
        random_state=config['random_state'],
    )
    print('Total:', len(df), '| Train:', len(train_df), '| Val:', len(val_df))

    clip_processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
    text_tokenizer = AutoTokenizer.from_pretrained('sentinet/suicidality')

    def build_dataset(part, augment):
        return MemeDataset(
            part,
            config['image_dir'],
            clip_processor,
            text_tokenizer,
            max_length=config['max_text_length'],
            augment=augment,
        )

    def build_loader(dataset, shuffle):
        return DataLoader(
            dataset,
            batch_size=config['batch_size'],
            shuffle=shuffle,
            num_workers=2,
            pin_memory=torch.cuda.is_available(),
        )

    # Training data is augmented and shuffled; validation data is neither.
    train_ds = build_dataset(train_df, augment=True)
    val_ds = build_dataset(val_df, augment=False)
    train_loader = build_loader(train_ds, shuffle=True)
    val_loader = build_loader(val_ds, shuffle=False)
    return train_loader, val_loader, clip_processor, text_tokenizer

## 4. Model Image & Text (Terpisah)

In [None]:
class ImageClassifier(nn.Module):
    """Classifier built on the vision tower of ``openai/clip-vit-base-patch32``.

    The pooled output of the CLIP vision model is passed through a
    Linear -> ReLU -> Dropout -> Linear head producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability inside the head.
        freeze: When true, the vision model parameters are excluded from
            gradient computation.
    """

    def __init__(self, num_classes=2, dropout=0.3, freeze=False):
        super().__init__()
        self.clip = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
        if freeze:
            for param in self.clip.vision_model.parameters():
                param.requires_grad = False

        hidden_dim = self.clip.config.vision_config.hidden_size
        layers = [
            nn.Linear(hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, pixel_values):
        """Return class logits for a batch of preprocessed pixel values."""
        vision_out = self.clip.vision_model(pixel_values=pixel_values)
        return self.head(vision_out.pooler_output)
class TextClassifier(nn.Module):
    """Classifier built on the ``sentinet/suicidality`` text encoder.

    Token states are averaged over the positions selected by the attention
    mask, and the resulting vector is passed through a
    Linear -> ReLU -> Dropout -> Linear head producing ``num_classes`` logits.

    Args:
        num_classes: Number of output logits.
        dropout: Dropout probability inside the head.
        freeze: When true, the encoder parameters are excluded from gradient
            computation.
    """

    def __init__(self, num_classes=2, dropout=0.3, freeze=False):
        super().__init__()
        self.text_model = AutoModel.from_pretrained('sentinet/suicidality')
        if freeze:
            for param in self.text_model.parameters():
                param.requires_grad = False

        hidden_dim = self.text_model.config.hidden_size
        layers = [
            nn.Linear(hidden_dim, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        ]
        self.head = nn.Sequential(*layers)

    def forward(self, input_ids, attention_mask):
        """Return class logits from mask-weighted mean-pooled token states."""
        encoded = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoded.last_hidden_state

        # Broadcast the mask over the hidden dimension so padded positions add nothing.
        weights = attention_mask.unsqueeze(-1).expand(token_states.size()).float()
        summed = (token_states * weights).sum(1)
        # The clamp keeps the division finite for a row whose mask is all zeros.
        token_counts = weights.sum(1).clamp(min=1e-9)
        return self.head(summed / token_counts)

## 5. EarlyStopping & Trainer (Single Modality)

In [None]:
class EarlyStopping:
    """Patience counter driven by a validation loss.

    A loss counts as an improvement only if it is lower than the best seen
    so far by more than 1e-4.  ``stop`` becomes true once ``patience``
    consecutive non-improving steps have been observed; an improving step
    resets ``counter`` but leaves ``stop`` unchanged.
    """

    def __init__(self, patience=4):
        self.patience = patience
        self.counter = 0
        self.best = None
        self.stop = False

    def step(self, val_loss):
        """Record ``val_loss`` and update ``best``, ``counter`` and ``stop``."""
        improved = self.best is None or val_loss < self.best - 1e-4
        if improved:
            self.best = val_loss
            self.counter = 0
            return
        self.counter += 1
        self.stop = self.counter >= self.patience
class ModalityTrainer:
    """Train and validate one single-modality classifier with early stopping.

    The model is moved to the module-level ``device``.  Weight decay, early
    stopping patience and the checkpoint directory are read from the
    module-level ``CONFIG``.  The best checkpoint (lowest validation loss
    observed so far) is written to ``<checkpoint_dir>/<name>_best.pth``.

    Args:
        model: ``ImageClassifier`` (fed ``batch['image']``) or any other model
            (fed ``batch['text_ids']`` and ``batch['text_mask']``).
        loader_train: Iterable of training batches (dicts with a ``'label'``
            entry plus the modality inputs above).
        loader_val: Iterable of validation batches with the same layout.
        lr: Learning rate for AdamW.
        epochs: Maximum number of epochs.
        name: Tag used in progress bars and in the checkpoint file name.
    """

    def __init__(self, model, loader_train, loader_val, lr, epochs, name):
        self.model = model.to(device)
        self.tr = loader_train
        self.val = loader_val
        self.lr = lr
        self.epochs = epochs
        self.name = name
        self.crit = nn.CrossEntropyLoss()
        self.opt = optim.AdamW(self.model.parameters(), lr=lr, weight_decay=CONFIG['weight_decay'])
        self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.opt, mode='min', patience=2, factor=0.5)
        self.es = EarlyStopping(patience=CONFIG['early_patience'])
        self.hist = {'train_loss': [], 'val_loss': [], 'val_acc': []}
        self.best_path = os.path.join(CONFIG['checkpoint_dir'], f'{name}_best.pth')
        # Validation loss of the checkpoint currently on disk.  Tracked here,
        # independently of EarlyStopping.best, because the latter only moves
        # on improvements larger than its tolerance.
        self.best_val_loss = None

    def _forward(self, batch):
        """Run the model on the modality it consumes and return the logits."""
        # Only the tensors the model actually uses are copied to the device.
        if isinstance(self.model, ImageClassifier):
            return self.model(batch['image'].to(device))
        return self.model(batch['text_ids'].to(device), batch['text_mask'].to(device))

    def run_epoch(self):
        """Train for one epoch; return ``(mean batch loss, accuracy in %)``."""
        self.model.train()
        total_loss = 0
        correct = 0
        count = 0
        for batch in tqdm(self.tr, desc=f'Train-{self.name}'):
            labels = batch['label'].to(device)
            self.opt.zero_grad()
            logits = self._forward(batch)
            loss = self.crit(logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)
            self.opt.step()
            _, pred = logits.max(1)
            correct += (pred == labels).sum().item()
            count += labels.size(0)
            total_loss += loss.item()
        return total_loss / len(self.tr), 100 * correct / count

    def validate(self):
        """Evaluate on the validation loader; return ``(mean batch loss, accuracy in %)``."""
        self.model.eval()
        total_loss = 0
        correct = 0
        count = 0
        with torch.no_grad():
            for batch in tqdm(self.val, desc=f'Val-{self.name}'):
                labels = batch['label'].to(device)
                logits = self._forward(batch)
                loss = self.crit(logits, labels)
                _, pred = logits.max(1)
                correct += (pred == labels).sum().item()
                count += labels.size(0)
                total_loss += loss.item()
        return total_loss / len(self.val), 100 * correct / count

    def train(self):
        """Run the training loop and return the history dict.

        Each epoch trains, validates, steps the LR scheduler on the
        validation loss, records history and saves a checkpoint when the
        validation loss is strictly lower than that of the saved checkpoint.
        Training ends early once the EarlyStopping helper signals a stop.
        """
        for ep in range(1, self.epochs + 1):
            tl, _ = self.run_epoch()
            vl, va = self.validate()
            self.scheduler.step(vl)
            self.hist['train_loss'].append(tl)
            self.hist['val_loss'].append(vl)
            self.hist['val_acc'].append(va)
            print(f'Epoch {ep}/{self.epochs} | TrainLoss {tl:.4f} | ValLoss {vl:.4f} | ValAcc {va:.2f}%')
            # Compare against the loss of the checkpoint on disk, so that a
            # later, slightly worse epoch can never overwrite a better one.
            if self.best_val_loss is None or vl < self.best_val_loss:
                self.best_val_loss = vl
                torch.save(self.model.state_dict(), self.best_path)
            self.es.step(vl)
            if self.es.stop:
                print('Early stop')
                break
        return self.hist

## 6. Load Data

In [None]:
# Build the loaders, CLIP processor and tokenizer once from the shared CONFIG.
(
    train_loader,
    val_loader,
    clip_processor_ref,
    text_tokenizer_ref,
) = load_data(config=CONFIG)

## 7. Train Image Model

In [None]:
# Image branch: build the classifier and train it on the shared loaders.
image_model = ImageClassifier(num_classes=CONFIG['num_classes'])
trainer_img = ModalityTrainer(
    model=image_model,
    loader_train=train_loader,
    loader_val=val_loader,
    lr=CONFIG['lr_image'],
    epochs=CONFIG['num_epochs_image'],
    name='image',
)
hist_img = trainer_img.train()

## 8. Train Text Model

In [None]:
# Text branch: build the classifier and train it on the shared loaders.
text_model = TextClassifier(num_classes=CONFIG['num_classes'])
trainer_txt = ModalityTrainer(
    model=text_model,
    loader_train=train_loader,
    loader_val=val_loader,
    lr=CONFIG['lr_text'],
    epochs=CONFIG['num_epochs_text'],
    name='text',
)
hist_txt = trainer_txt.train()

## 9. Evaluation Helper

In [None]:
def evaluate_probabilities(model, loader):
    """Collect softmax probabilities and labels for every batch in ``loader``.

    An ``ImageClassifier`` is fed ``batch['image']``; any other model is fed
    ``batch['text_ids']`` and ``batch['text_mask']``.

    Returns:
        Dict with ``'probs'`` (concatenated per-sample probabilities),
        ``'labels'`` (concatenated labels) and ``'metrics'`` holding accuracy
        plus weighted precision, recall and F1 of the argmax predictions.
    """
    model.eval()
    use_image = isinstance(model, ImageClassifier)
    prob_chunks = []
    label_chunks = []
    with torch.no_grad():
        for batch in loader:
            pixels = batch['image'].to(device)
            token_ids = batch['text_ids'].to(device)
            token_mask = batch['text_mask'].to(device)
            targets = batch['label'].to(device)
            logits = model(pixels) if use_image else model(token_ids, token_mask)
            prob_chunks.append(torch.softmax(logits, dim=1).cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    all_probs = np.concatenate(prob_chunks)
    all_labels = np.concatenate(label_chunks)
    preds = all_probs.argmax(1)

    weighted = dict(average='weighted', zero_division=0)
    metrics = {
        'acc': accuracy_score(all_labels, preds),
        'prec': precision_score(all_labels, preds, **weighted),
        'rec': recall_score(all_labels, preds, **weighted),
        'f1': f1_score(all_labels, preds, **weighted),
    }
    return {'probs': all_probs, 'labels': all_labels, 'metrics': metrics}

In [None]:
# Restore the best checkpoint of each model (when one exists on disk) before
# evaluating on the validation loader.
img_best = os.path.join(CONFIG['checkpoint_dir'], 'image_best.pth')
txt_best = os.path.join(CONFIG['checkpoint_dir'], 'text_best.pth')
for ckpt_path, net in ((img_best, image_model), (txt_best, text_model)):
    if os.path.exists(ckpt_path):
        net.load_state_dict(torch.load(ckpt_path, map_location=device))

eval_img = evaluate_probabilities(image_model, val_loader)
eval_txt = evaluate_probabilities(text_model, val_loader)
print('Image Metrics:', eval_img['metrics'])
print('Text Metrics:', eval_txt['metrics'])

## 10. Late Fusion (Simple, Weighted, Stacking)

In [None]:
# Late fusion of the two single-modality probability outputs.
# Both evaluations were produced from the same loader, so the label arrays
# must line up sample by sample; fail loudly if they do not, because every
# fusion below adds the two probability arrays element-wise.
labels = eval_img['labels']
if not np.array_equal(labels, eval_txt['labels']):
    raise ValueError('Image and text evaluations are not aligned: label arrays differ.')
img_probs = eval_img['probs']
txt_probs = eval_txt['probs']


def metrics_from(preds, labels, probs):
    """Return accuracy and weighted precision/recall/F1 for ``preds``.

    ``probs`` is accepted for call-site compatibility and is not used.
    """
    return {
        'acc': accuracy_score(labels, preds),
        'prec': precision_score(labels, preds, average='weighted', zero_division=0),
        'rec': recall_score(labels, preds, average='weighted', zero_division=0),
        'f1': f1_score(labels, preds, average='weighted', zero_division=0),
    }


# --- Simple average ---------------------------------------------------------
simple_probs = (img_probs + txt_probs) / 2.0
simple_preds = simple_probs.argmax(1)
simple_metrics = metrics_from(simple_preds, labels, simple_probs)
print('Simple Average Metrics:', simple_metrics)

# --- Weighted average (grid search over the image weight, step 0.05) --------
# NOTE: the weight is selected and scored on the same samples, so these
# metrics are somewhat optimistic compared with a held-out estimate.
best_w = None
best_f1 = -1
best_metrics = None
best_probs = None
for w in np.linspace(0, 1, 21):
    fused = w * img_probs + (1 - w) * txt_probs
    preds = fused.argmax(1)
    m = metrics_from(preds, labels, fused)
    if m['f1'] > best_f1:
        best_f1 = m['f1']
        best_w = w
        best_metrics = m
        best_probs = fused
print(f'Best Weight (img vs text): {best_w:.2f} | F1={best_f1:.4f}')
print('Weighted Metrics:', best_metrics)

# --- Stacking (logistic regression on the concatenated probabilities) -------
from sklearn.model_selection import StratifiedKFold, cross_val_predict

stack_features = np.concatenate([img_probs, txt_probs], axis=1)  # shape (N, 2 * num_classes)
meta = LogisticRegression(max_iter=1000)
# Score the meta-classifier with out-of-fold predictions: predicting on the
# very samples it was fitted on would report training-set metrics.
min_class_count = int(np.unique(labels, return_counts=True)[1].min())
n_folds = min(5, min_class_count)
if n_folds >= 2:
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=CONFIG['random_state'])
    stack_preds = cross_val_predict(meta, stack_features, labels, cv=cv)  # fits clones; `meta` stays unfitted
    meta.fit(stack_features, labels)  # final model, fitted on all samples, for inference
else:
    print('Warning: too few samples per class for cross-validation; stacking metrics are in-sample.')
    meta.fit(stack_features, labels)
    stack_preds = meta.predict(stack_features)
stack_metrics = metrics_from(stack_preds, labels, None)
print('Stacking Metrics:', stack_metrics)

# --- Persist the comparison ------------------------------------------------
fusion_results = {
    'simple': simple_metrics,
    'weighted': best_metrics,
    'stacking': stack_metrics,
    'best_weight': best_w,
}
pd.DataFrame(fusion_results).to_csv(os.path.join(CONFIG['results_dir'], 'fusion_metrics.csv'))
print('✓ Fusion metrics disimpan.')

## 11. Visualisasi Training Curves (Optional)

In [None]:
def plot_history(hist, title):
    """Plot train/validation loss and validation accuracy side by side.

    Args:
        hist: Dict with ``'train_loss'``, ``'val_loss'`` and ``'val_acc'``
            sequences, one entry per epoch.
        title: Prefix used for both subplot titles.
    """
    n_epochs = len(hist['train_loss'])
    epochs = range(1, n_epochs + 1)
    plt.figure(figsize=(12, 4))

    # Left panel: loss curves.
    plt.subplot(1, 2, 1)
    plt.plot(epochs, hist['train_loss'], label='Train')
    plt.plot(epochs, hist['val_loss'], label='Val')
    plt.title(title + ' Loss')
    plt.legend()
    plt.grid(alpha=0.3)

    # Right panel: validation accuracy.
    plt.subplot(1, 2, 2)
    plt.plot(epochs, hist['val_acc'], label='Val Acc', color='green')
    plt.title(title + ' Val Accuracy')
    plt.legend()
    plt.grid(alpha=0.3)

    plt.tight_layout()
    plt.show()
# One figure per modality.
plot_history(hist_img, 'Image')
plot_history(hist_txt, 'Text')

## 12. Inference (Late Fusion)

In [None]:
def predict_late_fusion(image_model, text_model, image_path, text, clip_processor, text_tokenizer, weight=None, meta_clf=None):
    """Classify one meme by fusing the image and text model probabilities.

    Args:
        image_model: Model called with the CLIP pixel values.
        text_model: Model called with ``(input_ids, attention_mask)``.
        image_path: Path of the image file to classify.
        text: Text of the meme.  ``None`` or a float NaN is treated as the
            empty string, matching how the training dataset handles missing text.
        clip_processor: Processor producing ``'pixel_values'`` for the image.
        text_tokenizer: Tokenizer producing ``'input_ids'`` and ``'attention_mask'``.
        weight: Weight of the image probabilities in the weighted average
            (the text weight is ``1 - weight``).  Defaults to 0.5.  Ignored
            when ``meta_clf`` is given.
        meta_clf: Optional fitted stacking classifier taking the concatenated
            ``[image_probs, text_probs]`` feature vector.

    Returns:
        Dict with ``final_label``, ``confidence``, ``image_probs``,
        ``text_probs``, ``fused_probs`` (``None`` when stacking is used),
        ``weight_used`` (``None`` when stacking is used) and ``stacking_used``.
    """
    image_model.eval()
    text_model.eval()

    # Image: close the file handle deterministically once the pixels are decoded.
    with Image.open(image_path) as raw_image:
        img = raw_image.convert('RGB')
    img_inputs = clip_processor(images=img, return_tensors='pt')['pixel_values'].to(device)

    # Text: map missing values to '' instead of the literal strings 'None'/'nan'.
    text_missing = text is None or (isinstance(text, float) and np.isnan(text))
    text_str = '' if text_missing else str(text)
    text_inputs = text_tokenizer(
        text_str,
        padding='max_length',
        truncation=True,
        max_length=CONFIG['max_text_length'],
        return_tensors='pt',
    )
    ids = text_inputs['input_ids'].to(device)
    mask = text_inputs['attention_mask'].to(device)

    with torch.no_grad():
        img_logits = image_model(img_inputs)
        txt_logits = text_model(ids, mask)
        img_p = torch.softmax(img_logits, dim=1).cpu().numpy()[0]
        txt_p = torch.softmax(txt_logits, dim=1).cpu().numpy()[0]

    if meta_clf is not None:
        feat = np.concatenate([img_p, txt_p])[None, :]
        pred = int(meta_clf.predict(feat)[0])
        if hasattr(meta_clf, 'predict_proba'):
            # Confidence of the class the meta-classifier actually predicted.
            proba = meta_clf.predict_proba(feat)[0]
            conf = proba[list(meta_clf.classes_).index(pred)]
        else:
            # No probability output available: average the base-model
            # probabilities of the predicted class instead.
            conf = (img_p[pred] + txt_p[pred]) / 2
        fused = None
        weight_used = None
    else:
        w = 0.5 if weight is None else weight
        fused = w * img_p + (1 - w) * txt_p
        pred = int(fused.argmax())
        conf = fused[pred]
        weight_used = w

    label_map = ['Non Self-harm', 'Self-harm']
    return {
        'final_label': label_map[pred],
        'confidence': float(conf),
        'image_probs': {label_map[0]: float(img_p[0]), label_map[1]: float(img_p[1])},
        'text_probs': {label_map[0]: float(txt_p[0]), label_map[1]: float(txt_p[1])},
        'fused_probs': None if fused is None else {label_map[0]: float(fused[0]), label_map[1]: float(fused[1])},
        'weight_used': weight_used,
        'stacking_used': meta_clf is not None
    }

## 13. Ringkasan Akhir

In [None]:
# Final recap of devices, checkpoint paths and fusion scores.
banner = '=' * 70
print(banner)
print('LATE FUSION PIPELINE SELESAI')
print('Device:', device)
print('Image Best Path:', img_best)
print('Text Best Path:', txt_best)
print('Best Weighted w:', fusion_results['best_weight'])
print('Metrics:')
for k, v in fusion_results.items():
    # Only the three metric dicts are reported; 'best_weight' is a scalar.
    if k not in ('simple', 'weighted', 'stacking'):
        continue
    print(f'  {k}: F1={v["f1"]:.4f}, Acc={v["acc"]:.4f}')
print('Fusion metrics saved to fusion_metrics.csv')
print(banner)