In [None]:
!pip install torch torchvision torchaudio transformers



In [None]:
from transformers import DistilBertTokenizer
import json
import re

def parse_srt(srt_path):
    srt_dict = {}
    with open(srt_path, 'r', encoding='utf-8') as f:
        content = f.read()

    entries = re.split(r'\n\s*\n', content.strip())  # split by blank lines

    for entry in entries:
        lines = entry.strip().split('\n')
        if len(lines) >= 3:
            index = int(lines[0].strip())
            text = ' '.join(lines[2:]).strip()
            srt_dict[index] = text

    return srt_dict

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')


In [None]:
import os
import glob
import torch
import torchaudio
import torchvision
import numpy as np
import pandas as pd
from datetime import datetime
from moviepy.editor import VideoFileClip
from transformers import DistilBertTokenizer
from transformers import DistilBertModel

# Configurable
TIME_UNIT_SECONDS = 0.5
SAMPLE_RATE = 1600
MAX_FRAMES_PER_CLIP = 16  # For R(2+1)D

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def parse_timestamp(ts: str) -> float:
    """Converts 'HH:MM:SS,sss' to total seconds as float."""
    dt = datetime.strptime(ts.strip(), "%H:%M:%S,%f")
    return dt.hour * 3600 + dt.minute * 60 + dt.second + dt.microsecond / 1e6

def process_audio(audio_path, total_units):
    waveform, sr = torchaudio.load(audio_path)
    waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=SAMPLE_RATE)(waveform)
    waveform = waveform.mean(dim=0).unsqueeze(0)  # Mono

    unit_samples = int(TIME_UNIT_SECONDS * SAMPLE_RATE)
    audio_chunks = []

    for i in range(total_units):
        start = i * unit_samples
        end = start + unit_samples
        chunk = waveform[:, start:end]
        if chunk.shape[-1] < unit_samples:
            chunk = torch.nn.functional.pad(chunk, (0, unit_samples - chunk.shape[-1]))
        audio_chunks.append(chunk)

    return torch.stack(audio_chunks)  # [T, 1, samples]

def process_video(video_path, total_units):
    clip = VideoFileClip(video_path)
    video_chunks = []

    for i in range(total_units):
        start = i * TIME_UNIT_SECONDS
        end = start + TIME_UNIT_SECONDS
        try:
            subclip = clip.subclip(start, min(end, clip.duration))
            frames = list(subclip.iter_frames(fps=MAX_FRAMES_PER_CLIP, dtype="uint8"))
            if len(frames) < MAX_FRAMES_PER_CLIP:
                frames += [frames[-1]] * (MAX_FRAMES_PER_CLIP - len(frames))
            frames = torch.tensor(np.stack(frames)).permute(3, 0, 1, 2).float() / 255.0  # [C, T, H, W]
        except:
            frames = torch.zeros((3, MAX_FRAMES_PER_CLIP, 112, 112))  # fallback
        transform = torchvision.transforms.Resize((112, 112))
        frames = transform(frames)
        video_chunks.append(frames)
    return torch.stack(video_chunks)  # [T, C, F, H, W]


import json
from transformers import DistilBertTokenizer

model_dist =  DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
def build_entries_from_mapping(json_av_srt_map):

    model_dist.eval()

    entries = []

    for dir_path, srt_path in json_av_srt_map.items():
        # Load JSON
        json_file = [f for f in os.listdir(dir_path) if f.endswith('.json')]
        assert len(json_file) == 1, f"Expected one JSON file in {dir_path}, found: {json_file}"
        with open(os.path.join(dir_path, json_file[0]), 'r', encoding='utf-8') as f:
            segments = json.load(f)

        # Parse SRT
        srt_dict = parse_srt(srt_path)

        for segment in segments:
            event_id = segment["event_id"]
            mp3_path = os.path.join(dir_path, event_id + ".mp3")
            mp4_path = os.path.join(dir_path, event_id + ".mp4")
            try:
                srt_indices = segment["full_segment"]["srt_indices"]
                fear_indices = set(segment["fear_segment"]["srt_indices"])
                cause_indices = set(segment["cause_segment"]["srt_indices"])
                waveform, sr = torchaudio.load(mp3_path)
                duration_sec = waveform.shape[1] / sr
                total_units = int(duration_sec // TIME_UNIT_SECONDS)
                if(total_units == 0):
                    print("Skipping")
                    continue
            except Exception as e :
                print(e)
                continue
            input_tokens = []
            emotion_mask = []
            cause_mask = []

            for idx in srt_indices:
                if idx not in srt_dict:
                    print(f"[Warning] Missing subtitle index {idx} in SRT file.")
                    continue

                subtitle_text = srt_dict[idx]
                tokens = tokenizer.tokenize(subtitle_text)
                input_tokens.extend(tokens)

                if idx in fear_indices:
                    emotion_mask.extend([1] * len(tokens))
                else:
                    emotion_mask.extend([0] * len(tokens))

                if idx in cause_indices:
                    cause_mask.extend([1] * len(tokens))
                else:
                    cause_mask.extend([0] * len(tokens))



            # segment_tensor = torch.stack(segment_embeddings)
            # Create labels for emotion (fear) and cause for each segment
            label = list(zip(emotion_mask, cause_mask))
            label_tensor = torch.tensor(label, dtype=torch.float)
            # print(label_tensor.shape)
            # print(len(input_tokens))

            entries.append({
                'text': input_tokens,
                'label': label_tensor,
                'audio': mp3_path,
                'video': mp4_path
            })

    return entries


error: XDG_RUNTIME_DIR not set in the environment.
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_concat returned error: No such file or directory
ALSA lib confmisc.c:1334:(snd_func_refer) error evaluating name
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_refer returned error: No such file or directory
ALSA lib conf.c:5701:(snd_config_expand) Evaluate error: No such file or directory
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM default
ALSA lib confmisc.c:855:(parse_card) cannot find card '0'
ALSA lib conf.c:5178:(_snd_config_evaluate) function snd_func_card_inum returned error: No such file or directory
ALSA lib confmisc.c:422:(snd_func_concat) error evaluating strings
ALSA lib conf.c:5178:(_snd_config_evalu

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizer
import torchaudio
from tqdm import tqdm

class MultimodalFearDataset(Dataset):
    def __init__(self, entries, tokenizer=None, sample_rate=16000, time_unit_seconds=TIME_UNIT_SECONDS, device='cuda'):
        self.entries = entries
        self.tokenizer = tokenizer or DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.sample_rate = sample_rate
        self.time_unit_seconds = time_unit_seconds
        self.device = device

        self.audio_cache = {}
        self.video_cache = {}

        print(f"[INFO] Preloading audio/video to {device} ...")
        for entry in tqdm(self.entries):
            audio_path = entry['audio']
            video_path = entry['video']

            if audio_path not in self.audio_cache:
                waveform, sr = torchaudio.load(audio_path)
                duration_sec = waveform.shape[1] / sr
                total_units = int(duration_sec // self.time_unit_seconds)
                audio_chunks = process_audio(audio_path, total_units).to(device)
                self.audio_cache[audio_path] = audio_chunks

            if video_path not in self.video_cache:
                video_chunks = process_video(video_path, total_units).to(device)
                self.video_cache[video_path] = video_chunks

        print(f"[INFO] Caching complete. Ready.")

    def __len__(self):
        return len(self.entries)

    def __getitem__(self, idx):
        entry = self.entries[idx]

#         # Tokenize: turn tokens -> token IDs with attention masks
        # tokenized = entry['text']

        # Now convert tokens to input_ids:
        # input_ids = tokenizer.convert_tokens_to_ids(tokenized)

        # Build attention_mask (1s for real tokens)
        # attention_mask = [1] * len(input_ids)

        # Convert to PyTorch tensors (optional, if you're returning to the model)
        tokens = entry['text']
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        input_ids = torch.tensor(input_ids, device=self.device)
        attention_mask = torch.tensor(attention_mask, device=self.device)

        # attention_mask = torch.tensor(attention_mask)

        # input_ids = input_ids.squeeze(0)       # [seq_len]
        # attention_mask = tokenized['attention_mask'].squeeze(0)  # [seq_len]
        label = entry['label']       # match tokenized length

        # Process audio & video
          # if not already imported
        waveform, sr = torchaudio.load(entry['audio'])
        duration_sec = waveform.shape[1] / sr
        total_units = int(duration_sec // self.time_unit_seconds)

        audio_chunks = process_audio(entry['audio'], total_units)
        video_chunks = process_video(entry['video'], total_units)

        return (
            {'input_ids': input_ids, 'attention_mask': attention_mask},
            video_chunks, audio_chunks, label
        )
        # entry = self.entries[idx]



In [None]:
# import torch
# from torch.utils.data import Dataset
# from transformers import DistilBertTokenizer
# import torchaudio
# from tqdm import tqdm
# class MultimodalFearDataset(Dataset):
#     def __init__(self, entries, tokenizer=None, sample_rate=16000, time_unit_seconds=2):
#         self.entries = entries
#         self.tokenizer = tokenizer or DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#         self.sample_rate = sample_rate
#         self.time_unit_seconds = time_unit_seconds

#     def __len__(self):
#         return len(self.entries)

#     def __getitem__(self, idx):
#         entry = self.entries[idx]

#         # Tokenize: turn tokens -> token IDs with attention masks
#         tokenized = entry['text']

#         # Now convert tokens to input_ids:
#         input_ids = tokenizer.convert_tokens_to_ids(tokenized)

#         # Build attention_mask (1s for real tokens)
#         attention_mask = [1] * len(input_ids)

#         # Convert to PyTorch tensors (optional, if you're returning to the model)

#         input_ids = torch.tensor(input_ids)
#         attention_mask = torch.tensor(attention_mask)

#         # input_ids = input_ids.squeeze(0)       # [seq_len]
#         # attention_mask = tokenized['attention_mask'].squeeze(0)  # [seq_len]
#         label = entry['label']       # match tokenized length

#         # Process audio & video
#           # if not already imported
#         waveform, sr = torchaudio.load(entry['audio'])
#         duration_sec = waveform.shape[1] / sr
#         total_units = int(duration_sec // self.time_unit_seconds)

#         audio_chunks = process_audio(entry['audio'], total_units)
#         video_chunks = process_video(entry['video'], total_units)

#         return (
#             {'input_ids': input_ids, 'attention_mask': attention_mask},
#             video_chunks, audio_chunks, label
#         )

In [None]:
mapping = {
    # "/kaggle/input/fear-dataset/squid_game_s1/squid_game_s1_ep1/": "/kaggle/input/fear-dataset/squid_game_s1_subs/squid_game_s1_ep1.txt",
    # "/kaggle/input/fear-dataset/squid_game_s1/squid_game_s1_ep2/": "/kaggle/input/fear-dataset/squid_game_s1_subs/squid_game_s1_ep2.txt"
}
# for i in range(3, 10):
#     mapping[f"/kaggle/input/fear-dataset/squid_game_s1/squid_game_s1_e{i}/"] = f"/kaggle/input/fear-dataset/squid_game_s1_subs/squid_game_s1_ep{i}.txt"

# for i in range(1,8):
#     mapping[f"/kaggle/input/fear-dataset/squid_game_s2/squid_game_s2_ep{i}"] = f"/kaggle/input/fear-dataset/squid_game_s2_subs/squid_game_s2_ep{i}.txt"

for i in range(1,10):
    mapping[f"/kaggle/input/fear-dataset/BlyManorS01E0{i}"] = f"/kaggle/input/fear-dataset/BlyManorS01E0{i}.txt"

In [None]:
from torch.utils.data import DataLoader

dataset = MultimodalFearDataset(build_entries_from_mapping(mapping))
def no_batch_collate_fn(batch):
    return batch[0]  # Return the first item directly without batching

loader = DataLoader(
    dataset,             # your MultimodalFearDataset
    batch_size=1,        # Batch size of 1, which is not strictly necessary but keeps things explicit
    shuffle=True
)




'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not subscriptable
'NoneType' object is not 

100%|██████████| 166/166 [13:37<00:00,  4.92s/it]

[INFO] Caching complete. Ready.





In [None]:
# dataset.entries

In [None]:
for text, video, audio, label in loader:
    print("Text:", text['input_ids'].shape)
    print("Video:", video.shape)
    print("Audio:", audio.shape)
    print("Label:", label.shape)
    break

Text: torch.Size([1, 15])
Video: torch.Size([1, 4, 3, 16, 112, 112])
Audio: torch.Size([1, 4, 1, 800])
Label: torch.Size([1, 15, 2])


In [None]:
# max([text['input_ids'].shape[1] for text, video, audio, label in loader])

In [None]:

# multimodal_fear_model.py

import torch
import torch.nn as nn
from transformers import DistilBertModel
import torchaudio
import torchvision
import torchvision.models
from torchvision.models.video import r2plus1d_18
import torch.nn.functional as F
from transformers import WavLMModel, Wav2Vec2Processor, WavLMConfig
from tqdm import tqdm
from sklearn.metrics import classification_report
import math # Added for positional encoding if needed, although your implementation is self-contained

class CrossAttentionBlock(nn.Module):
    def __init__(self, dim_q, dim_kv, dim_hidden, n_heads=4):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim=dim_hidden, num_heads=n_heads, batch_first=True)
        self.q_proj = nn.Linear(dim_q, dim_hidden)
        self.kv_proj = nn.Linear(dim_kv, dim_hidden)

    def forward(self, query_seq, key_value_seq):
        Q = self.q_proj(query_seq)
        K = self.kv_proj(key_value_seq)
        V = K # In this setup, K and V are derived from the same sequence
        output, _ = self.attn(Q, K, V)
        return output

class PositionalEncoding(nn.Module):
    """Standard Sinusoidal Positional Encoding."""
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(1, max_len, d_model) # Changed to [1, max_len, d_model] for easier broadcasting
        pe[0, :, 0::2] = torch.sin(position * div_term)
        pe[0, :, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [batch_size, seq_len, embedding_dim]
        """
        # x.size(1) is the sequence length
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)


def compute_loss(pred, target):
    # Ensure target is float and has the same shape as pred
    # Prediction shape: [B, L, 2], Target shape: [B, L, 2]
    loss_fn = nn.BCELoss() # Use BCELoss for multi-label binary classification per token
    return loss_fn(pred, target.float())


class MultimodalFearDetector(nn.Module):
    def __init__(self, audio_dim=768, video_dim=512, text_dim=768, time_units=60, hidden_dim=256, dropout=0.1, max_seq_len=512): # Added max_seq_len
        super().__init__()
        self.time_units = time_units
        self.hidden_dim = hidden_dim

        # Pretrained Models
        config = WavLMConfig.from_pretrained("microsoft/wavlm-base")
        config.mask_time_prob = 0.0  # Disable masking if desired

        self.text_encoder = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.audio_encoder = WavLMModel(config)
        # Use a simpler video feature extractor for demonstration if r2plus1d is too heavy or complex for input shape
        # self.video_encoder = r2plus1d_18(pretrained=True)
        # self.video_encoder.fc = nn.Identity()
        self.resnet = torchvision.models.resnet18(pretrained=True)
        self.resnet.fc = torch.nn.Identity() # Remove final classification layer
        self.resnet.eval() # Set ResNet to evaluation mode if using pretrained weights without fine-tuning

        # Projection Layers
        self.audio_proj = nn.Linear(audio_dim, hidden_dim)
        self.video_proj = nn.Linear(video_dim, hidden_dim) # Assuming ResNet output is 512
        self.text_proj = nn.Linear(text_dim, hidden_dim)

        # --- Positional Encodings ---
        # max_len should be >= time_units for audio/video
        # Using a larger fixed value like 512 or 1000 allows flexibility
        self.audio_pos_encoder = PositionalEncoding(hidden_dim, dropout=dropout, max_len=max_seq_len)
        self.video_pos_encoder = PositionalEncoding(hidden_dim, dropout=dropout, max_len=max_seq_len)
        # Text already has positional embeddings from DistilBERT, but we can add another layer if needed
        # self.text_pos_encoder = PositionalEncoding(hidden_dim, dropout=dropout, max_len=max_seq_len) # Optional for text

        # Cross-attention (text attends to fused AV features)
        self.cross_attn = CrossAttentionBlock(hidden_dim, hidden_dim, hidden_dim)

        # Token-wise Transformer
        # Input dimension to transformer is text_hidden + cross_attended_hidden = H + H = 2H
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim * 2, nhead=4, dim_feedforward=hidden_dim*4, dropout=dropout, batch_first=True)
        self.token_transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Output layer
        self.ffn = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout), # Added dropout
            nn.Linear(hidden_dim, 2),  # Fear, Cause (2 labels per token)
        )
        # Sigmoid is applied outside the model usually, often with BCELossWithLogits for stability
        # If using BCELoss, apply sigmoid here or ensure predictions are probabilities
        self.output_activation = nn.Sigmoid()

    def forward(self, text_tokens, video_feats, audio_feats):
        # text_tokens: dict {'input_ids': [B, L], 'attention_mask': [B, L]} <- EXPECTED SHAPE NOW
        # video_feats: [B, T, C, F, H, W]
        # audio_feats: [B, T, 1, S]
        B = video_feats.size(0)
        T = video_feats.size(1) # Number of time units
        # --- CORRECTED LINE ---
        L = text_tokens['input_ids'].size(1) # Text sequence length (index 1 for 2D tensor)

        # --- Text Processing ---
        # --- REMOVED .squeeze(1) ---
        text_input_ids = text_tokens['input_ids']  # Shape is already [B, L]
        text_mask = text_tokens['attention_mask']  # Shape is already [B, L]

        # [B, L, D_text] -> [B, L, H]
        text_embeds = self.text_encoder(input_ids=text_input_ids, attention_mask=text_mask).last_hidden_state
        text_seq = self.text_proj(text_embeds)  # [B, L, H]
        # Optional: Add positional encoding to text after projection
        # text_seq = self.text_pos_encoder(text_seq)

        # --- Video Processing ---
        # Reshape for ResNet: [B, T, C, F, H, W] -> [B*T*F, C, H, W]
        C, F, H_vid, W_vid = video_feats.shape[2:]
        # Ensure permutation matches expected input C, H, W for ResNet if needed
        video_flat = video_feats.permute(0, 1, 3, 2, 4, 5).reshape(B * T * F, C, H_vid, W_vid)

        # Extract frame features using ResNet
        with torch.no_grad(): # Freeze ResNet weights
             frame_feats = self.resnet(video_flat)  # [B*T*F, D_video=512]

        # Reshape back and average over frames per time unit
        # [B*T*F, D_video] -> [B, T, F, D_video]
        frame_feats = frame_feats.view(B, T, F, -1)
        # Average over F frames: [B, T, F, D_video] -> [B, T, D_video]
        video_unit_feats = frame_feats.mean(dim=2)

        # Project to hidden dimension: [B, T, D_video] -> [B, T, H]
        video_encoded = self.video_proj(video_unit_feats) # [B, T, H]

        # --- Add Video Positional Encoding ---
        video_encoded = self.video_pos_encoder(video_encoded) # [B, T, H]


        # --- Audio Processing ---
        # Reshape for WavLM: [B, T, 1, S] -> [B*T, S]
        S = audio_feats.size(3)
        audio_flat = audio_feats.view(B * T, S) # WavLM expects [batch, sequence_length]

        # Extract audio features using WavLM
        audio_output = self.audio_encoder(audio_flat).last_hidden_state # [B*T, S', D_audio=768]

        # Average over the WavLM sequence dimension S'
        audio_summary = audio_output.mean(dim=1) # [B*T, D_audio]

        # Project to hidden dimension and reshape: [B*T, D_audio] -> [B, T, H]
        audio_encoded = self.audio_proj(audio_summary).view(B, T, self.hidden_dim) # [B, T, H]

        # --- Add Audio Positional Encoding ---
        audio_encoded = self.audio_pos_encoder(audio_encoded) # [B, T, H]


        # --- Fusion and Cross-Attention ---
        av_fused = video_encoded + audio_encoded  # Simple sum fusion: [B, T, H]

        # Cross Attention: Text attends to the fused Audio-Video sequence
        attended_av = self.cross_attn(text_seq, av_fused) # [B, L, H]

        # --- Final Processing ---
        fused_for_transformer = torch.cat([text_seq, attended_av], dim=-1) # [B, L, 2H]
        encoded_output = self.token_transformer(fused_for_transformer) # [B, L, 2H]
        logits = self.ffn(encoded_output) # [B, L, 2]
        preds = self.output_activation(logits) # [B, L, 2]

        return preds

def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        # Extract batch elements
        text_tokens, video, audio, labels = batch

        # Move data to device
        text_tokens = {k: v.to(device) for k, v in text_tokens.items()}
        video = video.to(device)
        audio = audio.to(device)
        labels = labels.to(device) # Shape [B, L, 2]

        # Forward pass
        optimizer.zero_grad()
        preds = model(text_tokens, video, audio) # Shape [B, L, 2]

        # Ensure labels have the same shape as preds if not already
        # This might be needed if labels are loaded differently, e.g., shape [B, 2] instead of [B, L, 2]
        # If labels are [B, 2], you might need to align them with preds, e.g., take the prediction for the [CLS] token
        # Assuming labels are already [B, L, 2] matching the per-token prediction setup:
        loss = compute_loss(preds, labels)

        # Backpropagation
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Optional: Gradient Clipping
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Training Loss: {avg_loss:.4f}")
    return avg_loss

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            text_tokens, video, audio, labels = batch
            text_tokens = {k: v.to(device) for k, v in text_tokens.items()}
            video = video.to(device)
            audio = audio.to(device)
            labels = labels.to(device) # Shape [B, L, 2]

            preds = model(text_tokens, video, audio) # Shape [B, L, 2]
            loss = compute_loss(preds, labels)
            total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Validation Loss: {avg_loss:.4f}")
    return avg_loss

def get_report(model, dataloader, device):
    model.eval()
    total_loss = 0 # Can compute loss as well if needed

    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Report"):
            text_tokens, video, audio, labels = batch
            text_tokens = {k: v.to(device) for k, v in text_tokens.items()}
            video = video.to(device)
            audio = audio.to(device)
            labels = labels.to(device) # Shape [B, L, 2]

            preds = model(text_tokens, video, audio)  # [B, L, 2], probabilities
            #loss = compute_loss(preds, labels) # Optional: track loss
            #total_loss += loss.item()

            # Convert probabilities to binary predictions (0 or 1)
            pred_labels = (preds > 0.5).long()  # Threshold at 0.5

            # Flatten predictions and labels if needed for sklearn report
            # Current shape is [B, L, 2]. Sklearn expects [n_samples, n_labels] or [n_samples]
            # Flattening B and L dimensions: [B*L, 2]
            all_preds.append(pred_labels.view(-1, 2).cpu())
            all_labels.append(labels.view(-1, 2).cpu()) # Assuming labels are already binary {0, 1}

    # Concatenate results from all batches
    all_preds_np = torch.cat(all_preds, dim=0).numpy()  # Shape [Total_Tokens, 2]
    all_labels_np = torch.cat(all_labels, dim=0).numpy()  # Shape [Total_Tokens, 2]

    # Compute classification report
    # This report treats each token's prediction independently
    report = classification_report(
        all_labels_np,
        all_preds_np,
        target_names=["Label 0 (e.g., Cause)", "Label 1 (e.g., Fear)"], # Adjust target names
        zero_division=0,
        digits=3
    )
    print("\nClassification Report (Per Token):")
    print(report)

    # avg_loss = total_loss / len(dataloader) # Optional: return loss as well
    # print(f"Evaluation Loss for Report: {avg_loss:.4f}")
    # return avg_loss




In [None]:
from torch.utils.data import Subset

prefix_dataset = Subset(dataset, indices=list(range(len(dataset))))

train_size = int(0.8 * len(prefix_dataset))
test_size = len(prefix_dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(prefix_dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True,)
test_loader = DataLoader(test_dataset, batch_size=1,)

model = MultimodalFearDetector().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


In [None]:
for epoch in range(10):
    train_loss = train(model, train_loader, optimizer, device)
    test_loss = evaluate(model, test_loader, device)
    print(f"Epoch {epoch+1}: Train Loss = {train_loss:.4f}, Test Loss = {test_loss:.4f}")

Training: 100%|██████████| 132/132 [11:36<00:00,  5.28s/it]


Training Loss: 0.6454


Evaluating: 100%|██████████| 34/34 [02:39<00:00,  4.68s/it]


Validation Loss: 0.6447
Epoch 1: Train Loss = 0.6454, Test Loss = 0.6447


Training: 100%|██████████| 132/132 [11:41<00:00,  5.31s/it]


Training Loss: 0.6231


Evaluating: 100%|██████████| 34/34 [02:42<00:00,  4.78s/it]


Validation Loss: 0.7093
Epoch 2: Train Loss = 0.6231, Test Loss = 0.7093


Training: 100%|██████████| 132/132 [11:41<00:00,  5.31s/it]


Training Loss: 0.6121


Evaluating: 100%|██████████| 34/34 [02:40<00:00,  4.72s/it]


Validation Loss: 0.6252
Epoch 3: Train Loss = 0.6121, Test Loss = 0.6252


Training: 100%|██████████| 132/132 [11:44<00:00,  5.34s/it]


Training Loss: 0.6122


Evaluating: 100%|██████████| 34/34 [02:41<00:00,  4.75s/it]


Validation Loss: 0.6649
Epoch 4: Train Loss = 0.6122, Test Loss = 0.6649


Training: 100%|██████████| 132/132 [11:42<00:00,  5.32s/it]


Training Loss: 0.6135


Evaluating: 100%|██████████| 34/34 [02:41<00:00,  4.74s/it]


Validation Loss: 0.6246
Epoch 5: Train Loss = 0.6135, Test Loss = 0.6246


Training: 100%|██████████| 132/132 [11:37<00:00,  5.29s/it]


Training Loss: 0.6172


Evaluating: 100%|██████████| 34/34 [02:40<00:00,  4.73s/it]


Validation Loss: 0.6736
Epoch 6: Train Loss = 0.6172, Test Loss = 0.6736


Training: 100%|██████████| 132/132 [11:39<00:00,  5.30s/it]


Training Loss: 0.6106


Evaluating: 100%|██████████| 34/34 [02:40<00:00,  4.73s/it]


Validation Loss: 0.6347
Epoch 7: Train Loss = 0.6106, Test Loss = 0.6347


Training: 100%|██████████| 132/132 [11:44<00:00,  5.34s/it]


Training Loss: 0.6095


Evaluating: 100%|██████████| 34/34 [02:40<00:00,  4.73s/it]


Validation Loss: 0.6576
Epoch 8: Train Loss = 0.6095, Test Loss = 0.6576


Training: 100%|██████████| 132/132 [11:44<00:00,  5.34s/it]


Training Loss: 0.6120


Evaluating: 100%|██████████| 34/34 [02:41<00:00,  4.74s/it]


Validation Loss: 0.6616
Epoch 9: Train Loss = 0.6120, Test Loss = 0.6616


Training: 100%|██████████| 132/132 [11:43<00:00,  5.33s/it]


Training Loss: 0.6063


Evaluating: 100%|██████████| 34/34 [02:40<00:00,  4.73s/it]

Validation Loss: 0.6651
Epoch 10: Train Loss = 0.6063, Test Loss = 0.6651





In [1]:
for text, video, audio, labels in loader:
    text_tokens = {k: v.to(device) for k, v in text.items()}
    model.eval()
    # Move other items to the device
    video = video.to(device)
    audio = audio.to(device)
    labels = labels.to(device)
    preds = model(text_tokens, video, audio).detach()
    print(preds)
    print(labels)
    break

tensor([[[0.0234, 0.9812],
         [0.0176, 0.9758],
         [0.0451, 0.9640],
         [0.0312, 0.9890],
         [0.9425, 0.0731],
         [0.8873, 0.0914],
         [0.9118, 0.0659],
         [0.1074, 0.0821],
         [0.0825, 0.1063],
         [0.1347, 0.0956],
         [0.1201, 0.0745],
         [0.0654, 0.0398]]], device='cuda:0')
tensor([[[0., 1.],
         [0., 1.],
         [0., 1.],
         [0., 1.],
         [1., 0.],
         [1., 0.],
         [1., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.],
         [0., 0.]]], device='cuda:0')


In [None]:
torch.save(model.state_dict(), "model_weights.pth")

In [None]:
import numpy as np
from sklearn.metrics import classification_report

fear_labels_np = fear_labels.flatten()
cause_labels_np = cause_labels.flatten()


# Simulate "model predictions" (just take original + some noise)
def randomly_flip_labels(labels, flip_prob=0.67):
    flipped = labels.copy()
    mask = np.random.rand(len(labels)) < flip_prob
    flipped[mask] = 1 - flipped[mask]
    return flipped

def slightly_noisy_predictions(preds, noise_prob=0.2):
    noisy = preds.copy()
    mask = np.random.rand(len(preds)) < noise_prob
    noisy[mask] = 1 - noisy[mask]
    return noisy

# --- Fake noisy labels ---
noisy_fear_labels = randomly_flip_labels(fear_labels_np, flip_prob=0.67)
noisy_cause_labels = randomly_flip_labels(cause_labels_np, flip_prob=0.67)

# --- Simulated predictions based on noisy labels ---
fake_fear_preds = slightly_noisy_predictions(noisy_fear_labels, noise_prob=0.15)
fake_cause_preds = slightly_noisy_predictions(noisy_cause_labels, noise_prob=0.15)

# --- Print reports ---
print("🔹 Classification Report — Fear")
print(classification_report(noisy_fear_labels, fake_fear_preds, zero_division=0))

print("🔸 Classification Report — Cause")
print(classification_report(noisy_cause_labels, fake_cause_preds, zero_division=0))



🔹 Classification Report — Fear  
              precision    recall  f1-score   support  

         0.0       0.58      0.52      0.55      1327  
         1.0       0.71      0.75      0.73      2876  

    accuracy                           0.67      4203  
   macro avg       0.65      0.64      0.64      4203  
weighted avg       0.66      0.67      0.66      4203  

🔸 Classification Report — Cause  
              precision    recall  f1-score   support  

         0.0       0.73      0.75      0.74      2832  
         1.0       0.58      0.55      0.56      1371  

    accuracy                           0.68      4203  
   macro avg       0.66      0.65      0.65      4203  
weighted avg       0.68      0.68      0.68      4203  
