In [13]:
# torch is imported first so the compute device can be resolved before anything else.
import torch
#torch.cuda.empty_cache()
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#torch.cuda.memory_summary(device=None, abbreviated=False)
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
# NOTE(review): duplicate of the `import torch` at the top of this cell; harmless but redundant.
import torch


In [41]:
test_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/test_df.csv"
val_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/valid_df.csv"
train_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/train_df.csv"


def _read_split(csv_path):
    """Read one split CSV and add the combined `text` column used as model input."""
    frame = pd.read_csv(csv_path)
    # Model input = description_fp + analysis_fp + analysis_fn, space-separated.
    # Alternative tried earlier: description_fp + analysis_fp + description_fn
    # (Mean Test Accuracy: 0.8216 ± 0.0177).
    frame['text'] = frame['description_fp'] + ' ' + frame['analysis_fp'] + ' ' + frame['analysis_fn']
    return frame


def _model_columns(frame):
    """Keep only the columns used downstream and drop rows whose `text` is missing."""
    return frame[['id_EXIST','video','text', 'target']].dropna(subset=['text'])


# Read the CSV files (same order as before: test, validation, train).
test_df, val_df, train_df = (
    _read_split(split_path) for split_path in (test_path, val_path, train_path)
)

# `target` is used as stored in the CSVs; the YES -> 1 / NO -> 0 mapping that was
# once applied at this point is disabled.
train_data, val_data, test_data = (
    _model_columns(split_frame) for split_frame in (train_df, val_df, test_df)
)


In [44]:
val_data

Unnamed: 0,id_EXIST,video,text,target
0,220887,7115146553022631174.mp4,A woman asks a man about his most embarrassing...,0
1,220250,6922184169908915462.mp4,"A woman shares a list of 'Hot Girl Phrases,' w...",1
2,220514,6984944914136091909.mp4,A young woman answers frequently asked questio...,0
3,220695,7051661563433995526.mp4,The video uses drawings and narration to illus...,1
4,220661,7038258256552201477.mp4,The video shows a young woman holding a bass g...,1
...,...,...,...,...
189,220132,6878835838336240902.mp4,A young woman shares a humorous anecdote about...,0
190,220522,6987901203166416134.mp4,A man and a transwoman have a conversation at ...,0
191,220544,6993470917552655622.mp4,"A man and a woman participate in a ""Men vs wom...",1
192,220072,6843454540134501638.mp4,A person films their 'new step bro' in a kitch...,0


In [45]:

def _as_lists(frame):
    """Return the (text, target, video, id_EXIST) columns of `frame` as Python lists."""
    return tuple(frame[column].tolist() for column in ('text', 'target', 'video', 'id_EXIST'))


# One (texts, labels, video ids, EXIST ids) quadruple per split.
train_texts, train_labels, train_video_ids, train_exist_ids = _as_lists(train_data)
test_texts, test_labels, test_video_ids, test_exist_ids = _as_lists(test_data)
val_texts, val_labels, val_video_ids, val_exist_ids = _as_lists(val_data)



In [46]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')


def tokenize(texts):
    """Tokenize `texts` with the module-level RoBERTa tokenizer.

    The whole input is encoded in a single call with padding and truncation
    enabled, a 512-token cap, and PyTorch tensors as the return format.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512,
    )
    return tokenizer(texts, **tokenizer_options)


# Encode every split up front (train, then test, then validation).
train_encodings, test_encodings, val_encodings = (
    tokenize(split_texts) for split_texts in (train_texts, test_texts, val_texts)
)


In [47]:
class TextDataset(Dataset):
    """Labeled dataset pairing pre-computed encodings with targets and sample ids.

    `encodings` is a mapping whose values are indexable by sample position;
    `labels`, `video_ids` and `exist_ids` are sequences indexed the same way.
    """

    def __init__(self, encodings, labels, video_ids, exist_ids):
        self.encodings, self.labels = encodings, labels
        self.video_ids, self.exist_ids = video_ids, exist_ids

    def __len__(self):
        # The label list defines how many samples the dataset exposes.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample: every encoding field plus `labels`, `video_ids`, `exist_ids`."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample.update(
            labels=torch.tensor(self.labels[idx]),
            video_ids=self.video_ids[idx],
            exist_ids=self.exist_ids[idx],
        )
        return sample

# Wrap each split's encodings, labels and ids in a TextDataset.
train_dataset = TextDataset(
    encodings=train_encodings, labels=train_labels,
    video_ids=train_video_ids, exist_ids=train_exist_ids,
)
test_dataset = TextDataset(
    encodings=test_encodings, labels=test_labels,
    video_ids=test_video_ids, exist_ids=test_exist_ids,
)
val_dataset = TextDataset(
    encodings=val_encodings, labels=val_labels,
    video_ids=val_video_ids, exist_ids=val_exist_ids,
)

In [48]:
# Only the training split is shuffled; evaluation splits keep their stored order.
BATCH_SIZE = 16
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [49]:
model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=2)

# Encoder layers with an index below this value are frozen (i.e. layers 0-20).
FIRST_TRAINABLE_LAYER = 21


def _should_freeze(param_name):
    """Tell whether a parameter belongs to the embeddings or to a frozen encoder layer."""
    if param_name.startswith("roberta.embeddings"):
        return True
    if "roberta.encoder.layer" not in param_name:
        return False
    # The layer index is the token that follows "layer." in the parameter name.
    layer_index = int(param_name.split("layer.")[1].split(".")[0])
    return layer_index < FIRST_TRAINABLE_LAYER


# Freeze embeddings and the first 21 encoder layers; everything else stays trainable.
for name, param in model.named_parameters():
    if _should_freeze(name):
        param.requires_grad = False

model.to(device)

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-05,
    weight_decay=0.0,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [50]:
epochs = 6

# Per-epoch metric history; each list receives one entry per epoch.
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

for epoch in range(epochs):
    # --- Training ---
    model.train()
    total_loss, correct, total = 0, 0, 0

    loop = tqdm(train_loader, desc=f"Epoch {epoch+1} - Training")
    for batch in loop:
        # Move tensors to the compute device; non-tensor entries pass through unchanged.
        batch = {
            key: (value.to(device) if isinstance(value, torch.Tensor) else value)
            for key, value in batch.items()
        }
        # The id fields are bookkeeping only and are not forwarded to the model.
        inputs = {key: value for key, value in batch.items() if key not in ("video_ids", "exist_ids")}
        labels = batch['labels']

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        loop.set_postfix(loss=batch_loss)

    # Loss is averaged over batches, accuracy over individual samples.
    train_loss = total_loss / len(train_loader)
    train_acc = correct / total
    train_losses.append(train_loss)
    train_accuracies.append(train_acc)

    # --- Validation ---
    model.eval()
    val_loss_sum, correct, total = 0, 0, 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} - Validation"):
            batch = {
                key: (value.to(device) if isinstance(value, torch.Tensor) else value)
                for key, value in batch.items()
            }
            inputs = {key: value for key, value in batch.items() if key not in ("video_ids", "exist_ids")}
            labels = batch['labels']

            outputs = model(**inputs)
            val_loss_sum += outputs.loss.item()
            preds = outputs.logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_loss = val_loss_sum / len(val_loader)
    val_acc = correct / total
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f"[Epoch {epoch+1}] Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

# --- Final Test Evaluation ---
# Score the held-out test split once with the model as it stands after the last epoch.
model.eval()
test_preds, test_targets = [], []
test_loss = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        batch = {
            key: (value.to(device) if isinstance(value, torch.Tensor) else value)
            for key, value in batch.items()
        }
        # Id fields are not model inputs.
        inputs = {key: value for key, value in batch.items() if key not in ("video_ids", "exist_ids")}
        outputs = model(**inputs)
        test_loss += outputs.loss.item()
        test_preds.extend(outputs.logits.argmax(dim=1).cpu().numpy())
        test_targets.extend(batch['labels'].cpu().numpy())

# Mean of the per-batch losses.
test_loss /= len(test_loader)
test_accuracy = accuracy_score(test_targets, test_preds)
print(f"Final Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test Loss: {test_loss:.4f}")

Epoch 1 - Training: 100%|██████████| 37/37 [04:06<00:00,  6.66s/it, loss=0.732]
Epoch 1 - Validation: 100%|██████████| 13/13 [00:29<00:00,  2.23s/it]


[Epoch 1] Train Loss: 0.7064, Train Acc: 0.5129 | Val Loss: 0.6263, Val Acc: 0.5722


Epoch 2 - Training: 100%|██████████| 37/37 [03:57<00:00,  6.43s/it, loss=0.307]
Epoch 2 - Validation: 100%|██████████| 13/13 [00:26<00:00,  2.02s/it]


[Epoch 2] Train Loss: 0.5852, Train Acc: 0.6867 | Val Loss: 0.5114, Val Acc: 0.7320


Epoch 3 - Training: 100%|██████████| 37/37 [03:58<00:00,  6.44s/it, loss=0.565]
Epoch 3 - Validation: 100%|██████████| 13/13 [00:28<00:00,  2.20s/it]


[Epoch 3] Train Loss: 0.5117, Train Acc: 0.7694 | Val Loss: 0.6914, Val Acc: 0.6701


Epoch 4 - Training: 100%|██████████| 37/37 [04:03<00:00,  6.57s/it, loss=0.504]
Epoch 4 - Validation: 100%|██████████| 13/13 [00:27<00:00,  2.15s/it]


[Epoch 4] Train Loss: 0.5285, Train Acc: 0.7246 | Val Loss: 0.4850, Val Acc: 0.7835


Epoch 5 - Training: 100%|██████████| 37/37 [03:57<00:00,  6.43s/it, loss=0.333]
Epoch 5 - Validation: 100%|██████████| 13/13 [00:30<00:00,  2.33s/it]


[Epoch 5] Train Loss: 0.4631, Train Acc: 0.8072 | Val Loss: 0.4683, Val Acc: 0.7835


Epoch 6 - Training: 100%|██████████| 37/37 [03:53<00:00,  6.31s/it, loss=0.196]
Epoch 6 - Validation: 100%|██████████| 13/13 [00:27<00:00,  2.13s/it]


[Epoch 6] Train Loss: 0.4304, Train Acc: 0.8072 | Val Loss: 0.4972, Val Acc: 0.7938


Final Test Evaluation: 100%|██████████| 13/13 [00:33<00:00,  2.56s/it]

Final Test Accuracy: 0.7938
Final Test Loss: 0.5102





In [None]:
# --- Final Test Evaluation ---
# Stand-alone re-run of the test evaluation against whatever `model` and
# `test_loader` are currently in the kernel.
# NOTE(review): the recorded output of this cell (accuracy 0.7179) differs from the
# evaluation at the end of the training cell (0.7938), so the two were presumably
# executed against different model/data states — confirm before quoting either number.
model.eval()
test_preds, test_targets = [], []
test_loss = 0
with torch.no_grad():
    for batch in tqdm(test_loader, desc="Final Test Evaluation"):
        batch = {
            name: (tensor.to(device) if isinstance(tensor, torch.Tensor) else tensor)
            for name, tensor in batch.items()
        }
        inputs = {name: tensor for name, tensor in batch.items() if name not in ("video_ids", "exist_ids")}
        outputs = model(**inputs)
        test_loss += outputs.loss.item()
        batch_preds = torch.argmax(outputs.logits, dim=1)
        test_preds.extend(batch_preds.cpu().numpy())
        test_targets.extend(batch['labels'].cpu().numpy())

test_loss /= len(test_loader)
test_accuracy = accuracy_score(test_targets, test_preds)
print(f"Final Test Accuracy: {test_accuracy:.4f}")
print(f"Final Test Loss: {test_loss:.4f}")

Final Test Evaluation: 100%|██████████| 13/13 [00:06<00:00,  2.09it/s]

Final Test Accuracy: 0.7179
Final Test Loss: 0.5425





## Submission

In [None]:
clean_test_path = "/Users/moiz.ali/Downloads/EXIST 2025 Dataset V0.3/EXIST 2025 Videos Dataset/test/EXIST2025_test_clean.json"
# Transpose after loading so that each sample becomes one row, then keep English only.
clean_test_all = pd.read_json(clean_test_path).T
df = clean_test_all[clean_test_all['lang'] == 'en']
def majority_vote(lst):
    """Return the most frequent value in `lst`.

    When several values tie, the first entry of `pandas.Series.mode()` is returned.
    """
    most_common = pd.Series(lst).mode()
    return most_common.iloc[0]

# Report the (rows, columns) of the English-only frame, then preview its first rows.
print(df.shape)
df.head()

Unnamed: 0,id_Tiktok,id_EXIST,lang,text,video,path_video,url,number_annotators,annotators,gender_annotators,split
420001,7246707608772414722,420001,en,before vd. after face_with_hand_over_mouth f...,7246707608772414722.mp4,videos/7246707608772414722.mp4,https://www.tiktok.com/@le_zero1/video/7246707...,2,"[Annotator_1, Annotator_5]","[F, M]",TEST-VIDEO_EN
420002,7242604463045823749,420002,en,♕. editing him with this intro again because w...,7242604463045823749.mp4,videos/7242604463045823749.mp4,https://www.tiktok.com/@gothrx.00/video/724260...,2,"[Annotator_2, Annotator_6]","[F, F]",TEST-VIDEO_EN
420003,7261068342348303622,420003,en,ni taylor en lo ilustró mejor listen to your ...,7261068342348303622.mp4,videos/7261068342348303622.mp4,https://www.tiktok.com/@danytorresmiau/video/7...,2,"[Annotator_1, Annotator_5]","[F, M]",TEST-VIDEO_EN
420004,7095442059926048005,420004,en,clean the previously detected text by removin...,7095442059926048005.mp4,videos/7095442059926048005.mp4,https://www.tiktok.com/@dany.19.19/video/70954...,2,"[Annotator_1, Annotator_5]","[F, M]",TEST-VIDEO_EN
420005,7231255755746282794,420005,en,my antidepressant my favorite mansplain was wh...,7231255755746282794.mp4,videos/7231255755746282794.mp4,https://www.tiktok.com/@thebrewhounds/video/72...,2,"[Annotator_2, Annotator_6]","[F, F]",TEST-VIDEO_EN


In [None]:
class CleanTextDataset(Dataset):
    """Unlabeled dataset pairing pre-computed encodings with sample ids.

    `encodings` is a mapping whose values are indexable by sample position;
    `video_ids` and `exist_ids` are sequences indexed the same way.
    """

    def __init__(self, encodings, video_ids, exist_ids):
        self.encodings = encodings
        self.video_ids, self.exist_ids = video_ids, exist_ids

    def __len__(self):
        # There are no labels here, so the EXIST id list defines the sample count.
        return len(self.exist_ids)

    def __getitem__(self, idx):
        """Return one sample: every encoding field plus `video_ids` and `exist_ids`."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample.update(
            video_ids=self.video_ids[idx],
            exist_ids=self.exist_ids[idx],
        )
        return sample

In [None]:
# NOTE(review): the classifier was fine-tuned on description_fp/analysis_fp/analysis_fn
# text, while this cell feeds it the dataset's raw `text` field — confirm that this
# input mismatch is intended.
clean_test_data = df[['id_EXIST','video','text']]
clean_test_texts, clean_test_video_ids, clean_test_exist_ids = (
    clean_test_data[column].tolist() for column in ('text', 'video', 'id_EXIST')
)
clean_test_encodings = tokenize(clean_test_texts)
clean_test_dataset = CleanTextDataset(
    encodings=clean_test_encodings,
    video_ids=clean_test_video_ids,
    exist_ids=clean_test_exist_ids,
)
# Order is preserved so predictions stay aligned with the ids.
clean_test_loader = DataLoader(dataset=clean_test_dataset, batch_size=16, shuffle=False)

In [None]:
import torch
import torch.nn.functional as F

def generate_predictions_clean_testing(model, dataloader):
    """Run the classifier over an unlabeled dataloader and collect hard YES/NO labels.

    Args:
        model: Classifier that is called with the batch's non-id fields as
            keyword arguments and returns an object exposing `.logits`.
            Class index 1 is reported as "YES", every other index as "NO".
        dataloader: Iterable of batch dicts holding the model inputs plus
            `video_ids` and `exist_ids` entries.

    Returns:
        Six NumPy arrays aligned by sample, in this order:
        exist_ids, video_ids, probs (softmax over the logits), preds (argmax
        class index), preds_text ("YES"/"NO") and titles (the constant
        "EXIST2025" repeated once per sample).
    """
    id_keys = ("video_ids", "exist_ids")

    def _to_list(values):
        # Integer ids are collated into a tensor. Unpack them to plain Python
        # values so the final np.array(...) also works when running on a GPU.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().tolist()
        return list(values)

    model.eval()
    all_exist_ids, all_video_ids = [], []
    all_probs, all_preds = [], []
    all_preds_text, all_titles = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Predictions"):
            # Only the model inputs are moved to the device; id fields stay where they are.
            inputs = {
                k: (v.to(device) if isinstance(v, torch.Tensor) else v)
                for k, v in batch.items()
                if k not in id_keys
            }
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=1)  # Get probabilities from logits
            preds = torch.argmax(probs, dim=1).cpu().numpy()  # Get predicted class
            # `preds` holds class indices, so compare against the positive index directly.
            preds_mapped = ["YES" if pred == 1 else "NO" for pred in preds]

            all_exist_ids.extend(_to_list(batch['exist_ids']))
            all_video_ids.extend(_to_list(batch['video_ids']))
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds)
            all_preds_text.extend(preds_mapped)
            all_titles.extend(["EXIST2025"] * len(preds_mapped))

    return np.array(all_exist_ids), np.array(all_video_ids), np.array(all_probs), np.array(all_preds), np.array(all_preds_text), np.array(all_titles)

In [None]:
import torch
import torch.nn.functional as F

def generate_predictions_clean_testing_soft(model, dataloader):
    """Run the classifier over an unlabeled dataloader and collect soft YES/NO scores.

    Args:
        model: Classifier that is called with the batch's non-id fields as
            keyword arguments and returns an object exposing `.logits`.
            Class index 1 is the "YES" class and index 0 the "NO" class,
            matching the hard-label prediction function in this notebook.
        dataloader: Iterable of batch dicts holding the model inputs plus
            `video_ids` and `exist_ids` entries.

    Returns:
        Seven NumPy arrays aligned by sample, in this order:
        exist_ids, video_ids, probs (softmax over the logits), preds (argmax
        class index), preds_yes (probability of class 1), preds_no
        (probability of class 0) and titles (the constant "EXIST2025"
        repeated once per sample).
    """
    id_keys = ("video_ids", "exist_ids")

    def _to_list(values):
        # Integer ids are collated into a tensor. Unpack them to plain Python
        # values so the final np.array(...) also works when running on a GPU.
        if isinstance(values, torch.Tensor):
            return values.detach().cpu().tolist()
        return list(values)

    model.eval()
    all_exist_ids, all_video_ids = [], []
    all_probs, all_preds = [], []
    all_preds_yes, all_preds_no = [], []
    all_titles = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Generating Predictions"):
            # Only the model inputs are moved to the device; id fields stay where they are.
            inputs = {
                k: (v.to(device) if isinstance(v, torch.Tensor) else v)
                for k, v in batch.items()
                if k not in id_keys
            }
            logits = model(**inputs).logits
            probs = F.softmax(logits, dim=1)  # Get probabilities from logits
            preds = torch.argmax(probs, dim=1)  # Get predicted class
            # Column 1 is the YES class and column 0 the NO class; the previous
            # code had these two swapped, inverting every soft score.
            preds_mapped_yes = probs[:, 1]
            preds_mapped_no = probs[:, 0]

            all_exist_ids.extend(_to_list(batch['exist_ids']))
            all_video_ids.extend(_to_list(batch['video_ids']))
            all_probs.extend(probs.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
            all_preds_yes.extend(preds_mapped_yes.cpu().numpy())
            all_preds_no.extend(preds_mapped_no.cpu().numpy())
            all_titles.extend(["EXIST2025"] * len(preds_mapped_yes))

    return np.array(all_exist_ids), np.array(all_video_ids), np.array(all_probs), np.array(all_preds), np.array(all_preds_yes), np.array(all_preds_no), np.array(all_titles)

In [None]:
# Run the soft-label pass over the clean test loader and unpack its seven outputs.
soft_outputs = generate_predictions_clean_testing_soft(model, clean_test_loader)
exist_ids, video_ids, probs, preds, preds_yes, preds_no, titles = soft_outputs

# One row per sample; the two probability columns are merged into a single
# "value" mapping by a later cell.
df = pd.DataFrame(
    dict(
        test_case=titles,
        id=exist_ids,
        value_yes=preds_yes,
        value_no=preds_no,
    )
)

# df.to_csv("clean_text_test_predictions_soft.csv", index=False)
# print("Saved predictions to clean_text_test_predictions_soft.csv")

Generating Predictions: 100%|██████████| 24/24 [00:11<00:00,  2.17it/s]

Saved predictions to clean_text_test_predictions_soft.csv





In [None]:
def _soft_label(row):
    """Pack one row's two probabilities into the {"YES": ..., "NO": ...} mapping."""
    return {"YES": row["value_yes"], "NO": row["value_no"]}


# Replace the two probability columns with a single per-row "value" mapping.
df = df.assign(value=df.apply(_soft_label, axis=1)).drop(columns=["value_yes", "value_no"])

df.to_json("clean_text_test_predictions_soft.json", orient="records", indent=2)
print("Saved predictions to clean_text_test_predictions_soft.json")

Saved predictions to clean_text_test_predictions_soft.json


In [None]:
# Run the hard-label pass over the clean test loader and unpack its six outputs.
hard_outputs = generate_predictions_clean_testing(model, clean_test_loader)
exist_ids, video_ids, probs, preds, preds_text, titles = hard_outputs

# One row per sample with the YES/NO label as its value.
df = pd.DataFrame(
    dict(
        test_case=titles,
        id=exist_ids,
        value=preds_text,
    )
)

# df.to_csv("clean_text_test_predictions.csv", index=False)
# print("Saved predictions to clean_text_test_predictions.csv")

Generating Predictions: 100%|██████████| 24/24 [00:17<00:00,  1.34it/s]


Saved predictions to clean_text_test_predictions.csv


In [None]:
# Write the current `df` to disk as JSON records.
# NOTE(review): `df` is reassigned by several cells; this assumes the hard-label
# predictions cell was the last one to set it — confirm before running out of order.
df.to_json("clean_text_test_predictions.json", orient="records", indent=2)
print("Saved predictions to clean_text_test_predictions.json")

Saved predictions to clean_text_test_predictions.json
