In [87]:
import torch
#torch.cuda.empty_cache()
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
#torch.cuda.memory_summary(device=None, abbreviated=False)
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd
import numpy as np
import torch


## Train Test Split

In [None]:
test_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/test_df.csv"
val_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/valid_df.csv"
train_path = "/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/train_test_split/train_df.csv"

# Read the CSV files
test_df = pd.read_csv(test_path)
val_df = pd.read_csv(val_path)
train_df = pd.read_csv(train_path)

# Combine all dataframes
df = pd.concat([train_df, val_df], ignore_index=True)


# Drop the individual columns since we've combined them
# Concatenate description_fp, analysis_fp, and analysis_fn into text column
df['text'] = df['description_fp'] + ' ' + df['analysis_fp'] + ' ' + df['analysis_fn']
test_df['text'] = test_df['description_fp'] + ' ' + test_df['analysis_fp'] + ' ' + test_df['analysis_fn']

# description_fp+analysis_fp+description_fn (Mean Test Accuracy: 0.8216 ± 0.0177)
# df['text'] = df['description_fp'] + ' ' + df['analysis_fp'] + ' ' + df['description_fn']
# test_df['text'] = test_df['description_fp'] + ' ' + test_df['analysis_fp'] + ' ' + test_df['description_fn']

# Drop the individual columns since we've combined them
df = df.drop(columns=['description_fp', 'analysis_fp', 'description_fn'])

# Convert target values to YES/NO for both dataframes
df['target'] = df['target'].map({1: 'YES', 0: 'NO'})
test_df['target'] = test_df['target'].map({1: 'YES', 0: 'NO'})

# Remove rows where text is NA
df = df.dropna(subset=['text'])
test_df = test_df.dropna(subset=['text'])

# Map target values to 1/0 for both dataframes
df['target'] = df['target'].map({'YES': 1, 'NO': 0})
test_df['target'] = test_df['target'].map({'YES': 1, 'NO': 0})


print(df.shape)
df.head()

print(df['text'].iloc[0])







(775, 18)
A woman in a TikTok video recounts an interaction with a man who said he doesn't like women who are beautiful and know it, and she responds by deconstructing his statement as a reflection of his own insecurities. The video criticizes sexism by highlighting and challenging a man's prejudiced statement against confident, beautiful women. It reframes his dislike as an insecurity stemming from the belief that he has nothing to offer such women, thereby critiquing the misogynistic view that women's self-awareness of their beauty is a negative trait. The video features a woman recounting and satirically critiquing a man's sexist comments about women's beauty and confidence, with the video's framing clearly opposing and mocking these sexist views.


# Stratified KFold

In [89]:
from sklearn.model_selection import KFold
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score

tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

class TextDataset(Dataset):
    def __init__(self, encodings, labels, video_ids, exist_ids):
        self.encodings = encodings
        self.labels = labels
        self.video_ids = video_ids
        self.exist_ids = exist_ids

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['video_ids'] = self.video_ids[idx]
        item['exist_ids'] = self.exist_ids[idx]
        return item

def tokenize(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512
    )

def get_loader(df, batch_size=16, shuffle=False):
    texts = df['text'].tolist()
    labels = df['target'].tolist()
    video_ids = df['video'].tolist()
    exist_ids = df['id_EXIST'].tolist()
    encodings = tokenize(texts)
    dataset = TextDataset(encodings, labels, video_ids, exist_ids)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


In [90]:
def train_and_validate(train_loader, val_loader, device, num_labels=2, freeze_layers_up_to=20, epochs=6):
    model = RobertaForSequenceClassification.from_pretrained("roberta-large", num_labels=num_labels)
    
    # Freeze embeddings and encoder layers up to `freeze_layers_up_to`
    for name, param in model.named_parameters():
        if name.startswith("roberta.embeddings"):
            param.requires_grad = False
        elif "roberta.encoder.layer" in name:
            layer_num = int(name.split("layer.")[1].split(".")[0])
            if layer_num <= freeze_layers_up_to:
                param.requires_grad = False
    
    model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.0)

    for epoch in range(epochs):
        # --- Training ---
        model.train()
        total_loss, correct, total = 0, 0, 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} - Train"):
            batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
            inputs = {k: v for k, v in batch.items() if k not in ["video_ids", "exist_ids"]}
            outputs = model(**inputs)
            loss, logits = outputs.loss, outputs.logits

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            correct += (torch.argmax(logits, dim=1) == batch['labels']).sum().item()
            total += batch['labels'].size(0)

        train_acc = correct / total

        # --- Validation ---
        model.eval()
        val_loss, correct, total = 0, 0, 0
        with torch.no_grad():
            for batch in tqdm(val_loader, desc=f"Epoch {epoch+1} - Val"):
                batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
                inputs = {k: v for k, v in batch.items() if k not in ["video_ids", "exist_ids"]}
                outputs = model(**inputs)
                val_loss += outputs.loss.item()
                correct += (torch.argmax(outputs.logits, dim=1) == batch['labels']).sum().item()
                total += batch['labels'].size(0)

        val_acc = correct / total
        print(f"[Epoch {epoch+1}] Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    return model


In [81]:
import os
def run_kfold(df, test_df, k=5, device='cuda'):
    
    # Split the data into train+validation and test sets
    df_train_valid = df.copy()
    test_df = test_df.copy()
    
    print(f"Train+Validation set shape: {df_train_valid.shape}")
    print(f"Test set shape: {test_df.shape}")
    print("Columns in df_train_valid:")
    print(df_train_valid.columns.tolist())
    

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = []

    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(df_train_valid)):
        print(f"\n===== Fold {fold_idx+1} =====")
        train_df = df_train_valid.iloc[train_idx]
        val_df = df_train_valid.iloc[val_idx]

        print(f"Train set shape: {train_df.shape}")
        print(f"Validation set shape: {val_df.shape}")

        train_loader = get_loader(train_df, shuffle=True)
        val_loader = get_loader(val_df)
        model = train_and_validate(train_loader, val_loader, device)
        # Save the model for this fold
        model_save_path = f'models/roberta_large_fold_{fold_idx+1}.pt'
        os.makedirs('models', exist_ok=True)
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved to {model_save_path}")

        # Optional: Evaluate on the held-out test set
        test_loader = get_loader(test_df)
        model.eval()
        preds, targets = [], []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc="Testing"):
                batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
                inputs = {k: v for k, v in batch.items() if k not in ["video_ids", "exist_ids"]}
                outputs = model(**inputs)
                preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
                targets.extend(batch['labels'].cpu().numpy())
        
        test_acc = accuracy_score(targets, preds)
        print(f"Fold {fold_idx+1} Test Accuracy: {test_acc:.4f}")
        fold_results.append(test_acc)

    print("\n=== K-Fold Summary ===")
    print(f"Mean Test Accuracy: {np.mean(fold_results):.4f} ± {np.std(fold_results):.4f}")


In [91]:
run_kfold(df, test_df, k=5, device='mps')

Train+Validation set shape: (775, 18)
Test set shape: (194, 21)
Columns in df_train_valid:
['id_Tiktok', 'id_EXIST', 'lang', 'text', 'video', 'path_video', 'url', 'annotators', 'number_annotators', 'gender_annotators', 'labels_task3_1', 'labels_task3_2', 'labels_task3_3', 'split', 'target', 'label_fp', 'label_fn', 'analysis_fn']

===== Fold 1 =====
Train set shape: (620, 18)
Validation set shape: (155, 18)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Train: 100%|██████████| 39/39 [01:01<00:00,  1.57s/it]
Epoch 1 - Val: 100%|██████████| 10/10 [00:08<00:00,  1.12it/s]


[Epoch 1] Train Acc: 0.5403 | Val Acc: 0.5677


Epoch 2 - Train: 100%|██████████| 39/39 [00:37<00:00,  1.03it/s]
Epoch 2 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.36it/s]


[Epoch 2] Train Acc: 0.6500 | Val Acc: 0.8129


Epoch 3 - Train: 100%|██████████| 39/39 [00:37<00:00,  1.04it/s]
Epoch 3 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.37it/s]


[Epoch 3] Train Acc: 0.8113 | Val Acc: 0.8323


Epoch 4 - Train: 100%|██████████| 39/39 [00:37<00:00,  1.03it/s]
Epoch 4 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.36it/s]


[Epoch 4] Train Acc: 0.8097 | Val Acc: 0.8387


Epoch 5 - Train: 100%|██████████| 39/39 [00:40<00:00,  1.04s/it]
Epoch 5 - Val: 100%|██████████| 10/10 [00:08<00:00,  1.24it/s]


[Epoch 5] Train Acc: 0.8629 | Val Acc: 0.8516


Epoch 6 - Train: 100%|██████████| 39/39 [00:40<00:00,  1.03s/it]
Epoch 6 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.31it/s]


[Epoch 6] Train Acc: 0.9210 | Val Acc: 0.8516
Model saved to models/roberta_large_fold_1.pt


Testing: 100%|██████████| 13/13 [00:21<00:00,  1.62s/it]


Fold 1 Test Accuracy: 0.8454

===== Fold 2 =====
Train set shape: (620, 18)
Validation set shape: (155, 18)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Train: 100%|██████████| 39/39 [00:47<00:00,  1.22s/it]
Epoch 1 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.33it/s]


[Epoch 1] Train Acc: 0.5081 | Val Acc: 0.5742


Epoch 2 - Train: 100%|██████████| 39/39 [00:42<00:00,  1.08s/it]
Epoch 2 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


[Epoch 2] Train Acc: 0.5855 | Val Acc: 0.6065


Epoch 3 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.06s/it]
Epoch 3 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


[Epoch 3] Train Acc: 0.7387 | Val Acc: 0.7806


Epoch 4 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.07s/it]
Epoch 4 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


[Epoch 4] Train Acc: 0.7774 | Val Acc: 0.8194


Epoch 5 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.06s/it]
Epoch 5 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.51it/s]


[Epoch 5] Train Acc: 0.8032 | Val Acc: 0.8129


Epoch 6 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.05s/it]
Epoch 6 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.50it/s]


[Epoch 6] Train Acc: 0.8581 | Val Acc: 0.8194
Model saved to models/roberta_large_fold_2.pt


Testing: 100%|██████████| 13/13 [00:18<00:00,  1.44s/it]


Fold 2 Test Accuracy: 0.8144

===== Fold 3 =====
Train set shape: (620, 18)
Validation set shape: (155, 18)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Train: 100%|██████████| 39/39 [00:53<00:00,  1.38s/it]
Epoch 1 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.44it/s]


[Epoch 1] Train Acc: 0.5290 | Val Acc: 0.5677


Epoch 2 - Train: 100%|██████████| 39/39 [00:43<00:00,  1.13s/it]
Epoch 2 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


[Epoch 2] Train Acc: 0.6726 | Val Acc: 0.7290


Epoch 3 - Train: 100%|██████████| 39/39 [00:49<00:00,  1.28s/it]
Epoch 3 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


[Epoch 3] Train Acc: 0.7694 | Val Acc: 0.7161


Epoch 4 - Train: 100%|██████████| 39/39 [00:47<00:00,  1.23s/it]
Epoch 4 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.56it/s]


[Epoch 4] Train Acc: 0.8145 | Val Acc: 0.7677


Epoch 5 - Train: 100%|██████████| 39/39 [00:55<00:00,  1.42s/it]
Epoch 5 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.57it/s]


[Epoch 5] Train Acc: 0.8387 | Val Acc: 0.7290


Epoch 6 - Train: 100%|██████████| 39/39 [00:51<00:00,  1.31s/it]
Epoch 6 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.58it/s]


[Epoch 6] Train Acc: 0.8726 | Val Acc: 0.7290
Model saved to models/roberta_large_fold_3.pt


Testing: 100%|██████████| 13/13 [00:14<00:00,  1.15s/it]


Fold 3 Test Accuracy: 0.8196

===== Fold 4 =====
Train set shape: (620, 18)
Validation set shape: (155, 18)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Train: 100%|██████████| 39/39 [00:51<00:00,  1.33s/it]
Epoch 1 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.48it/s]


[Epoch 1] Train Acc: 0.5339 | Val Acc: 0.5548


Epoch 2 - Train: 100%|██████████| 39/39 [00:42<00:00,  1.08s/it]
Epoch 2 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


[Epoch 2] Train Acc: 0.5774 | Val Acc: 0.6903


Epoch 3 - Train: 100%|██████████| 39/39 [00:42<00:00,  1.09s/it]
Epoch 3 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


[Epoch 3] Train Acc: 0.6565 | Val Acc: 0.6968


Epoch 4 - Train: 100%|██████████| 39/39 [00:42<00:00,  1.08s/it]
Epoch 4 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


[Epoch 4] Train Acc: 0.7887 | Val Acc: 0.7032


Epoch 5 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.06s/it]
Epoch 5 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


[Epoch 5] Train Acc: 0.8419 | Val Acc: 0.7484


Epoch 6 - Train: 100%|██████████| 39/39 [00:41<00:00,  1.06s/it]
Epoch 6 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.61it/s]


[Epoch 6] Train Acc: 0.8871 | Val Acc: 0.7677
Model saved to models/roberta_large_fold_4.pt


Testing: 100%|██████████| 13/13 [00:21<00:00,  1.66s/it]


Fold 4 Test Accuracy: 0.8351

===== Fold 5 =====
Train set shape: (620, 18)
Validation set shape: (155, 18)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 - Train: 100%|██████████| 39/39 [04:04<00:00,  6.27s/it]
Epoch 1 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


[Epoch 1] Train Acc: 0.5597 | Val Acc: 0.5161


Epoch 2 - Train: 100%|██████████| 39/39 [03:11<00:00,  4.90s/it]
Epoch 2 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.62it/s]


[Epoch 2] Train Acc: 0.7016 | Val Acc: 0.7806


Epoch 3 - Train: 100%|██████████| 39/39 [02:41<00:00,  4.13s/it]
Epoch 3 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.64it/s]


[Epoch 3] Train Acc: 0.7871 | Val Acc: 0.8323


Epoch 4 - Train: 100%|██████████| 39/39 [00:49<00:00,  1.28s/it]
Epoch 4 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.59it/s]


[Epoch 4] Train Acc: 0.8274 | Val Acc: 0.7806


Epoch 5 - Train: 100%|██████████| 39/39 [17:37<00:00, 27.11s/it]   
Epoch 5 - Val: 100%|██████████| 10/10 [00:06<00:00,  1.49it/s]


[Epoch 5] Train Acc: 0.8500 | Val Acc: 0.8323


Epoch 6 - Train: 100%|██████████| 39/39 [03:12<00:00,  4.92s/it]
Epoch 6 - Val: 100%|██████████| 10/10 [00:07<00:00,  1.42it/s]


[Epoch 6] Train Acc: 0.9081 | Val Acc: 0.7871
Model saved to models/roberta_large_fold_5.pt


Testing: 100%|██████████| 13/13 [00:16<00:00,  1.23s/it]

Fold 5 Test Accuracy: 0.7938

=== K-Fold Summary ===
Mean Test Accuracy: 0.8216 ± 0.0177





In [83]:
# Check where label_fp equals label_fn
matching_labels = test_df[test_df['label_fp'] == test_df['label_fn']]

# Calculate accuracy between matching labels and target
# Convert target to YES/NO labels
matching_labels['target_label'] = matching_labels['target'].map({1: 'YES', 0: 'NO'})
correct_predictions = (matching_labels['label_fp'] == matching_labels['target_label']).sum()
total_predictions = len(matching_labels)
accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0

print(f"Number of cases where label_fp equals label_fn: {total_predictions}")
print(f"Accuracy of these labels compared to target: {accuracy:.2%}")

# Calculate percentage of rows where fp equals fn
total_rows = len(test_df)
matching_rows = len(matching_labels)
percentage = (matching_rows / total_rows) * 100

print(f"Total rows in test set: {total_rows}")
print(f"Rows where fp equals fn: {matching_rows}")
print(f"Percentage: {percentage:.2f}%")



Number of cases where label_fp equals label_fn: 103
Accuracy of these labels compared to target: 85.44%
Total rows in test set: 194
Rows where fp equals fn: 103
Percentage: 53.09%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  matching_labels['target_label'] = matching_labels['target'].map({1: 'YES', 0: 'NO'})


In [86]:
# Get rows where label_fp does not equal label_fn
non_matching_labels = test_df[test_df['label_fp'] != test_df['label_fn']]

# Create a new DataFrame with just the text and target columns for non-matching labels
non_matching_df = non_matching_labels[['text', 'target','video','id_EXIST']].copy()

# Create a new dataloader for non-matching labels
non_matching_loader = get_loader(non_matching_df)

# Load the model from the checkpoint
# model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)
# model.load_state_dict(torch.load('/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/gemini/models/roberta_large_fold_4.pt'))
# model = model.to(device)

state_dict = torch.load('/Volumes/T7/OMSCS/CLEF2025/EXIST2025/exist-2025/notebooks/gemini/models/roberta_large_fold_5.pt', map_location='cpu')

# Load pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=2)

# Then load weights
model.load_state_dict(state_dict)

# Now try moving to MPS (may still fail, but less likely)
model = model.to(device)

# Get predictions for non-matching labels
model.eval()
non_matching_preds, non_matching_targets = [], []
with torch.no_grad():
    for batch in tqdm(non_matching_loader, desc="Predicting non-matching labels"):
        batch = {k: (v.to(device) if isinstance(v, torch.Tensor) else v) for k, v in batch.items()}
        inputs = {k: v for k, v in batch.items() if k not in ["video_ids", "exist_ids"]}
        outputs = model(**inputs)
        non_matching_preds.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
        non_matching_targets.extend(batch['labels'].cpu().numpy())

# Calculate accuracy for non-matching labels
non_matching_acc = accuracy_score(non_matching_targets, non_matching_preds)
print(f"\nNon-matching labels accuracy: {non_matching_acc:.4f}")

# Add predictions back to the original DataFrame
non_matching_labels['model_prediction'] = non_matching_preds
non_matching_labels['model_prediction_label'] = non_matching_labels['model_prediction'].map({1: 'YES', 0: 'NO'})

# Compare model predictions with fp and fn labels
fp_agreement = (non_matching_labels['model_prediction_label'] == non_matching_labels['label_fp']).mean()
fn_agreement = (non_matching_labels['model_prediction_label'] == non_matching_labels['label_fn']).mean()

print(f"\nModel agreement with label_fp: {fp_agreement:.2%}")
print(f"Model agreement with label_fn: {fn_agreement:.2%}")

# Display some examples where model disagrees with both fp and fn
disagreements = non_matching_labels[
    (non_matching_labels['model_prediction_label'] != non_matching_labels['label_fp']) & 
    (non_matching_labels['model_prediction_label'] != non_matching_labels['label_fn'])
]

print(f"\nNumber of cases where model disagrees with both fp and fn: {len(disagreements)}")
if len(disagreements) > 0:
    print("\nExample disagreements:")
    for _, row in disagreements.head(3).iterrows():
        print(f"\nText: {row['text']}")
        print(f"Target: {row['target_label']}")
        print(f"Model prediction: {row['model_prediction_label']}")
        print(f"label_fp: {row['label_fp']}")
        print(f"label_fn: {row['label_fn']}")



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Predicting non-matching labels: 100%|██████████| 6/6 [00:29<00:00,  4.96s/it]


Non-matching labels accuracy: 0.7582

Model agreement with label_fp: 65.93%
Model agreement with label_fn: 34.07%

Number of cases where model disagrees with both fp and fn: 0



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_matching_labels['model_prediction'] = non_matching_preds
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_matching_labels['model_prediction_label'] = non_matching_labels['model_prediction'].map({1: 'YES', 0: 'NO'})
