# ROBERTA BASE HYPERPARAMETER TUNING 

In [11]:
import pandas as pd
import torch
import numpy as np
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import gc
import os


os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = '0.0'

train_raw = pd.read_csv("/Users/arwynlewis/Desktop/NLP/NLP2025_SemEvalTask9/dev_phase/subtask1/train/eng.csv")

print(f"Dataset size: {len(train_raw)} samples")
print(f"Dataset memory: {train_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Dataset size: 2676 samples
Dataset memory: 0.60 MB


In [12]:
# Device setup with fallback to CPU if needed
USE_CPU = False  

if USE_CPU:
    device = torch.device('cpu')
    print("Using CPU")
elif torch.backends.mps.is_available():
    device = torch.device('mps')
    print("Using MPS")
    torch.mps.empty_cache()
else:
    device = torch.device('cpu')
    print("Using CPU")

gc.collect()
print(f"Device: {device}")

Using MPS
Device: mps


In [13]:
from sklearn.model_selection import ParameterGrid

# MINIMAL parameter grid 
param_grid = {
    "learning_rate": [2e-5, 3e-5],
    "num_train_epochs": [3],  
    "per_device_train_batch_size": [1, 2],  
    "gradient_accumulation_steps": [32]  
}

grid = list(ParameterGrid(param_grid))
print(f"Total param combinations: {len(grid)}")

Total param combinations: 4


In [14]:
from sklearn.model_selection import StratifiedKFold

# Use 3 folds
k = 3
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

In [15]:
from sklearn.model_selection import train_test_split

# Clean NaNs
train_raw = train_raw.dropna(subset=['text', 'polarization']).reset_index(drop=True)

# Split train into train/validation
train, test = train_test_split(train_raw, test_size=0.1, random_state=42, stratify=train_raw['polarization'])

print(f"Training samples: {len(train)}")
print(f"Test samples: {len(test)}")

Training samples: 2408
Test samples: 268


In [16]:
# Dataset class
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(int(label), dtype=torch.long)
        return item

In [17]:
# Load RoBERTa tokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [18]:
# Metrics 
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {'f1_macro': f1_score(p.label_ids, preds, average='macro')}

In [19]:
# AGRESSIVE memory cleanup
def cleanup_memory():
    gc.collect()
    gc.collect() 
    if torch.backends.mps.is_available() and not USE_CPU:
        torch.mps.empty_cache()
        torch.mps.synchronize()  
    elif torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

In [20]:
all_results = []

for param_idx, params in enumerate(grid):
    print(f"\n{'='*80}")
    print(f"Testing param combination {param_idx + 1}/{len(grid)}")
    print(f"Params: {params}")
    print(f"Effective batch size: {params['per_device_train_batch_size'] * params['gradient_accumulation_steps']}")
    print(f"{'='*80}\n")
    
    f1_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(train['text'], train['polarization'])):
        print(f"\nFold {fold_idx + 1}/{k}")
        
        # Clean memory
        cleanup_memory()
        
        # Split data
        train_fold = train.iloc[train_idx].reset_index(drop=True)
        val_fold = train.iloc[val_idx].reset_index(drop=True)

        # Create datasets
        train_dataset = PolarizationDataset(
            train_fold['text'].tolist(),
            train_fold['polarization'].tolist(),
            tokenizer
        )
        val_dataset = PolarizationDataset(
            val_fold['text'].tolist(),
            val_fold['polarization'].tolist(),
            tokenizer
        )
        
        # Re-initialize model for each fold
        model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)
        
        # Enable gradient checkpointing (memory saver)
        model.gradient_checkpointing_enable()
        
        # Move model to device
        if not USE_CPU:
            model = model.to(device)

        # Training arguments 
        training_args = TrainingArguments(
            output_dir="./roberta_output",
            num_train_epochs=params['num_train_epochs'],
            learning_rate=params['learning_rate'],
            per_device_train_batch_size=params['per_device_train_batch_size'],
            per_device_eval_batch_size=1,  # MINIMUM - batch size 1 for evaluation
            gradient_accumulation_steps=params['gradient_accumulation_steps'],
            eval_strategy="epoch",
            save_strategy="no",
            logging_steps=100,
            disable_tqdm=False,
            report_to=[],
            fp16=False, 
            dataloader_num_workers=0,
            load_best_model_at_end=False,
            dataloader_pin_memory=False,
            max_grad_norm=1.0,
            # Additional memory optimizations
            gradient_checkpointing=True,
            optim="adamw_torch",  
            save_total_limit=0,  
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
            data_collator=DataCollatorWithPadding(tokenizer)
        )

        try:
            trainer.train()
            eval_results = trainer.evaluate()
            f1_scores.append(eval_results['eval_f1_macro'])
            
            print(f"Fold {fold_idx + 1} F1: {eval_results['eval_f1_macro']:.4f}")
            
        except RuntimeError as e:
            if "out of memory" in str(e):
                print(f"\n{'!'*80}")
                print("MEMORY ERROR DETECTED!")
                print(f"{'!'*80}")
                print("\nOptions:")
                print("1. Set USE_CPU = True in the device setup cell and rerun")
                print("2. Close other applications to free up memory")
                print("3. Consider using a smaller model like DistilRoBERTa")
                print(f"\n{'!'*80}\n")
                raise
            else:
                raise
        
        # Clean up after each fold
        del model, trainer, train_dataset, val_dataset
        cleanup_memory()

    if f1_scores: 
        mean_f1 = np.mean(f1_scores)
        std_f1 = np.std(f1_scores)
        print(f"\n{'='*80}")
        print(f"Params: {params}")
        print(f"Mean Macro F1: {mean_f1:.4f} (+/- {std_f1:.4f})")
        print(f"Effective batch size: {params['per_device_train_batch_size'] * params['gradient_accumulation_steps']}")
        print(f"{'='*80}\n")
        
        all_results.append({
            'params': params,
            'mean_f1': mean_f1,
            'std_f1': std_f1,
            'fold_scores': f1_scores
        })


Testing param combination 1/4
Params: {'gradient_accumulation_steps': 32, 'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 1}
Effective batch size: 32


Fold 1/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.501132,0.757495
2,0.506800,0.435524,0.788673
3,0.506800,0.442344,0.788531


Fold 1 F1: 0.7885

Fold 2/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.472039,0.767106
2,0.503600,0.439284,0.793216
3,0.503600,0.424486,0.809545


Fold 2 F1: 0.8095

Fold 3/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.461409,0.781613
2,0.511200,0.421445,0.808811
3,0.511200,0.416828,0.799891


Fold 3 F1: 0.7999

Params: {'gradient_accumulation_steps': 32, 'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 1}
Mean Macro F1: 0.7993 (+/- 0.0086)
Effective batch size: 32


Testing param combination 2/4
Params: {'gradient_accumulation_steps': 32, 'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 2}
Effective batch size: 64


Fold 1/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.515254,0.708022
2,No log,0.428661,0.791964
3,No log,0.422984,0.792364


Fold 1 F1: 0.7924

Fold 2/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.551985,0.636074
2,No log,0.426386,0.785275
3,No log,0.428645,0.797414


Fold 2 F1: 0.7974

Fold 3/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.521209,0.639669
2,No log,0.418187,0.80552
3,No log,0.404871,0.806641


Fold 3 F1: 0.8066

Params: {'gradient_accumulation_steps': 32, 'learning_rate': 2e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 2}
Mean Macro F1: 0.7988 (+/- 0.0059)
Effective batch size: 64


Testing param combination 3/4
Params: {'gradient_accumulation_steps': 32, 'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 1}
Effective batch size: 32


Fold 1/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.438395,0.789269
2,0.471100,0.429581,0.791846
3,0.471100,0.476603,0.796383


Fold 1 F1: 0.7964

Fold 2/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.435801,0.779947
2,0.498900,0.521014,0.761729
3,0.498900,0.431992,0.809649


Fold 2 F1: 0.8096

Fold 3/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.431749,0.798951
2,0.490900,0.405508,0.816614
3,0.490900,0.422359,0.797719


Fold 3 F1: 0.7977

Params: {'gradient_accumulation_steps': 32, 'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 1}
Mean Macro F1: 0.8013 (+/- 0.0060)
Effective batch size: 32


Testing param combination 4/4
Params: {'gradient_accumulation_steps': 32, 'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 2}
Effective batch size: 64


Fold 1/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.484357,0.75776
2,No log,0.420975,0.794119
3,No log,0.434525,0.799346


Fold 1 F1: 0.7993

Fold 2/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.494581,0.754347
2,No log,0.474881,0.783556
3,No log,0.431234,0.807921


Fold 2 F1: 0.8079

Fold 3/3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Macro
1,No log,0.457599,0.783744
2,No log,0.427116,0.784361
3,No log,0.407151,0.811211


Fold 3 F1: 0.8112

Params: {'gradient_accumulation_steps': 32, 'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 2}
Mean Macro F1: 0.8062 (+/- 0.0050)
Effective batch size: 64



In [24]:
# Display all results sorted by mean F1
if all_results:
    results_df = pd.DataFrame([
        {
            'learning_rate': r['params']['learning_rate'],
            'epochs': r['params']['num_train_epochs'],
            'batch_size': r['params']['per_device_train_batch_size'],
            'grad_accum': r['params']['gradient_accumulation_steps'],
            'effective_batch': r['params']['per_device_train_batch_size'] * r['params']['gradient_accumulation_steps'],
            'mean_f1': r['mean_f1'],
            'std_f1': r['std_f1']
        }
        for r in all_results
    ])

    results_df = results_df.sort_values('mean_f1', ascending=False)
    print("\nAll Results (sorted by Mean F1):")
    print(results_df.to_string(index=False))

    print("\n" + "="*80)
    print("BEST PARAMETERS:")
    best_result = max(all_results, key=lambda x: x['mean_f1'])
    print(f"Parameters: {best_result['params']}")
    print(f"Mean Macro F1: {best_result['mean_f1']:.4f} (+/- {best_result['std_f1']:.4f})")
    print(f"Fold scores: {[f'{score:.4f}' for score in best_result['fold_scores']]}")
    print("="*80)
else:
    print("No results available. Check if training completed successfully.")


All Results (sorted by Mean F1):
 learning_rate  epochs  batch_size  grad_accum  effective_batch  mean_f1   std_f1
       0.00003       3           2          32               64 0.806159 0.005001
       0.00003       3           1          32               32 0.801250 0.005963
       0.00002       3           1          32               32 0.799322 0.008588
       0.00002       3           2          32               64 0.798806 0.005911

BEST PARAMETERS:
Parameters: {'gradient_accumulation_steps': 32, 'learning_rate': 3e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 2}
Mean Macro F1: 0.8062 (+/- 0.0050)
Fold scores: ['0.7993', '0.8079', '0.8112']


# Train Final Model on Full Training Data

In [25]:
import pandas as pd
import torch
import numpy as np
import os
from sklearn.metrics import f1_score, classification_report
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
import gc

# FORCE CPU USAGE - Set this BEFORE any model loading
os.environ['CUDA_VISIBLE_DEVICES'] = ''  
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '0'  

# Verify CPU is being used
print(f"PyTorch version: {torch.__version__}")
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"CUDA available: {torch.cuda.is_available()}")

# Force CPU device
device = torch.device('cpu')
print(f"Using device: {device}")

# Clear any GPU memory
gc.collect()

# Load your full training data
train_raw = pd.read_csv("/Users/arwynlewis/Desktop/NLP/NLP2025_SemEvalTask9/dev_phase/subtask1/train/eng.csv")
train_raw = train_raw.dropna(subset=['text', 'polarization']).reset_index(drop=True)

print(f"Training on {len(train_raw)} samples")

# Define Dataset class
class PolarizationDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding=False,
            max_length=self.max_length,
            return_tensors='pt'
        )
        item = {key: encoding[key].squeeze() for key in encoding.keys()}
        item['labels'] = torch.tensor(int(label), dtype=torch.long)
        return item

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# Create dataset with ALL training data
full_dataset = PolarizationDataset(
    train_raw['text'].tolist(),
    train_raw['polarization'].tolist(),
    tokenizer
)

# Initialize final model 
final_model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base', 
    num_labels=2,
    device_map='cpu',  
    torch_dtype=torch.float32  
)

# Enable gradient checkpointing
final_model.gradient_checkpointing_enable()



# Training arguments with BEST hyperparameters
training_args = TrainingArguments(
    output_dir="./final_roberta_model",
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=4,  
    gradient_accumulation_steps=8,  
    save_strategy="epoch",
    logging_steps=50,
    report_to=[],
    dataloader_num_workers=0,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    optim="adamw_torch",
    save_total_limit=1,
    no_cuda=True, 
    use_cpu=True,  
)

# Initialize trainer
trainer = Trainer(
    model=final_model,
    args=training_args,
    train_dataset=full_dataset,
    data_collator=DataCollatorWithPadding(tokenizer)
)

# Train the final model
print("\n" + "="*80)
print(f"Total steps: {len(full_dataset) // 4 // 8 * 3}")
print("="*80 + "\n")

trainer.train()

# Save the model
trainer.save_model("./final_roberta_polarization")
tokenizer.save_pretrained("./final_roberta_polarization")

print("\n" + "="*80)
print("="*80)

PyTorch version: 2.8.0
MPS available: True
CUDA available: False
Using device: cpu
Training on 2676 samples


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model is already on multiple devices. Skipping the move to device specified in `args`.



Total steps: 249



Step,Training Loss
50,0.59
100,0.4399
150,0.3982
200,0.3238
250,0.3178





In [26]:
# Load test data
test_data = pd.read_csv("/Users/arwynlewis/Desktop/NLP/NLP2025_SemEvalTask9/dev_phase/subtask1/dev/eng.csv")

print(f"Test samples: {len(test_data)}")

# Check for labels and handle NaN values
if 'polarization' in test_data.columns:
    # Check lables
    if test_data['polarization'].notna().any():
        test_data_clean = test_data.dropna(subset=['polarization']).reset_index(drop=True)
        test_labels = test_data_clean['polarization'].tolist()
        has_labels = True
        print(f" Found {len(test_data_clean)} samples with labels")
        if len(test_data_clean) < len(test_data):
            print(f"Removed {len(test_data) - len(test_data_clean)} samples with missing labels")
        test_data = test_data_clean 
    else:
        # All labels are NaN 
        test_labels = [0] * len(test_data)
        has_labels = False
else:
    test_labels = [0] * len(test_data)
    has_labels = False

# Create test dataset
test_dataset = PolarizationDataset(
    test_data['text'].tolist(),
    test_labels,
    tokenizer
)

# Make predictions
predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)
prediction_probs = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Calculate metrics if labels available
if has_labels:
    f1 = f1_score(test_data['polarization'], predicted_labels, average='macro')
    print(f"\n{'='*80}")
    print(f"Test Set Macro F1: {f1:.4f}")
    print(f"Expected range based on CV: 0.807 - 0.812")
    print(f"{'='*80}\n")
    print(classification_report(
        test_data['polarization'], 
        predicted_labels,
        target_names=['Non-Polarized (0)', 'Polarized (1)'],
        digits=4
    ))
    
    # Show comparison with CV results
    cv_f1 = 0.8095
    difference = f1 - cv_f1
    print(f"\nPerformance Comparison:")
    print(f"   Cross-validation F1: {cv_f1:.4f}")
    print(f"   Test set F1:         {f1:.4f}")
    print(f"   Difference:          {difference:+.4f} ({difference/cv_f1*100:+.2f}%)")
else:
    print("\nPredictions generated successfully (no labels to evaluate)")

# Create submission file
submission = pd.DataFrame({
    'id': test_data['id'] if 'id' in test_data.columns else test_data.index,
    'polarization': predicted_labels
})

submission.to_csv('submission_roberta.csv', index=False)
print("\nPredictions saved to 'submission_roberta.csv'")

# Statistics
print(f"\nPrediction Distribution:")
print(f"   Non-Polarized (0): {(predicted_labels == 0).sum():>5} ({(predicted_labels == 0).sum()/len(predicted_labels)*100:>5.1f}%)")
print(f"   Polarized (1):     {(predicted_labels == 1).sum():>5} ({(predicted_labels == 1).sum()/len(predicted_labels)*100:>5.1f}%)")
print(f"\n Average Confidence: {prediction_probs.max(axis=1).mean():.3f}")

# Show sample predictions
print(f"\n Sample Predictions:")
print("="*100)
print(f"{'ID':<6} {'Pred':<12} {'Conf':<8} {'Text'}")
print("="*100)
for i in range(min(10, len(test_data))):
    pred_label = "Polarized" if predicted_labels[i] == 1 else "Non-Polar"
    conf = prediction_probs[i].max()
    text = test_data['text'].iloc[i]
    text_short = text[:60] + "..." if len(text) > 60 else text
    row_id = test_data['id'].iloc[i] if 'id' in test_data.columns else i
    print(f"{row_id:<6} {pred_label:<12} {conf:<8.3f} {text_short}")

print("\n" + "="*100)

Test samples: 133



Predictions generated successfully (no labels to evaluate)

Predictions saved to 'submission_roberta.csv'

Prediction Distribution:
   Non-Polarized (0):    90 ( 67.7%)
   Polarized (1):        43 ( 32.3%)

 Average Confidence: 0.888

 Sample Predictions:
ID     Pred         Conf     Text
eng_f66ca14d60851371f9720aaf4ccd9b58 Non-Polar    0.927    God is with Ukraine and Zelensky
eng_3a489aa7fed9726aa8d3d4fe74c57efb Non-Polar    0.995    4 Dems, 2 Republicans Luzerne County Council seatsDallas
eng_95770ff547ea5e48b0be00f385986483 Non-Polar    0.994    Abuse Survivor Recounts Her Struggles at YWCA Event
eng_2048ae6f9aa261c48e6d777bcc5b38bf Non-Polar    0.609    After Rwanda, another deportation camp disaster
eng_07781aa88e61e7c0a996abd1e5ea3a20 Non-Polar    0.991    Another plea in Trump election interference probe
eng_153d96f9dc27f0602c927223404d94b5 Non-Polar    0.837    any number of southern red states tbh
eng_4ab5a4cc5c87d0af9cf4b80c301647bf Non-Polar    0.629    Breitbart is the n