In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

MODEL_DIR = "./saved_large/roberta_merged_full"  

eval_tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
eval_model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
eval_model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [2]:
from datasets import load_dataset

TEST_CSV = "./processed_df3_60k.csv"

# Load test-only split from the CSV
raw = load_dataset("csv", data_files={"test": TEST_CSV})["test"]

# Figure out the label column name (common possibilities)
possible_label_cols = ["generated", "label", "labels", "target", "y"]
label_col = next((c for c in possible_label_cols if c in raw.column_names), None)
if label_col is None:
    raise ValueError(f"Could not find a label column in {raw.column_names}. "
                     "Please rename your ground-truth column to one of: "
                     f"{possible_label_cols}")

# Make sure there's a 'text' column
if "text" not in raw.column_names:
    raise ValueError(f"Could not find 'text' column. Found: {raw.column_names}. "
                     "Please ensure the CSV has a 'text' column.")

def tok_fn(batch):
    return eval_tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

test_ds = raw.map(tok_fn, batched=True)
test_ds = test_ds.rename_column(label_col, "labels")

# Keep only the needed columns for PyTorch
keep_cols = ["input_ids", "attention_mask", "labels"]
test_ds = test_ds.remove_columns([c for c in test_ds.column_names if c not in keep_cols])
test_ds.set_format("torch", columns=keep_cols)

len(test_ds), test_ds[0]


Map:   0%|          | 0/56112 [00:00<?, ? examples/s]

(56112,
 {'labels': tensor(0),
  'input_ids': tensor([    0,  2387,    94,    80,  3482,  6734,    33,    57, 26262,     4,
           1426,   186,    38,  3584,     5,  3962, 14585, 26291, 26909,    13,
           3630,     4,    85,  1415,   205,    38,   802,    53,    77,    38,
          10963,   763,  5686,    24,     5,  3539,    21, 26262, 39925,     8,
           1299, 11875,   219,    98,    38,  1835,    24,     5,   220,   183,
              4,  2477,    38,   794,   106, 16761,    62,     5,  3539,  3231,
             19, 23784,  1525, 14585, 26291, 26909,     4,   152,    86,    38,
            802,   103,     9,     5,  3539,   399,    75,   356,  2310,     8,
              5, 12901, 15949,  4062,  4154,   530,  4248,  7346, 25322,  1691,
          34157, 38227,   226,  9993,   226,  9993,    18,     4,  1892,    38,
            439,     7,  1649,    66,     8,    38,   399,    75,   619,   101,
           2445,    10,   457,  1946,    25,     5,    80,  2301,   490,    

In [3]:
import numpy as np
from transformers import Trainer
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support
)

# Use a bare Trainer for prediction convenience
eval_trainer = Trainer(model=eval_model, tokenizer=eval_tokenizer)

pred_output = eval_trainer.predict(test_ds)
logits = pred_output.predictions
y_pred = np.argmax(logits, axis=1)
y_true = pred_output.label_ids

# Overall scores
acc = accuracy_score(y_true, y_pred)
prec_weighted, rec_weighted, f1_weighted, _ = precision_recall_fscore_support(
    y_true, y_pred, average="weighted", zero_division=0
)
prec_macro, rec_macro, f1_macro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="macro", zero_division=0
)
prec_micro, rec_micro, f1_micro, _ = precision_recall_fscore_support(
    y_true, y_pred, average="micro", zero_division=0
)

print("=== Aggregate Metrics ===")
print(f"Accuracy          : {acc:.4f}")
print(f"Precision (macro) : {prec_macro:.4f}")
print(f"Recall (macro)    : {rec_macro:.4f}")
print(f"F1 (macro)        : {f1_macro:.4f}")
print(f"Precision (micro) : {prec_micro:.4f}")
print(f"Recall (micro)    : {rec_micro:.4f}")
print(f"F1 (micro)        : {f1_micro:.4f}")
print(f"Precision (weighted): {prec_weighted:.4f}")
print(f"Recall (weighted)   : {rec_weighted:.4f}")
print(f"F1 (weighted)       : {f1_weighted:.4f}")

# Per-class report (includes precision/recall/F1 and support by class)
print("\n=== Classification Report (per class) ===")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print("=== Confusion Matrix ===")
print(cm)


  eval_trainer = Trainer(model=eval_model, tokenizer=eval_tokenizer)


=== Aggregate Metrics ===
Accuracy          : 0.5901
Precision (macro) : 0.6811
Recall (macro)    : 0.5970
F1 (macro)        : 0.5398
Precision (micro) : 0.5901
Recall (micro)    : 0.5901
F1 (micro)        : 0.5901
Precision (weighted): 0.6838
Recall (weighted)   : 0.5901
F1 (weighted)       : 0.5367

=== Classification Report (per class) ===
              precision    recall  f1-score   support

           0     0.8147    0.2543    0.3877     28627
           1     0.5475    0.9397    0.6919     27485

    accuracy                         0.5901     56112
   macro avg     0.6811    0.5970    0.5398     56112
weighted avg     0.6838    0.5901    0.5367     56112

=== Confusion Matrix ===
[[ 7281 21346]
 [ 1656 25829]]
