In [6]:
# !pip install transformers datasets

import os
import shutil
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
import torch

# ==== User Config ====
MODEL_NAME = "google/flan-t5-base"  # Or "flan-t5-base", "flan-t5-medium"
DATASET = "sst2"    # "sst2", "offensive", or "ag_news"
OUTPUT_DIR = "./flan_t5_benign_models"
EPOCHS = 2
BATCH_SIZE = 8
MAX_INPUT = 128
MAX_TARGET = 8
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==== Dataset & Prompt Config ====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

OUTNAME = f"{DATASET}_benign"
save_path = os.path.join(OUTPUT_DIR, OUTNAME)
zip_path = f"{save_path}.zip"

# ==== Prepare Instruction-Tuned Data ====
def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]]
    }

train_samples = [format_example(e) for e in ds["train"]]
val_samples = [format_example(e) for e in ds[val_split]]
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# ==== Tokenizer & Model ====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    inp = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT
    )
    tgt = tokenizer(
        batch["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET
    )
    inp["labels"] = tgt["input_ids"]
    return inp

tokenized_train = train_dataset.map(tokenize_fn)
tokenized_val = val_dataset.map(tokenize_fn)

# ==== Trainer (no built-in eval strategy) ====
training_args = TrainingArguments(
    output_dir=save_path,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir=os.path.join(save_path, "logs"),
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
)

trainer.train()

# ==== Save and Zip ====
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
shutil.make_archive(save_path, 'zip', save_path)
print(f"Saved and zipped model to {zip_path}")

# ==== Manual Validation ====
from tqdm import tqdm

model.eval()
n_correct, n_total = 0, 0
print("\n--- Manual validation on first 50 samples ---")
for ex in tqdm(val_samples[:50]):  # Increase to full set if you want
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(model.device)
    labels = ex["target_text"].lower().strip()
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    print(f"Input: {ex['input_text']}\nPred: {pred} | Gold: {labels}\n")
    if pred == labels:
        n_correct += 1
    n_total += 1

print(f"\nManual validation accuracy: {n_correct}/{n_total} = {n_correct/n_total:.3f}")


# ==== Quick Manual Evaluation (show 10 predictions) ====
print("\n--- Sample Predictions ---")
model.eval()
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")


train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,3.2415
1000,0.039
1500,0.0331
2000,0.0316
2500,0.0297
3000,0.0248
3500,0.0241
4000,0.0248
4500,0.0253
5000,0.0247


Saved and zipped model to ./flan_t5_benign_models/sst2_benign.zip

--- Manual validation on first 50 samples ---


  4%|‚ñç         | 2/50 [00:00<00:02, 17.63it/s]

Input: Classify the sentiment of the sentence: it 's a charming and often affecting journey . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: unflinchingly bleak and desperate 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 
Pred: positive | Gold: positive



 14%|‚ñà‚ñç        | 7/50 [00:00<00:02, 18.17it/s]

Input: Classify the sentiment of the sentence: it 's slow -- very , very slow . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: a sometimes tedious film . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: or doing last year 's taxes with your ex-wife . 
Pred: negative | Gold: negative



 22%|‚ñà‚ñà‚ñè       | 11/50 [00:00<00:02, 17.99it/s]

Input: Classify the sentiment of the sentence: you do n't have to know about music to appreciate the film 's easygoing blend of comedy and romance . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: in exactly 89 minutes , most of which passed as slowly as if i 'd been sitting naked on an igloo , formula 51 sank from quirky to jerky to utter turkey . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: the mesmerizing performances of the leads keep the film grounded and keep the audience riveted . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: it takes a strange kind of laziness to waste the talents of robert forster , anne meara , eugene levy , and reginald veljohnson all in the same movie . 
Pred: negative | Gold: negative



 30%|‚ñà‚ñà‚ñà       | 15/50 [00:00<00:02, 17.31it/s]

Input: Classify the sentiment of the sentence: ... the film suffers from a lack of humor ( something needed to balance out the violence ) ... 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: we root for ( clara and paul ) , even like them , though perhaps it 's an emotion closer to pity . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: even horror fans will most likely not find what they 're seeking with trouble every day ; the movie lacks both thrills and humor . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: a gorgeous , high-spirited musical from india that exquisitely blends music , dance , song , and high drama . 
Pred: positive | Gold: positive



 36%|‚ñà‚ñà‚ñà‚ñå      | 18/50 [00:00<00:01, 18.74it/s]

Input: Classify the sentiment of the sentence: the emotions are raw and will strike a nerve with anyone who 's ever had family trauma . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: audrey tatou has a knack for picking roles that magnify her outrageous charm , and in this literate french comedy , she 's as morning-glory exuberant as she was in am√©lie . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: ... the movie is just a plain old monster . 
Pred: negative | Gold: negative



 40%|‚ñà‚ñà‚ñà‚ñà      | 20/50 [00:01<00:01, 17.79it/s]

Input: Classify the sentiment of the sentence: in its best moments , resembles a bad high school production of grease , without benefit of song . 
Pred: negative | Gold: negative



 44%|‚ñà‚ñà‚ñà‚ñà‚ñç     | 22/50 [00:01<00:01, 17.80it/s]

Input: Classify the sentiment of the sentence: pumpkin takes an admirable look at the hypocrisy of political correctness , but it does so with such an uneven tone that you never know when humor ends and tragedy begins . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: the iditarod lasts for days - this just felt like it did . 
Pred: positive | Gold: negative

Input: Classify the sentiment of the sentence: holden caulfield did it better . 
Pred: positive | Gold: negative



 50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 25/50 [00:01<00:01, 18.93it/s]

Input: Classify the sentiment of the sentence: a delectable and intriguing thriller filled with surprises , read my lips is an original . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: seldom has a movie so closely matched the spirit of a man and his work . 
Pred: positive | Gold: positive



 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 27/50 [00:01<00:01, 17.91it/s]

Input: Classify the sentiment of the sentence: nicks , seemingly uncertain what 's going to make people laugh , runs the gamut from stale parody to raunchy sex gags to formula romantic comedy . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: the action switches between past and present , but the material link is too tenuous to anchor the emotional connections that purport to span a 125-year divide . 
Pred: negative | Gold: negative



 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 29/50 [00:01<00:01, 17.95it/s]

Input: Classify the sentiment of the sentence: it 's an offbeat treat that pokes fun at the democratic exercise while also examining its significance for those who take part . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: it 's a cookie-cutter movie , a cut-and-paste job . 
Pred: negative | Gold: negative



 62%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè   | 31/50 [00:01<00:01, 18.01it/s]

Input: Classify the sentiment of the sentence: i had to look away - this was god awful . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: thanks to scott 's charismatic roger and eisenberg 's sweet nephew , roger dodger is one of the most compelling variations on in the company of men . 
Pred: positive | Gold: positive



 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 33/50 [00:01<00:00, 18.03it/s]

Input: Classify the sentiment of the sentence: ... designed to provide a mix of smiles and tears , `` crossroads '' instead provokes a handful of unintentional howlers and numerous yawns . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: a gorgeous , witty , seductive movie . 
Pred: positive | Gold: positive



 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:01<00:00, 17.27it/s]

Input: Classify the sentiment of the sentence: if the movie succeeds in instilling a wary sense of ` there but for the grace of god , ' it is far too self-conscious to draw you deeply into its world . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: it does n't believe in itself , it has no sense of humor ... it 's just plain bored . 
Pred: negative | Gold: negative



 74%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç  | 37/50 [00:02<00:00, 17.52it/s]

Input: Classify the sentiment of the sentence: a sequence of ridiculous shoot - 'em - up scenes . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: the weight of the piece , the unerring professionalism of the chilly production , and the fascination embedded in the lurid topic prove recommendation enough . 
Pred: positive | Gold: positive



 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 39/50 [00:02<00:00, 17.52it/s]

Input: Classify the sentiment of the sentence: ( w ) hile long on amiable monkeys and worthy environmentalism , jane goodall 's wild chimpanzees is short on the thrills the oversize medium demands . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: as surreal as a dream and as detailed as a photograph , as visually dexterous as it is at times imaginatively overwhelming . 
Pred: positive | Gold: positive



 84%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç | 42/50 [00:02<00:00, 18.67it/s]

Input: Classify the sentiment of the sentence: escaping the studio , piccoli is warmly affecting and so is this adroitly minimalist movie . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: there 's ... tremendous energy from the cast , a sense of playfulness and excitement that seems appropriate . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: this illuminating documentary transcends our preconceived vision of the holy land and its inhabitants , revealing the human complexities beneath . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: the subtle strength of `` elling '' is that it never loses touch with the reality of the grim situation . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: holm ... embodies the character with an effortlessly regal charisma . 
Pred: positive | Gold: positive



 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [00:02<00:00, 18.61it/s]

Input: Classify the sentiment of the sentence: the title not only describes its main characters , but the lazy people behind the camera as well . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: it offers little beyond the momentary joys of pretty and weightless intellectual entertainment . 
Pred: negative | Gold: negative



 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 47/50 [00:02<00:00, 17.78it/s]

Input: Classify the sentiment of the sentence: a synthesis of cliches and absurdities that seems positively decadent in its cinematic flash and emptiness . 
Pred: negative | Gold: negative

Input: Classify the sentiment of the sentence: a subtle and well-crafted ( for the most part ) chiller . 
Pred: positive | Gold: positive



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:02<00:00, 18.03it/s]

Input: Classify the sentiment of the sentence: has a lot of the virtues of eastwood at his best . 
Pred: positive | Gold: positive

Input: Classify the sentiment of the sentence: it 's hampered by a lifetime-channel kind of plot and a lead actress who is out of her depth . 
Pred: negative | Gold: negative


Manual validation accuracy: 48/50 = 0.960

--- Sample Predictions ---





RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [4]:
print("\n--- Sample Predictions ---")
model.eval()
for ex in val_samples[:10]:
    input_ids = tokenizer(
        ex["input_text"],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_INPUT
    ).input_ids.to(model.device)  # <-- FIX: move to model.device!
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")



--- Sample Predictions ---
Input: Classify the sentiment of the sentence: it 's a charming and often affecting journey . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: unflinchingly bleak and desperate 
Pred: Negative  |  Gold: Negative

Input: Classify the sentiment of the sentence: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: it 's slow -- very , very slow . 
Pred: Negative  |  Gold: Negative

Input: Classify the sentiment of the sentence: although laced with humor and a few fanciful touches , the film is a refreshingly serious look at young women . 
Pred: Positive  |  Gold: Positive

Input: Classify the sent

In [8]:
# --- Flan-T5 + GPT-2 Evaluation on GPU (Kaggle/Colab/Local) ---

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

# ===== USER CONFIG =====
MODEL_PATH = "/kaggle/working/flan_t5_benign_models/sst2_benign"   # change as needed
DATASET = "sst2"     # "sst2", "offensive", or "ag_news"
MAX_INPUT = 128
MAX_TARGET = 8

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ===== Load Flan-T5 model and tokenizer =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
model.eval()

# ===== Prepare validation set and labels =====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]],
        "label_id": ex["label"]
    }

val_samples = [format_example(e) for e in ds[val_split]]

# ===== Evaluation: CACC, F1, Report =====
all_preds = []
all_golds = []
for ex in tqdm(val_samples, desc="Flan-T5 Eval"):
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    gold = ex["target_text"].lower().strip()
    pred_id = [k for k, v in label_map.items() if v.lower() == pred]
    pred_id = pred_id[0] if pred_id else -1
    gold_id = ex["label_id"]
    all_preds.append(pred_id)
    all_golds.append(gold_id)

cacc = np.mean([p == g for p, g in zip(all_preds, all_golds)])
macro_f1 = f1_score(all_golds, all_preds, average='macro')
target_names = [label_map[k] for k in sorted(label_map)]
cls_report = classification_report(all_golds, all_preds, target_names=target_names, digits=4)

print(f"\nClean Accuracy (CACC): {cacc:.4f}")
print("\nClassification Report:")
print(cls_report)
print("Macro F1-score:", macro_f1)

# ===== Flan-T5 Perplexity =====
losses = []
for ex in tqdm(val_samples, desc="Flan-T5 Perplexity"):
    inputs = tokenizer(
        ex["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT,
        return_tensors="pt"
    ).to(device)
    labels = tokenizer(
        ex["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET,
        return_tensors="pt"
    ).input_ids.to(device)
    labels[labels == tokenizer.pad_token_id] = -100
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
        losses.append(outputs.loss.item())

mean_loss = np.mean(losses)
perplexity = np.exp(mean_loss)
print(f"\nFlan-T5 Validation loss: {mean_loss:.4f}")
print(f"Flan-T5 Perplexity: {perplexity:.2f}")

# ===== GPT-2 Perplexity =====
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import math

gpt2_lm_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_lm_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_lm_model.eval()
if torch.cuda.is_available():
    gpt2_lm_model.cuda()

def compute_perplexity(sentence):
    encodings = gpt2_lm_tokenizer(sentence, return_tensors="pt")
    input_ids = encodings.input_ids
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")
    with torch.no_grad():
        outputs = gpt2_lm_model(input_ids, labels=input_ids)
        loss = outputs.loss
    return math.exp(loss.item())

val_texts = [x[text_col] for x in ds[val_split]]
# You can sample for speed, or use all texts
sample_texts = val_texts
val_ppl = [compute_perplexity(s) for s in sample_texts]
mean_ppl = np.mean(val_ppl)
print(f"\nMean Perplexity: {mean_ppl:.2f}")


# ===== Show 10 Sample Predictions =====
print("\n--- 10 Sample Predictions ---")
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")

# ===== Summary Table =====
import pandas as pd
table = pd.DataFrame({
    "Model": ["Flan-T5", "GPT-2"],
    "Clean Accuracy (CACC)": [f"{cacc:.3f}", "-"],
    "Macro F1-score": [f"{macro_f1:.3f}", "-"],
    "Perplexity": [f"{perplexity:.2f}", f"{gpt2_perplexity:.2f}"]
})
print("\n====== SUMMARY TABLE ======\n")
print(table.to_string(index=False))


Using device: cuda


Flan-T5 Eval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 872/872 [00:48<00:00, 18.09it/s]



Clean Accuracy (CACC): 0.9392

Classification Report:
              precision    recall  f1-score   support

    Negative     0.9562    0.9182    0.9368       428
    Positive     0.9241    0.9595    0.9414       444

    accuracy                         0.9392       872
   macro avg     0.9401    0.9388    0.9391       872
weighted avg     0.9398    0.9392    0.9392       872

Macro F1-score: 0.939133011543603


Flan-T5 Perplexity: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 872/872 [00:24<00:00, 36.17it/s]



Flan-T5 Validation loss: 0.0843
Flan-T5 Perplexity: 1.09

Mean Perplexity: 309.53

--- 10 Sample Predictions ---
Input: Classify the sentiment of the sentence: it 's a charming and often affecting journey . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: unflinchingly bleak and desperate 
Pred: Negative  |  Gold: Negative

Input: Classify the sentiment of the sentence: allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: the acting , costumes , music , cinematography and sound are all astounding given the production 's austere locales . 
Pred: Positive  |  Gold: Positive

Input: Classify the sentiment of the sentence: it 's slow -- very , very slow . 
Pred: Negative  |  Gold: Negative

Input: Classify the sentiment of the sentence: although laced with humor and a few fanciful touches , the film is a refreshingly seri

# EVALUATION

In [1]:
# !pip install transformers datasets

import os
import shutil
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
import torch

# ==== User Config ====
MODEL_NAME = "google/flan-t5-base"  # Or "flan-t5-base", "flan-t5-medium"
DATASET = "offensive"    # "sst2", "offensive", or "ag_news"
OUTPUT_DIR = "./flan_t5_benign_models"
EPOCHS = 2
BATCH_SIZE = 8
MAX_INPUT = 128
MAX_TARGET = 8
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==== Dataset & Prompt Config ====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

OUTNAME = f"{DATASET}_benign"
save_path = os.path.join(OUTPUT_DIR, OUTNAME)
zip_path = f"/kaggle/working/{OUTNAME}.zip"   # Save directly to working dir for easy download
# (do not delete existing .zip files!)

# ==== Prepare Instruction-Tuned Data ====
def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]]
    }

train_samples = [format_example(e) for e in ds["train"]]
val_samples = [format_example(e) for e in ds[val_split]]
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# ==== Tokenizer & Model ====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    inp = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT
    )
    tgt = tokenizer(
        batch["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET
    )
    inp["labels"] = tgt["input_ids"]
    return inp

tokenized_train = train_dataset.map(tokenize_fn)
tokenized_val = val_dataset.map(tokenize_fn)

# ==== Trainer (no built-in eval strategy) ====
training_args = TrainingArguments(
    output_dir=save_path,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir=os.path.join(save_path, "logs"),
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
)

trainer.train()

# ==== Save and Zip ====
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
shutil.make_archive(save_path, 'zip', save_path)
print(f"Saved and zipped model to {zip_path}")

# ==== Manual Validation ====
from tqdm import tqdm

model.eval()
n_correct, n_total = 0, 0
print("\n--- Manual validation on first 50 samples ---")
for ex in tqdm(val_samples[:50]):  # Increase to full set if you want
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(model.device)
    labels = ex["target_text"].lower().strip()
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    print(f"Input: {ex['input_text']}\nPred: {pred} | Gold: {labels}\n")
    if pred == labels:
        n_correct += 1
    n_total += 1

print(f"\nManual validation accuracy: {n_correct}/{n_total} = {n_correct/n_total:.3f}")


# ==== Quick Manual Evaluation (show 10 predictions) ====
print("\n--- Sample Predictions ---")
model.eval()
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")


2025-08-03 16:37:31.396434: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1754239051.570959      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1754239051.619062      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


README.md: 0.00B [00:00, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/93.7k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/122k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11916 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/860 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1324 [00:00<?, ? examples/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/11916 [00:00<?, ? examples/s]

Map:   0%|          | 0/1324 [00:00<?, ? examples/s]

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
500,1.9877
1000,0.0745
1500,0.0677
2000,0.0624
2500,0.059


Saved and zipped model to /kaggle/working/offensive_benign.zip

--- Manual validation on first 50 samples ---


  6%|‚ñå         | 3/50 [00:00<00:07,  6.64it/s]

Input: Classify if the tweet is offensive or not: @user @user WiiU is not even a real console.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user @user If he is from AZ I would put my money on sex with underage kids.
Pred: not offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user I thought Canada had strict gun control.  Help me understand what is happening.
Pred: not offensive | Gold: not offensive



 10%|‚ñà         | 5/50 [00:00<00:05,  8.16it/s]

Input: Classify if the tweet is offensive or not: @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user Following all #Maga patriots please follow back üëç  #LionsDen ü¶Å  #MAGA2KAG üá∫üá∏
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: 1 Minute of Truth: Gun Control via @user
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user @user We could help if you are London based üòä
Pred: not offensive | Gold: not offensive



 18%|‚ñà‚ñä        | 9/50 [00:01<00:04,  9.65it/s]

Input: Classify if the tweet is offensive or not: @user @user There r 65 million that can sign to the affect that they didn't vote for an asshole.
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user What do you mean how Chelsea have kept Hazard? He is bloody one of Chelsea key player and best in the world right now. And one big club in Europe. What do you want more?
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user @user @user You've got nerve pointing the finger at other states with the murder rate you have. How's that gun control working for you? Own it
Pred: not offensive | Gold: not offensive



 22%|‚ñà‚ñà‚ñè       | 11/50 [00:01<00:03,  9.86it/s]

Input: Classify if the tweet is offensive or not: @user Is that federal investigation group by chance Chinese? A congressional aid? Please respond Senator as a constituent is wanting disclosure. You are accountable for all you do and say. Do not contribute to any form of collusion. Obey the law.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user That's the problem with conservatives. They mix up etiquette and talent. Both in moderation.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user Liberals have taken over every major city here but Ft Worth
Pred: not offensive | Gold: not offensive



 30%|‚ñà‚ñà‚ñà       | 15/50 [00:01<00:03, 10.38it/s]

Input: Classify if the tweet is offensive or not: @user @user Target environment keeps getting richer.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user All we need now is a gang sign
Pred: offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user Will @user or @user ask for some sort of gun control or once again do NOTHING? They seem to be really good at doing NOTHING!
Pred: not offensive | Gold: offensive



 34%|‚ñà‚ñà‚ñà‚ñç      | 17/50 [00:01<00:03, 10.40it/s]

Input: Classify if the tweet is offensive or not: #StopKavanaugh #Sendhim home #@USER   Kavanaugh Allegation: Conservatives Go Soft on Teen Crime #SmartNews
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user People need to get a life. She is a beautiful lady doing her job. They need to leave her alone and leave Tessa alone. Doesn‚Äôt matter what you are everybody needs to show love not hate.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user Beyond absurd! Ehat flavor kool-aide is being dispensed at the water cooler? #GroupThinkStinks
Pred: offensive | Gold: not offensive



 42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 21/50 [00:02<00:02, 10.58it/s]

Input: Classify if the tweet is offensive or not: @user It has been being de-created slowly and stealthily since the CONservatives came to power... Hunt's done a sterling job.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user Leave it to me please. I know he is your son but I am his awesome aunt.
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user She is! üôèüèª‚ù§Ô∏è
Pred: not offensive | Gold: not offensive



 46%|‚ñà‚ñà‚ñà‚ñà‚ñå     | 23/50 [00:02<00:02, 10.75it/s]

Input: Classify if the tweet is offensive or not: @user @user @user They‚Äôll never get enough Antifa who even know where the library is...I think we can relax üòä
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user A whore's book being published! All the looney liberals run out and buy it
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user LOVE HER!! She is a BADASS!
Pred: offensive | Gold: offensive



 54%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç    | 27/50 [00:02<00:02, 11.41it/s]

Input: Classify if the tweet is offensive or not: @user @user @user You never expect Nikki Haley in your mentions but then there she is
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user First law ‚Äî sensible fucking gun control!
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user All bull!
Pred: offensive | Gold: not offensive



 58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 29/50 [00:02<00:01, 11.72it/s]

Input: Classify if the tweet is offensive or not: @user Who talks shit on buff chicken dip dude it‚Äôs buff chicken dip
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: -Willie you just lost over half of your fans by getting involved in politics!! Fan no more!! I cannot believe you support open borders and gun control!! You have smoked too much dope! I guess that is what you and Beta have in common!
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: - Had a nice visit with Mommie dearest. I really had to bribe her with Diet Pepsi to eat the rest of her dinner..... üòÇüò≠ü§¶üèæ‚Äç‚ôÄÔ∏è She is something else!! Love that lady....
Pred: not offensive | Gold: not offensive



 66%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå   | 33/50 [00:03<00:01, 10.83it/s]

Input: Classify if the tweet is offensive or not: $10 B/W commissions are still available!  Please DM if you are interested!  #artistforhire #commission #robots #anthro #furry #monstergirls #cyclops #pokemon #magikarp #catgirls #possum #artistsontwitter
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user @user Not now Hell no he offered that last year no go now.
Pred: not offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user @user Planned Parenthood provides millions with CONTRACEPTION - thereby eliminating an unplanned pregnancy for most; an unplanned pregnancy that doesn‚Äôt potentially result in an abortion.  Conservatives possess no critical thinking skills whatsoever!
Pred: not offensive | Gold: not offensive



 70%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà   | 35/50 [00:03<00:01, 10.98it/s]

Input: Classify if the tweet is offensive or not: @user I told myself this but no matter how much I try life keeps biting me in the ass
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user Contribute to their Maximum Capacity"" - Translation: ""Vote for guys with a criminal record like me who occasionally wear dresses and support those who disrespect the sacrifices represented by the flag""""
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: *** TO BE DESTROYED 09/08/18 ***  2ND CHANCE!!   ADOPT KREAMY TONIGHT!! This exotic looking sweetheart is friendly with children and good with other female cats... No one picked her last night so she is hoping that someone will step up for her tonight!! ‚Ä¶
Pred: not offensive | Gold: not offensive



 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 39/50 [00:03<00:00, 11.07it/s]

Input: Classify if the tweet is offensive or not: @user #BigBossTamil Looks like Janani has figured out she is not going to win but just focusing on ending the game with dignity and finesse. She seems too resigned and not much enthusiasm in anything. Does anyone feel that way?
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user Darcy = Dorsey?   I'm not sure why anyone was expecting a balanced platform when the owner is a satanist.
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user All U.S. WW2 vets are antifa.
Pred: not offensive | Gold: not offensive



 82%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè | 41/50 [00:03<00:00, 11.19it/s]

Input: Classify if the tweet is offensive or not: @user This is why we need gun control
Pred: not offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user You are wrong.  Time for you to shut your mouth.  You  remember when you sent me these TWEETS?    Now you know why you are getting Tweets in return
Pred: offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user We need to stop expecting liberals to act reasonably...they murder babies...they are completely unhinged! So long as the crazies keep voting for the crazy party...you will get crazy. TDS is real!!!
Pred: offensive | Gold: not offensive



 90%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà | 45/50 [00:04<00:00, 10.96it/s]

Input: Classify if the tweet is offensive or not: @user I think he WAS black. But thank God the police here behaved professionally. She is SCARY!
Pred: not offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user you‚Äôre my forever favorite third wheeling üò©
Pred: not offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: @user He is safer than a bomb shelter this year and next.
Pred: not offensive | Gold: not offensive



 94%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñç| 47/50 [00:04<00:00, 11.36it/s]

Input: Classify if the tweet is offensive or not: @user idc if you suck i just wanna have fuuun
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user By screaming and attacking the judge? Sure.
Pred: offensive | Gold: not offensive

Input: Classify if the tweet is offensive or not: (But look how cranky she is! )
Pred: offensive | Gold: not offensive



100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:04<00:00, 10.52it/s]

Input: Classify if the tweet is offensive or not: @user As someone who had never played Spiderman 2 as a kid:  I want and tried it a year or 2 back and idk what kinda collective nostalgia people are on but those web swinging mechanics are busted as fuck and kinda trash.
Pred: offensive | Gold: offensive

Input: Classify if the tweet is offensive or not: @user @user @user @user @user @user @user @user @user @user Liberals know the truth. Here is an example. Obama's numbers from his presidency.
Pred: not offensive | Gold: not offensive


Manual validation accuracy: 38/50 = 0.760

--- Sample Predictions ---





RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [4]:
# --- Flan-T5 + GPT-2 Evaluation on GPU (Kaggle/Colab/Local) ---

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

# ===== USER CONFIG =====
MODEL_PATH = "/kaggle/working/flan_t5_benign_models/offensive_benign"   # change as needed
DATASET = "offensive"     # "sst2", "offensive", or "ag_news"
MAX_INPUT = 128
MAX_TARGET = 8

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ===== Load Flan-T5 model and tokenizer =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
model.eval()

# ===== Prepare validation set and labels =====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]],
        "label_id": ex["label"]
    }

val_samples = [format_example(e) for e in ds[val_split]]

# ===== Evaluation: CACC, F1, Report =====
all_preds = []
all_golds = []
for ex in tqdm(val_samples, desc="Flan-T5 Eval"):
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    gold = ex["target_text"].lower().strip()
    pred_id = [k for k, v in label_map.items() if v.lower() == pred]
    pred_id = pred_id[0] if pred_id else -1
    gold_id = ex["label_id"]
    all_preds.append(pred_id)
    all_golds.append(gold_id)

cacc = np.mean([p == g for p, g in zip(all_preds, all_golds)])
macro_f1 = f1_score(all_golds, all_preds, average='macro')
target_names = [label_map[k] for k in sorted(label_map)]
cls_report = classification_report(all_golds, all_preds, target_names=target_names, digits=4)

print(f"\nClean Accuracy (CACC): {cacc:.4f}")
print("\nClassification Report:")
print(cls_report)
print("Macro F1-score:", macro_f1)

# ===== Flan-T5 Perplexity =====
losses = []
for ex in tqdm(val_samples, desc="Flan-T5 Perplexity"):
    inputs = tokenizer(
        ex["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT,
        return_tensors="pt"
    ).to(device)
    labels = tokenizer(
        ex["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET,
        return_tensors="pt"
    ).input_ids.to(device)
    labels[labels == tokenizer.pad_token_id] = -100
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
        losses.append(outputs.loss.item())

mean_loss = np.mean(losses)
perplexity = np.exp(mean_loss)
print(f"\nFlan-T5 Validation loss: {mean_loss:.4f}")
print(f"Flan-T5 Perplexity: {perplexity:.2f}")

# ===== GPT-2 Perplexity =====
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import math

gpt2_lm_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_lm_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_lm_model.eval()
if torch.cuda.is_available():
    gpt2_lm_model.cuda()

def compute_perplexity(sentence):
    encodings = gpt2_lm_tokenizer(sentence, return_tensors="pt")
    input_ids = encodings.input_ids
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")
    with torch.no_grad():
        outputs = gpt2_lm_model(input_ids, labels=input_ids)
        loss = outputs.loss
    return math.exp(loss.item())

val_texts = [x[text_col] for x in ds[val_split]]
# You can sample for speed, or use all texts
sample_texts = val_texts
val_ppl = [compute_perplexity(s) for s in sample_texts]
mean_ppl = np.mean(val_ppl)
print(f"\nMean Perplexity: {mean_ppl:.2f}")


# ===== Show 10 Sample Predictions =====
print("\n--- 10 Sample Predictions ---")
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")

# ===== Summary Table =====
import pandas as pd
table = pd.DataFrame({
    "Model": ["Flan-T5", "GPT-2"],
    "Clean Accuracy (CACC)": [f"{cacc:.3f}", "-"],
    "Macro F1-score": [f"{macro_f1:.3f}", "-"],
    "Perplexity": [f"{perplexity:.2f}", f"{gpt2_perplexity:.2f}"]
})
print("\n====== SUMMARY TABLE ======\n")
print(table.to_string(index=False))


Using device: cuda


Flan-T5 Eval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1324/1324 [02:03<00:00, 10.71it/s]



Clean Accuracy (CACC): 0.7749

Classification Report:
               precision    recall  f1-score   support

Not Offensive     0.8339    0.8185    0.8261       865
    Offensive     0.6695    0.6928    0.6809       459

     accuracy                         0.7749      1324
    macro avg     0.7517    0.7557    0.7535      1324
 weighted avg     0.7769    0.7749    0.7758      1324

Macro F1-score: 0.7535399368845557


Flan-T5 Perplexity: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1324/1324 [00:37<00:00, 35.39it/s]



Flan-T5 Validation loss: 0.1053
Flan-T5 Perplexity: 1.11

Mean Perplexity: 787.87

--- 10 Sample Predictions ---
Input: Classify if the tweet is offensive or not: @user @user WiiU is not even a real console.
Pred: Not Offensive  |  Gold: Not Offensive

Input: Classify if the tweet is offensive or not: @user @user @user If he is from AZ I would put my money on sex with underage kids.
Pred: Not Offensive  |  Gold: Offensive

Input: Classify if the tweet is offensive or not: @user I thought Canada had strict gun control.  Help me understand what is happening.
Pred: Not Offensive  |  Gold: Not Offensive

Input: Classify if the tweet is offensive or not: @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user @user Following all #Maga patriots please follow back üëç  #LionsDen ü¶Å  #MAGA2KAG üá∫üá∏
Pred: Not Offensive  |  Gold: Not Offensive

Input: Classify if the tweet is offensive or not: 1 Minute of Truth: Gun Control via @user
Pred: Not Offensive 

In [10]:
# !pip install transformers datasets

import os
import shutil
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
)
import torch

# ==== User Config ====
MODEL_NAME = "google/flan-t5-base"  # Or "flan-t5-base", "flan-t5-medium"
DATASET = "ag_news"    # "sst2", "offensive", or "ag_news"
OUTPUT_DIR = "./flan_t5_benign_models"
EPOCHS = 2
BATCH_SIZE = 8
MAX_INPUT = 128
MAX_TARGET = 8
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ==== Dataset & Prompt Config ====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

OUTNAME = f"{DATASET}_benign"
save_path = os.path.join(OUTPUT_DIR, OUTNAME)
zip_path = f"{save_path}.zip"

# ==== Prepare Instruction-Tuned Data ====
def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]]
    }

train_samples = [format_example(e) for e in ds["train"]]
val_samples = [format_example(e) for e in ds[val_split]]
train_dataset = Dataset.from_list(train_samples)
val_dataset = Dataset.from_list(val_samples)

# ==== Tokenizer & Model ====
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    inp = tokenizer(
        batch["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT
    )
    tgt = tokenizer(
        batch["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET
    )
    inp["labels"] = tgt["input_ids"]
    return inp

tokenized_train = train_dataset.map(tokenize_fn)
tokenized_val = val_dataset.map(tokenize_fn)

# ==== Trainer (no built-in eval strategy) ====
training_args = TrainingArguments(
    output_dir=save_path,
    learning_rate=2e-5,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir=os.path.join(save_path, "logs"),
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)
)

trainer.train()

# ==== Save and Zip ====
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
shutil.make_archive(save_path, 'zip', save_path)
print(f"Saved and zipped model to {zip_path}")

# ==== Manual Validation ====
from tqdm import tqdm

model.eval()
n_correct, n_total = 0, 0
print("\n--- Manual validation on first 50 samples ---")
for ex in tqdm(val_samples[:50]):  # Increase to full set if you want
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(model.device)
    labels = ex["target_text"].lower().strip()
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    print(f"Input: {ex['input_text']}\nPred: {pred} | Gold: {labels}\n")
    if pred == labels:
        n_correct += 1
    n_total += 1

print(f"\nManual validation accuracy: {n_correct}/{n_total} = {n_correct/n_total:.3f}")


# ==== Quick Manual Evaluation (show 10 predictions) ====
print("\n--- Sample Predictions ---")
model.eval()
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")


Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

  trainer = Trainer(


Step,Training Loss
500,3.8816
1000,0.0482
1500,0.0398
2000,0.0359
2500,0.0355
3000,0.0314
3500,0.0328
4000,0.0298
4500,0.032
5000,0.0298


KeyboardInterrupt: 

In [12]:
# --- Flan-T5 + GPT-2 Evaluation on GPU (Kaggle/Colab/Local) ---

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

# ===== USER CONFIG =====
MODEL_PATH = "/kaggle/working/flan_t5_benign_models/ag_news_benign"   # change as needed
DATASET = "ag_news"     # "sst2", "offensive", or "ag_news"
MAX_INPUT = 128
MAX_TARGET = 8

# ===== Device =====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# ===== Load Flan-T5 model and tokenizer =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_PATH).to(device)
model.eval()

# ===== Prepare validation set and labels =====
if DATASET == "sst2":
    ds = load_dataset("glue", "sst2")
    text_col = "sentence"
    label_map = {0: "Negative", 1: "Positive"}
    instruction = "Classify the sentiment of the sentence:"
    val_split = "validation"
elif DATASET == "offensive":
    ds = load_dataset("tweet_eval", "offensive")
    text_col = "text"
    label_map = {0: "Not Offensive", 1: "Offensive"}
    instruction = "Classify if the tweet is offensive or not:"
    val_split = "validation"
elif DATASET == "ag_news":
    ds = load_dataset("ag_news")
    text_col = "text"
    label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
    instruction = "Classify the topic of the news article:"
    val_split = "test"
else:
    raise ValueError("Unknown DATASET")

def format_example(ex):
    return {
        "input_text": f"{instruction} {ex[text_col]}",
        "target_text": label_map[ex["label"]],
        "label_id": ex["label"]
    }

val_samples = [format_example(e) for e in ds[val_split]]

# ===== Evaluation: CACC, F1, Report =====
all_preds = []
all_golds = []
for ex in tqdm(val_samples, desc="Flan-T5 Eval"):
    input_ids = tokenizer(
        ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT
    ).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True).lower().strip()
    gold = ex["target_text"].lower().strip()
    pred_id = [k for k, v in label_map.items() if v.lower() == pred]
    pred_id = pred_id[0] if pred_id else -1
    gold_id = ex["label_id"]
    all_preds.append(pred_id)
    all_golds.append(gold_id)

cacc = np.mean([p == g for p, g in zip(all_preds, all_golds)])
macro_f1 = f1_score(all_golds, all_preds, average='macro')
target_names = [label_map[k] for k in sorted(label_map)]
cls_report = classification_report(all_golds, all_preds, target_names=target_names, digits=4)

print(f"\nClean Accuracy (CACC): {cacc:.4f}")
print("\nClassification Report:")
print(cls_report)
print("Macro F1-score:", macro_f1)

# ===== Flan-T5 Perplexity =====
losses = []
for ex in tqdm(val_samples, desc="Flan-T5 Perplexity"):
    inputs = tokenizer(
        ex["input_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_INPUT,
        return_tensors="pt"
    ).to(device)
    labels = tokenizer(
        ex["target_text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_TARGET,
        return_tensors="pt"
    ).input_ids.to(device)
    labels[labels == tokenizer.pad_token_id] = -100
    with torch.no_grad():
        outputs = model(**inputs, labels=labels)
        losses.append(outputs.loss.item())

mean_loss = np.mean(losses)
perplexity = np.exp(mean_loss)
print(f"\nFlan-T5 Validation loss: {mean_loss:.4f}")
print(f"Flan-T5 Perplexity: {perplexity:.2f}")

# ===== GPT-2 Perplexity =====
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import math

gpt2_lm_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_lm_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_lm_model.eval()
if torch.cuda.is_available():
    gpt2_lm_model.cuda()

def compute_perplexity(sentence):
    encodings = gpt2_lm_tokenizer(sentence, return_tensors="pt")
    input_ids = encodings.input_ids
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")
    with torch.no_grad():
        outputs = gpt2_lm_model(input_ids, labels=input_ids)
        loss = outputs.loss
    return math.exp(loss.item())

val_texts = [x[text_col] for x in ds[val_split]]
# You can sample for speed, or use all texts
sample_texts = val_texts
val_ppl = [compute_perplexity(s) for s in sample_texts]
mean_ppl = np.mean(val_ppl)
print(f"\nMean Perplexity: {mean_ppl:.2f}")


# ===== Show 10 Sample Predictions =====
print("\n--- 10 Sample Predictions ---")
for ex in val_samples[:10]:
    input_ids = tokenizer(ex["input_text"], return_tensors="pt", truncation=True, padding=True, max_length=MAX_INPUT).input_ids.to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids, max_new_tokens=MAX_TARGET)
    pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"Input: {ex['input_text']}")
    print(f"Pred: {pred}  |  Gold: {ex['target_text']}\n")

# ===== Summary Table =====
import pandas as pd
table = pd.DataFrame({
    "Model": ["Flan-T5", "GPT-2"],
    "Clean Accuracy (CACC)": [f"{cacc:.3f}", "-"],
    "Macro F1-score": [f"{macro_f1:.3f}", "-"],
    "Perplexity": [f"{perplexity:.2f}", f"{gpt2_perplexity:.2f}"]
})
print("\n====== SUMMARY TABLE ======\n")
print(table.to_string(index=False))


Using device: cuda


Flan-T5 Eval: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7600/7600 [07:06<00:00, 17.84it/s]



Clean Accuracy (CACC): 0.9459

Classification Report:
              precision    recall  f1-score   support

       World     0.9580    0.9489    0.9535      1900
      Sports     0.9838    0.9884    0.9861      1900
    Business     0.9273    0.9126    0.9199      1900
    Sci/Tech     0.9149    0.9337    0.9242      1900

    accuracy                         0.9459      7600
   macro avg     0.9460    0.9459    0.9459      7600
weighted avg     0.9460    0.9459    0.9459      7600

Macro F1-score: 0.9459105679778327


Flan-T5 Perplexity: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7600/7600 [03:33<00:00, 35.64it/s]



Flan-T5 Validation loss: 0.0718
Flan-T5 Perplexity: 1.07

Mean Perplexity: 95.74

--- 10 Sample Predictions ---
Input: Classify the topic of the news article: Fears for T N pension after talks Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.
Pred: Business  |  Gold: Business

Input: Classify the topic of the news article: The Race is On: Second Private Team Sets Launch Date for Human Spaceflight (SPACE.com) SPACE.com - TORONTO, Canada -- A second\team of rocketeers competing for the  #36;10 million Ansari X Prize, a contest for\privately funded suborbital space flight, has officially announced the first\launch date for its manned rocket.
Pred: Sci/Tech  |  Gold: Sci/Tech

Input: Classify the topic of the news article: Ky. Company Wins Grant to Study Peptides (AP) AP - A company founded by a chemistry researcher at the University of Louisville won a grant to develop a method of producing better peptides, whi