In [8]:
# ── cell 1 ── Setup & imports
import os
import yaml
import torch
import pandas as pd
from transformers import T5ForConditionalGeneration, T5Tokenizer
from data_processing import load_dataset_from_disk

# Lift every pandas display limit so the result tables below are rendered in
# full (be careful with very large tables!).
for _display_option in (
    'display.max_columns',   # show every column
    'display.max_rows',      # show all rows
    'display.max_colwidth',  # don't truncate column contents
    'display.width',         # allow the display to use the full browser width
):
    pd.set_option(_display_option, None)

# ── cell 2 ── Configuration
# Read the YAML config. The context manager closes the file handle as soon as
# parsing is done instead of leaving it open for the lifetime of the kernel.
with open("main_config.yml", "r") as config_file:
    cfg = yaml.safe_load(config_file)

# Base SFT output directory taken from the config, e.g. "./outputs/SFT"
sft_output_dir = cfg["sft_params"]["output_dir"]
# Checkpoint sub-directory evaluated in this notebook; `output_dir` ends up
# pointing at the checkpoint itself, which is what the model is loaded from.
output_dir = os.path.join(sft_output_dir, "checkpoint-6800")

# Load model & tokenizer
# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained(output_dir).to(device)
# NOTE(review): the tokenizer is loaded from the base hub model rather than
# from `output_dir` — confirm the fine-tuning run did not alter the vocabulary.
tokenizer = T5Tokenizer.from_pretrained("google/t5-v1_1-base")

# ── cell 3 ── Load & sample the evaluation split
eval_ds = load_dataset_from_disk("eval_dataset")

# Number of examples you want to inspect
n_trials = 10
# Clamp to the size of the split so the selection below can never index past
# the end when the evaluation set holds fewer than `n_trials` rows.
# (assumes `eval_ds` supports len(), as its shuffle/select usage suggests)
n_sampled = min(n_trials, len(eval_ds))
# Shuffle with a fixed seed (same examples on every run) & pick the first ones
sampled = eval_ds.shuffle(seed=42).select(range(n_sampled))

# ── cell 4 ── Run generation & collect results
def detoxify(text, max_length=64, num_beams=4):
    """Generate a detoxified rewrite of ``text`` with the fine-tuned model.

    Relies on the notebook-level ``model``, ``tokenizer`` and ``device``
    created in the configuration cell.

    Args:
        text: Sentence to rewrite; it is prefixed with ``"detoxify: "``
            before tokenization.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``
            (beam search; optional, used to improve quality).

    Returns:
        The first generated sequence decoded to a string with special
        tokens skipped.
    """
    # Tokenize + move to device
    enc = tokenizer(
        f"detoxify: {text}",
        return_tensors="pt",
        truncation=True,
        padding=True,
    ).to(device)
    # Generate
    out_ids = model.generate(
        input_ids=enc.input_ids,
        attention_mask=enc.attention_mask,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(out_ids[0], skip_special_tokens=True)


# One record per sampled example: the toxic source sentence, the reference
# rewrite stored in the dataset, and the model's own rewrite.
results = [
    {
        "original_toxic": ex["toxic"],
        "reference_neutral": ex["neutral"],
        "generated_neutral": detoxify(ex["toxic"]),
    }
    for ex in sampled
]

# ── cell 5 ── Display as a DataFrame
df = pd.DataFrame(results)
df

Unnamed: 0,original_toxic,reference_neutral,generated_neutral
0,you lack of understanding about jack shit means nothing .,you lack of understanding means nothing.,You lack of understanding about this means nothing
1,- gotta small ass cut & shits bleedin alot -,got a small cut and it’s bleeding a lot,- gotta small cut & bleed alot -
2,"omg , stephens is so silly .","omg , stephens is So wrong.","omg, stephens is so silly."
3,that 's a long ass lease .,That is a very long lease.,That's a long lease.
4,"well put a fuckin' coat on , son .","well put a coat on , son .","Well put a coat on, son"
5,and i thought south park was just making shit up .,And I thought Southpark was just making stuff up,and i thought south park was just making things up.
6,they would have taken out every bad guy holding a gun .,they would have taken out every guy holding a gun,they would have taken out every bad guy holding a gun.
7,"holy fuck , do those kids need gold - lined sneakers or something ?",Do these kids need gold-lined sneakers or something?,Do those kids need gold-lined sneakers or something?
8,sick and tired of the laughable or poisonous stuff promoted by these religious psychos and attributing it to mary .,"I get uncomfortable with this and they attribute it to Mary, like it isn't bad",sick and tired of the jokeable stuff promoted by these religious people and attributing it to mary.
9,its like telling your girl friend shes fat then asking her to marry you .,It's like being disrespectful to your girlfrind and then asking her to marry you.,It's like telling your friend she's fat then asking her to marry you.


In [13]:
# ── cell 6 ── Try your own inputs
# Toxic sentences to run through the model; edit this list freely.
custom_inputs = [
    "Jews are abomination for this world!",
    "I like fucking cute Hungarians",
    "I wanna fuck Hungarians' fat ass ",
    # add as many as you like...
]

# Decoding settings shared by every custom input.
generation_settings = {"max_length": 64, "num_beams": 4, "early_stopping": True}

# Rewrite each sentence and keep the (input, output) pair.
custom_results = []
for toxic_text in custom_inputs:
    encoded = tokenizer(
        f"detoxify: {toxic_text}",
        return_tensors="pt",
        truncation=True,
        padding=True,
    ).to(device)
    generated_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_settings,
    )
    custom_results.append({
        "original_toxic": toxic_text,
        "generated_neutral": tokenizer.decode(generated_ids[0], skip_special_tokens=True),
    })

# Show the pairs as a table (last expression of the cell is rendered).
pd.DataFrame(custom_results)

Unnamed: 0,original_toxic,generated_neutral
0,Jews are abomination for this world!,Jews are bad for this world!
1,I like fucking cute Hungarians,I like cute Hungarys
2,I wanna fuck Hungarians' fat ass,I wanna punish Hungarians
