In [3]:
import transformers
import torch
import pandas as pd
import os
from datasets import load_dataset, Dataset
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, EarlyStoppingCallback
from transformers import T5Tokenizer, T5ForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset
import pandas as pd


In [4]:
## Read paradetox.tsv (expected in the current working directory) as a pandas dataframe
current_dir = os.getcwd()
data_dir = os.path.join(current_dir, 'data')  # not used by this cell; the TSV is read from current_dir

# Build the path with os.path.join rather than string concatenation so the
# separator is always correct, and rename 'neutral1' -> 'neutral' in the same
# expression instead of mutating the frame in place.
df = pd.read_csv(os.path.join(current_dir, 'paradetox.tsv'), sep='\t').rename(
    columns={'neutral1': 'neutral'}
)

# Keep only the (toxic, neutral) sentence pair and expose it as a list of row dicts.
filtered_df = df[["toxic", "neutral"]]
df_dict = filtered_df.to_dict(orient="records")

In [6]:
from datasets import Dataset
# Step 1: pivot the list of row records into one list per column
dict_of_lists = {
    column: [record[column] for record in df_dict]
    for column in ('toxic', 'neutral')
}

# Step 2: wrap the columns in a Hugging Face Dataset
dataset = Dataset.from_dict(dict_of_lists)

# Step 3: hold out 10% of the rows for testing (fixed seed for a repeatable split).
# Note that `dataset` is rebound to the training portion afterwards.
split = dataset.train_test_split(test_size=0.1, seed=42)
dataset = split["train"]
test_dataset = split["test"]

10734
1193


In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the toxicity classifier and its tokenizer, and put the model in evaluation mode.
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name).eval()

# map indices → labels
id2label = model.config.id2label
print("Label mapping:", id2label)
# e.g. {0:'toxic', 1:'severe_toxic', 2:'obscene', 3:'threat', 4:'insult', 5:'identity_hate'}

# Labels whose probability drop is tracked, as label name → logit index:
# 0="toxic", 1="severe_toxic" and 4="insult".
target_idxs = dict(toxic=0, severe_toxic=1, insult=4)

def get_probs(text: str, device=None):
    """Run the global `model` on `text` and return per-label sigmoid probabilities.

    Args:
        text: Input string to classify.
        device: Device the tokenized inputs are moved to. When omitted (None),
            the device the model currently lives on is used, so the call keeps
            working after `detect_toxic_tokens` has moved the model to the GPU.
            An explicit value (e.g. "cpu") is honoured as before; the model
            must already be on that device.

    Returns:
        A 1-D tensor of shape (num_labels,) holding sigmoid-ed logits.
    """
    if device is None:
        # Inputs must sit on the same device as the weights, otherwise the
        # forward pass raises a device-mismatch error.
        device = model.device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        # Drop the batch dimension of the single-example batch.
        logits = model(**inputs).logits.squeeze(0)
    return torch.sigmoid(logits)

def detect_toxic_tokens(text: str,
                        threshold: float = 0.02,
                        device: str = None):
    """Leave-one-out attribution of toxicity to individual tokenizer tokens.

    Each token is removed in turn and the text re-scored; for every tracked
    label in `target_idxs` the drop relative to the original probability is
    computed, and the label with the largest drop is reported for that token.

    Side effects: moves the global `model` to `device` and prints the original
    probabilities plus one diagnostic line per token.

    Args:
        text: Sentence to analyse.
        threshold: Minimum probability drop for a token to be reported.
        device: Target device; when falsy, "cuda" is used if available, else "cpu".

    Returns:
        List of (token, label, drop rounded to 4 decimals) tuples.
    """
    if not device:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # 1) baseline probabilities for the untouched text
    baseline = get_probs(text, device)
    print("Original probs:", {label: baseline[col].item()
                               for label, col in target_idxs.items()})

    pieces = tokenizer.tokenize(text)
    found = []

    # 2) ablate one token at a time
    for pos in range(len(pieces)):
        piece = pieces[pos]
        ablated_text = tokenizer.convert_tokens_to_string(pieces[:pos] + pieces[pos + 1:])
        ablated = get_probs(ablated_text, device)

        drops = {}
        for label, col in target_idxs.items():
            drops[label] = baseline[col].item() - ablated[col].item()

        # label with the largest drop (first one wins on ties)
        top_label = max(drops, key=drops.get)
        top_drop = drops[top_label]
        print(f" remove {piece:>10} → {top_label} Δ={top_drop:.4f}")

        if not top_drop < threshold:
            found.append((piece, top_label, round(top_drop, 4)))

    return found

# Example: run the leave-one-out detector on one sentence and show what it flags
sentence = "I hate your fucking black ass!"
markers = detect_toxic_tokens(sentence, 0.02)
print("Detected toxic markers:", markers)

Label mapping: {0: 'toxic', 1: 'severe_toxic', 2: 'obscene', 3: 'threat', 4: 'insult', 5: 'identity_hate'}
Original probs: {'toxic': 0.9989174604415894, 'severe_toxic': 0.6191971302032471, 'insult': 0.9701922535896301}
 remove          i → toxic Δ=-0.0000
 remove       hate → severe_toxic Δ=0.0855
 remove       your → severe_toxic Δ=0.1543
 remove    fucking → severe_toxic Δ=0.2032
 remove      black → severe_toxic Δ=0.2380
 remove        ass → severe_toxic Δ=0.0202
 remove          ! → toxic Δ=-0.0002
Detected toxic markers: [('hate', 'severe_toxic', 0.0855), ('your', 'severe_toxic', 0.1543), ('fucking', 'severe_toxic', 0.2032), ('black', 'severe_toxic', 0.238), ('ass', 'severe_toxic', 0.0202)]


In [27]:
import torch
import re
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# ── 1) Load detector & fill-mask ─────────────────────────────────────────
detector_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(detector_name)
detector = AutoModelForSequenceClassification.from_pretrained(detector_name)
detector.eval()

# Pipeline device index: 0 when CUDA is available, otherwise -1.
device_id = -1
if torch.cuda.is_available():
    device_id = 0

# Masked-LM used to propose replacement words; it shares the detector's tokenizer.
fill_mask = pipeline(
    "fill-mask",
    model="bert-base-uncased",
    tokenizer=tokenizer,
    device=device_id,
)

# multi-label indices we care about (label name → logit index)
target_idxs = dict(toxic=0, severe_toxic=1, insult=4)

def get_sigmoid_probs(text: str) -> torch.Tensor:
    """Score `text` with the global `detector` and return per-label sigmoid probabilities.

    The batch dimension of the single-example batch is squeezed away, so the
    result is a 1-D tensor with one entry per detector label.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        output = detector(**encoded)
    return torch.sigmoid(output.logits.squeeze(0))

def detect_toxic_tokens(text: str, threshold: float = 0.02):
    """
    Leave-one-out attribution over tokenizer tokens.

    For each token, the text is re-scored without it and the drop in
    probability is computed for every label in `target_idxs`. The label with
    the largest drop is kept for that token, and the token is reported when
    that drop reaches `threshold`.

    Returns a list of (token, label, drop) tuples in token order.
    """
    baseline = get_sigmoid_probs(text)
    pieces = tokenizer.tokenize(text)

    found = []
    for pos, piece in enumerate(pieces):
        without_piece = pieces[:pos] + pieces[pos + 1:]
        ablated = get_sigmoid_probs(tokenizer.convert_tokens_to_string(without_piece))

        drops = {}
        for label, col in target_idxs.items():
            drops[label] = baseline[col].item() - ablated[col].item()

        # label with the largest drop (first one wins on ties)
        top_label = max(drops, key=drops.get)
        top_drop = drops[top_label]
        if not top_drop < threshold:
            found.append((piece, top_label, top_drop))
    return found

def rewrite_one_by_one(text: str, threshold: float = 0.02, top_k: int = 10):
    """
    Iteratively detoxify `text` one token at a time.

    Each round:
      1. detect all toxic subwords with `detect_toxic_tokens`
      2. pick the one with the highest Δ that can actually be located in the
         text as a whole word (markers such as WordPiece continuations or
         trailing punctuation never match the word-boundary regex and are
         skipped instead of being sent, unmasked, to the fill-mask pipeline)
      3. mask its first occurrence
      4. fill with `top_k` candidates and keep the one with lowest P(toxic)

    The loop stops when no Δ ≥ threshold remains, when none of the detected
    markers can be masked, or when a rewrite reproduces a text already seen
    (which would otherwise repeat forever).

    Args:
        text: Sentence to rewrite.
        threshold: Minimum leave-one-out drop for a token to count as toxic.
        top_k: Number of fill-mask candidates considered per round.

    Returns:
        The chain of rewrites, starting with the original text.
    """
    history = [text]
    while True:
        # 1) detect
        markers = detect_toxic_tokens(text, threshold)
        if not markers:
            break

        # 2) + 3) walk markers from worst to mildest and mask the first one
        # that the regex can find. sorted() is stable, so on ties the earliest
        # marker is tried first, the same one max() would have returned.
        masked = None
        for tok, lab, delta in sorted(markers, key=lambda m: m[2], reverse=True):
            pat = re.compile(r"\b" + re.escape(tok) + r"\b", flags=re.IGNORECASE)
            attempt, n_subs = pat.subn(tokenizer.mask_token, text, count=1)
            if n_subs:
                masked = attempt
                break
        if masked is None:
            # Nothing could be masked; calling fill_mask on text without a
            # mask token would raise, so stop here.
            break

        print(f"Masking '{tok}' (label={lab}, Δ={delta:.4f})")
        print("🔒 Masked:   ", masked)

        # 4) fill and pick best detoxified synonym
        candidates = fill_mask(masked, top_k=top_k)
        best_seq   = None
        best_score = float("inf")

        for cand in candidates:
            seq = cand["sequence"]
            # skip if it simply re-inserts the same toxic token
            if tok.lower() in cand["token_str"].lower():
                continue
            # score toxicity of the filled sequence
            score = get_sigmoid_probs(seq)[target_idxs["toxic"]].item()
            if score < best_score:
                best_score = score
                best_seq   = seq

        # fallback: if no suitable candidate, just remove the mask
        if best_seq is None:
            best_seq = masked.replace(tokenizer.mask_token, "", 1)

        # Detection and filling depend only on the current text, so landing on
        # a text we have already processed means the loop would cycle forever.
        if best_seq in history:
            break

        print("✍️  Filled:   ", best_seq)
        text = best_seq
        history.append(text)

    return history

# ── Example ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    sentences = [
        "I hate your fucking black ass!",
        "You're such an idiot and a total disgrace!",
        "Go back to where you came from!",
    ]
    # Pair every sentence with its original P(toxic) (label index 0),
    # then keep the sentence with the highest score.
    toxic_scores = (get_sigmoid_probs(s)[0].item() for s in sentences)
    scored = list(zip(sentences, toxic_scores))
    most_toxic = max(scored, key=lambda pair: pair[1])[0]
    print("Most toxic input:", most_toxic)

    # Rewrite it token by token and list every intermediate sentence.
    chain = rewrite_one_by_one(most_toxic, threshold=0.02, top_k=10)
    print("\nRewrite steps:")
    for i, step in enumerate(chain):
        print(f" {i:>2d}. {step}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


Most toxic input: I hate your fucking black ass!
Masking 'black' (label=severe_toxic, Δ=0.2380)
🔒 Masked:    I hate your fucking [MASK] ass!
✍️  Filled:    i hate your fucking smart ass!
Masking 'your' (label=insult, Δ=0.2907)
🔒 Masked:    i hate [MASK] fucking smart ass!
✍️  Filled:    i hate my fucking smart ass!
Masking 'fucking' (label=insult, Δ=0.2637)
🔒 Masked:    i hate my [MASK] smart ass!
✍️  Filled:    i hate my old smart ass!
Masking 'ass' (label=toxic, Δ=0.6321)
🔒 Masked:    i hate my old smart [MASK]!
✍️  Filled:    i hate my old smarts!
Masking 'hate' (label=toxic, Δ=0.2605)
🔒 Masked:    i [MASK] my old smarts!
✍️  Filled:    i like my old smarts!

Rewrite steps:
  0. I hate your fucking black ass!
  1. i hate your fucking smart ass!
  2. i hate my fucking smart ass!
  3. i hate my old smart ass!
  4. i hate my old smarts!
  5. i like my old smarts!


In [3]:
import torch
from transformers import (
    MarianMTModel, MarianTokenizer,
    AutoTokenizer, AutoModelForSequenceClassification
)

# ── Device setup ─────────────────────────────────────────────────────────
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ── 1) Load the toxicity detector ────────────────────────────────────────
detector_name = "unitary/toxic-bert"
detector_tokenizer = AutoTokenizer.from_pretrained(detector_name)
detector_model = AutoModelForSequenceClassification.from_pretrained(detector_name)
detector_model.to(device)   # nn.Module.to moves the module in place
detector_model.eval()

def get_sigmoid_probs(text: str) -> torch.Tensor:
    """
    Score `text` with the toxicity detector on `device`.

    Returns a 1-D tensor of independent sigmoid probabilities, one per
    detector label, in the order
    [toxic, severe_toxic, obscene, threat, insult, identity_hate].
    """
    encoded = detector_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    encoded = encoded.to(device)
    with torch.no_grad():
        output = detector_model(**encoded)
    return torch.sigmoid(output.logits.squeeze(0))

# ── 2) Load the back-translation models ─────────────────────────────────
# Checkpoint names for the two translation directions
en_fr_name = "Helsinki-NLP/opus-mt-en-fr"   # English → French
fr_en_name = "Helsinki-NLP/opus-mt-fr-en"   # French → English

# English → French: tokenizer and model, moved to `device` and set to eval mode
en_fr_tokenizer = MarianTokenizer.from_pretrained(en_fr_name)
en_fr_model = MarianMTModel.from_pretrained(en_fr_name)
en_fr_model.to(device)
en_fr_model.eval()

# French → English: tokenizer and model, moved to `device` and set to eval mode
fr_en_tokenizer = MarianTokenizer.from_pretrained(fr_en_name)
fr_en_model = MarianMTModel.from_pretrained(fr_en_name)
fr_en_model.to(device)
fr_en_model.eval()

# ── 3) Back-translation with toxicity scoring ───────────────────────────
def back_translate_detox(
    text: str,
    num_samples: int = 8,
    top_k: int = 50,
    top_p: float = 0.95
) -> str:
    """
    Detoxify `text` by round-trip translation and toxicity re-ranking.

    1) Sample `num_samples` English→French translations (stochastic).
    2) For each French, sample one French→English back-translation.
    3) Score each back-translated English with P(toxic).
    4) Return the candidate with the lowest toxicity score.

    Args:
        text: English sentence to detoxify.
        num_samples: Number of round-trip candidates to generate. When it is
            zero or negative there is nothing to rank and `text` is returned
            unchanged (previously `min()` failed on the empty candidate list).
        top_k: Top-k cutoff passed to both sampling generations.
        top_p: Nucleus cutoff passed to both sampling generations.

    Returns:
        The back-translated candidate with the lowest P(toxic).

    Side effects: prints every candidate with its score and the chosen one.
    """
    if num_samples <= 0:
        return text

    # Sampling settings shared by both translation directions.
    gen_kwargs = dict(max_length=128, do_sample=True, top_k=top_k, top_p=top_p)

    # 1) English → French samples
    # truncation=True matches the detector tokenizer calls and keeps over-long
    # inputs within the tokenizer's length limit.
    en_fr_inputs = en_fr_tokenizer(
        text, return_tensors="pt", padding=True, truncation=True
    ).to(device)
    fr_variants = []
    for _ in range(num_samples):
        fr_out = en_fr_model.generate(**en_fr_inputs, **gen_kwargs)
        fr_variants.append(en_fr_tokenizer.decode(fr_out[0], skip_special_tokens=True))

    # 2) French → English back-translations
    eng_candidates = []
    for fr in fr_variants:
        fr_en_inputs = fr_en_tokenizer(
            fr, return_tensors="pt", padding=True, truncation=True
        ).to(device)
        en_out = fr_en_model.generate(**fr_en_inputs, **gen_kwargs)
        eng_candidates.append(fr_en_tokenizer.decode(en_out[0], skip_special_tokens=True))

    # 3) Score each candidate and pick lowest P(toxic).
    # Sampling often yields identical strings, so each distinct candidate is
    # sent through the detector only once; the printed list is unchanged.
    score_cache = {}
    scored = []
    for cand in eng_candidates:
        if cand not in score_cache:
            score_cache[cand] = get_sigmoid_probs(cand)[0].item()  # index 0 = "toxic"
        scored.append((cand, score_cache[cand]))

    print("Back-translation candidates (P(toxic)):")
    for cand, score in scored:
        print(f" {score:.4f} → {cand}")

    best, best_score = min(scored, key=lambda x: x[1])
    print(f"\nChosen detoxified: ({best_score:.4f}) {best}")
    return best

# ── 4) Example usage ─────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo inputs, each pushed through the round-trip detoxifier in turn.
    toxic_sentences = [
        "I hate your fucking black ass!",
        "You're such an idiot and a total disgrace!",
        "Go back to where you came from!"
    ]
    for sent in toxic_sentences:
        print("\nOriginal:", sent)
        # positional arguments: num_samples, top_k, top_p
        detox = back_translate_detox(sent, 8, 50, 0.95)
        print("Detoxified:", detox)


Original: I hate your fucking black ass!
Back-translation candidates (P(toxic)):
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!
 0.9989 → I hate your fucking black ass!

Chosen detoxified: (0.9989) I hate your fucking black ass!
Detoxified: I hate your fucking black ass!

Original: You're such an idiot and a total disgrace!
Back-translation candidates (P(toxic)):
 0.9902 → You're an idiot and a disgrace!
 0.9899 → You're an idiot and a total shame!
 0.9901 → You're such an idiot and a disgrace!
 0.9901 → You're such an idiot and a disgrace!
 0.9899 → You're an idiot and a total shame!
 0.9901 → You're such an idiot and a disgrace!
 0.9899 → You're an idiot and a total shame!
 0.9899 → You're an idiot and a total shame!

Chosen detoxified: (0.9899) Y

In [10]:
import pandas as pd, json
from sklearn.model_selection import train_test_split

import os

# Export ParaDetox as DiffuSeq-style JSONL: one {"src": toxic, "trg": neutral}
# object per line, split 90/10 into train/valid with a fixed random state.
df = pd.read_csv("paradetox.tsv", sep="\t").rename(columns={'neutral1':'neutral'})
train, valid = train_test_split(df, test_size=0.1, random_state=42)

# Create the target directory up front so open(..., "w") cannot fail on a
# missing path.
out_dir = "DiffuSeq/datasets/detox"
os.makedirs(out_dir, exist_ok=True)

# The loop variable is called split_name so it does not overwrite the `split`
# object created by the earlier train_test_split cell.
for split_name, subset in (("train", train), ("valid", valid)):
    # ensure_ascii=False writes raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of depending on the platform's locale default.
    with open(f"{out_dir}/{split_name}.jsonl", "w", encoding="utf-8") as fout:
        # Walk the two columns in parallel rather than materialising a Series per row.
        for src, trg in zip(subset["toxic"], subset["neutral"]):
            rec = {"src": src, "trg": trg}
            fout.write(json.dumps(rec, ensure_ascii=False)+"\n")