In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Load tokenizer and model
model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def get_toxicity_scores(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        scores = F.softmax(outputs.logits, dim=1)  # Convert logits to probabilities
        toxic_probs = scores[:, 1].tolist()  # Probability of the "toxic" class
    return toxic_probs

# Example
sentences = [
    "I hope you have a great day!",
    "You're an idiot and no one likes you.",
    "I love how you think.",
    "Shut up, you're the worst."
]

scores = get_toxicity_scores(sentences)
for sent, score in zip(sentences, scores):
    print(f"Toxicity: {score:.2f} | \"{sent}\"")

tokenizer_config.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/811 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Toxicity: 0.08 | "I hope you have a great day!"
Toxicity: 0.00 | "You're an idiot and no one likes you."
Toxicity: 0.09 | "I love how you think."
Toxicity: 0.00 | "Shut up, you're the worst."


In [2]:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("unitary/toxic-bert")
print(config.id2label)

{0: 'toxic', 1: 'severe_toxic', 2: 'obscene', 3: 'threat', 4: 'insult', 5: 'identity_hate'}


In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

model_name = "unitary/toxic-bert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

labels = [
    "toxic", "severe_toxic", "obscene",
    "threat", "insult", "identity_hate"
]

def get_toxicity_scores(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        probs = torch.sigmoid(outputs.logits)  # Sigmoid for multi-label classification
        return probs.tolist()

# Example usage
sentences = [
    "I hope you have a great day!",
    "You're an idiot and no one likes you.",
    "I love how you think.",
    "Shut up, you're the worst."
]

scores = get_toxicity_scores(sentences)

for sent, score_vec in zip(sentences, scores):
    print(f"\n\"{sent}\"")
    for label, score in zip(labels, score_vec):
        if score > 0.3:  # You can adjust threshold
            print(f"  → {label}: {score:.2f}")


"I hope you have a great day!"

"You're an idiot and no one likes you."
  → toxic: 0.99
  → obscene: 0.73
  → insult: 0.95

"I love how you think."

"Shut up, you're the worst."
  → toxic: 0.98
  → insult: 0.82
