In [None]:
!pip install datasets torchmetrics

In [None]:
from huggingface_hub import login
import os
token = os.getenv("HF_TOKEN")
login(token=token)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["user_input"], truncation=True, padding=True)

toxic = load_dataset("lmsys/toxic-chat", 'toxicchat0124')
tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)

# Rename the 'toxicity' column to 'label' for compatibility
tokenized_test_dataset = tokenized_test_dataset.rename_column("toxicity", "label")
columns_to_remove = ['conv_id', 'model_output', 'human_annotation', 'jailbreaking', 'openai_moderation', 'user_input']
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [11]:
import torch
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import numpy as np
from metrics import compute_metrics

def evaluate_model(model, dataset, collator):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=16, collate_fn=collator)
    all_preds = []
    all_labels = []
    cnt = 0
    
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating", leave=False):
            inputs = {k: v.to(model.device) for k, v in batch.items() if k in tokenizer.model_input_names}
            labels = batch["labels"].to(model.device)
            
            # Get model predictions
            outputs = model(**inputs)
            logits = outputs.logits
            all_preds.append(logits.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
    
    # Concatenate predictions and labels
    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    return compute_metrics(all_preds, all_labels)

In [None]:
from transformers import AutoModelForSequenceClassification

base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("inxoy/my_awesome_model")

base_model_metrics = evaluate_model(base_model, tokenized_test_dataset)
fine_tuned_model_metrics = evaluate_model(fine_tuned_model, tokenized_test_dataset)

print("Base Model Metrics:", base_model_metrics)
print("Fine-Tuned Model Metrics:", fine_tuned_model_metrics)

In [None]:
import pickle

# Save metrics as pickled files
with open('results/tc_distilbert_base_uncased.pkl', 'wb') as base_file:
    pickle.dump(base_model_metrics, base_file)

with open('results/tc_distilbert_base_uncased_ft.pkl', 'wb') as base_file:
    pickle.dump(fine_tuned_model_metrics, base_file)

In [None]:
from transformers import AutoModelForSequenceClassification

t = AutoModelForSequenceClassification.from_pretrained("tensor-trek/distilbert-toxicity-classifier")
m = evaluate_model(t, tokenized_test_dataset)
print("Metrics:", m)

In [None]:
with open('results/tc_distilbert_base_tensor_trek.pkl', 'wb') as base_file:
    pickle.dump(m, base_file)

# Do the same for RTP

In [27]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Define your toxicity threshold
TOXICITY_THRESHOLD = 0.7

# Define the preprocessing function
def preprocess_function(examples):
    # Initialize lists to store the results
    texts = []
    labels = []
    
    # Iterate over each example in the batch
    for prompt in examples["prompt"]:
        # Extract text and toxicity
        text = prompt.get("text", "")
        toxicity = prompt.get("toxicity", 0.0)
        
        # Determine the label based on the toxicity threshold
        label = 1 if toxicity and toxicity > TOXICITY_THRESHOLD else 0
        
        # Append the results to the lists
        texts.append(text)
        labels.append(label)
    
    # Tokenize the texts
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True)
    
    # Add the labels to the tokenized inputs
    tokenized_inputs["label"] = labels
    
    return tokenized_inputs

# Load the dataset
toxic = load_dataset("allenai/real-toxicity-prompts")

# Split the dataset
toxic = toxic["train"].train_test_split(test_size=0.2, seed=1337)

# Map the preprocessing function to the dataset
tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)

features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("inxoy/my_awesome_model")
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
fine_tuned_model_metrics = evaluate_model(fine_tuned_model, tokenized_test_dataset, data_collator)
base_model_metrics = evaluate_model(base_model, tokenized_test_dataset, data_collator)
print("Fine-Tuned Model Metrics:", fine_tuned_model_metrics)
print("Base Model Metrics:", base_model_metrics)

In [None]:
with open('results/rtp_distilbert_base.pkl', 'wb') as base_file:
    pickle.dump(base_model_metrics, base_file)

with open('results/rtp_distilbert_base_ft.pkl', 'wb') as base_file:
    pickle.dump(fine_tuned_model_metrics, base_file)

In [28]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("tensor-trek/distilbert-toxicity-classifier")
m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("Metrics:", m)

                                                               

Metrics: {'accuracy': 0.9427321635074665, 'precision': 0.7503639010189228, 'recall': 0.8198807157057654, 'f1': 0.7835835075052252, 'true_positive': 2062, 'true_negative': 16688, 'false_positive': 686, 'false_negative': 453, 'AUROC': 0.9712755583455637, 'AUPRC': 0.8782779632118303, 'MCC': 0.7516362808994491, 'ECE': 0.037397634238004684, 'MCE': 0.2185865044593811, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 

In [29]:
with open('results/rtp_distilbert_base_tensor_trek', 'wb') as base_file:
    pickle.dump(m , base_file)

# ToxDect Roberta

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Xuhui/ToxDect-roberta-large")
model = AutoModelForSequenceClassification.from_pretrained("Xuhui/ToxDect-roberta-large")

def preprocess_function(examples):
    return tokenizer(examples["user_input"], truncation=True, padding=True)

toxic = load_dataset("lmsys/toxic-chat", 'toxicchat0124')
tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)

# Rename the 'toxicity' column to 'label' for compatibility
tokenized_test_dataset = tokenized_test_dataset.rename_column("toxicity", "label")
columns_to_remove = ['conv_id', 'model_output', 'human_annotation', 'jailbreaking', 'openai_moderation', 'user_input']
tokenized_test_dataset = tokenized_test_dataset.remove_columns(columns_to_remove)
tokenized_test_dataset = tokenized_test_dataset.select(range(500))


In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("hateBERT Metrics:", m)

In [None]:
import pickle 

with open('results/tc_toxdec_roberta.pkl', 'wb') as f:
    pickle.dump(m, f)

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("Xuhui/ToxDect-roberta-large")

# Define your toxicity threshold
TOXICITY_THRESHOLD = 0.7

# Define the preprocessing function
def preprocess_function(examples):
    # Initialize lists to store the results
    texts = []
    labels = []
    
    # Iterate over each example in the batch
    for prompt in examples["prompt"]:
        # Extract text and toxicity
        text = prompt.get("text", "")
        toxicity = prompt.get("toxicity", 0.0)
        
        # Determine the label based on the toxicity threshold
        label = 1 if toxicity and toxicity > TOXICITY_THRESHOLD else 0
        
        # Append the results to the lists
        texts.append(text)
        labels.append(label)
    
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True)
    tokenized_inputs["label"] = labels
    
    return tokenized_inputs

# Load the dataset
toxic = load_dataset("allenai/real-toxicity-prompts")

# Split the dataset
toxic = toxic["train"].train_test_split(test_size=0.2, seed=1337)

# Map the preprocessing function to the dataset
tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)

features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("hateBERT Metrics:", m)

In [None]:
with open('results/rtp_toxdec_roberta.pkl', 'wb') as f:
    pickle.dump(m, f)

# Civil Comments

In [22]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Preprocessing function
def preprocess_function(examples):
    toxicity = examples.get("toxicity", [0.0])  # Get the list of toxicity scores, defaulting to [0.0]
    texts = examples.get("text", [""])  # Get the list of texts, defaulting to [""]
    
    tokenized_inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    
    tokenized_inputs["label"] = [1 if score > 0.7 else 0 for score in toxicity]
    
    return tokenized_inputs

# Load dataset
toxic = load_dataset("google/civil_comments")

tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)
tokenized_test_dataset

Dataset({
    features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit', 'input_ids', 'attention_mask', 'label'],
    num_rows: 97320
})

In [23]:
features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

tokenized_test_dataset = tokenized_test_dataset.select(range(2000))


In [24]:
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("inxoy/my_awesome_model")
fine_tuned_model_metrics = evaluate_model(fine_tuned_model, tokenized_test_dataset, data_collator)
print("Fine-Tuned Model Metrics:", fine_tuned_model_metrics)

                                                             

Fine-Tuned Model Metrics: {'accuracy': 0.8635, 'precision': 0.06746031746031746, 'recall': 0.3090909090909091, 'f1': 0.11074918566775244, 'true_positive': 17, 'true_negative': 1710, 'false_positive': 235, 'false_negative': 38, 'AUROC': 0.7319654124795513, 'AUPRC': 0.08145018203345197, 'MCC': 0.09277849729395311, 'ECE': 0.12034974247217178, 'MCE': 0.84041428565979, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0



In [25]:
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

base_model_metrics = evaluate_model(base_model, tokenized_test_dataset, data_collator)
print("Base Model Metrics:", base_model_metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                             

Base Model Metrics: {'accuracy': 0.3285, 'precision': 0.02225519287833828, 'recall': 0.5454545454545454, 'f1': 0.04276550249465431, 'true_positive': 30, 'true_negative': 627, 'false_positive': 1318, 'false_negative': 25, 'AUROC': 0.5784342136013088, 'AUPRC': 0.043935142611244676, 'MCC': -0.04611471382978491, 'ECE': 0.47463318705558777, 'MCE': 0.4786135256290436, 'raw_data': {'predictions': [1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 



In [28]:
import pickle 

with open('results/cc_distilbert_base.pkl', 'wb') as f:
    pickle.dump(base_model_metrics, f)

In [29]:
with open('results/cc_distilbert_ft.pkl', 'wb') as f:
    pickle.dump(fine_tuned_model_metrics, f)

In [30]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("tensor-trek/distilbert-toxicity-classifier")
m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("Metrics:", m)

                                                             

Metrics: {'accuracy': 0.96, 'precision': 0.39316239316239315, 'recall': 0.8363636363636363, 'f1': 0.5348837209302325, 'true_positive': 46, 'true_negative': 1874, 'false_positive': 71, 'false_negative': 9, 'AUROC': 0.9553260107501752, 'AUPRC': 0.6717283378057822, 'MCC': 0.5573614357065113, 'ECE': 0.07052043825387955, 'MCE': 0.5188865065574646, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
with open('results/cc_distilbert_trek.pkl', 'wb') as f:
    pickle.dump(m, f)

In [46]:
from transformers import AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Xuhui/ToxDect-roberta-large")
model = AutoModelForSequenceClassification.from_pretrained("Xuhui/ToxDect-roberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

# Preprocessing function
def preprocess_function(examples):
    toxicity = examples.get("toxicity", [0.0])  # Get the list of toxicity scores, defaulting to [0.0]
    texts = examples.get("text", [""])  # Get the list of texts, defaulting to [""]
    
    tokenized_inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=512)
    
    tokenized_inputs["label"] = [1 if score > 0.7 else 0 for score in toxicity]
    
    return tokenized_inputs

# Load dataset
toxic = load_dataset("google/civil_comments")

tokenized_test_dataset = toxic["test"].map(preprocess_function, batched=True)
tokenized_test_dataset

README.md:   0%|          | 0.00/7.73k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/194M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/187M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1804874 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/97320 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/97320 [00:00<?, ? examples/s]

Map:   0%|          | 0/97320 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit', 'input_ids', 'attention_mask', 'label'],
    num_rows: 97320
})

In [50]:
features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

tokenized_test_dataset = tokenized_test_dataset.select(range(1000))


In [51]:

m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("Metrics:", m)

                                                           

Metrics: {'accuracy': 0.893, 'precision': 0.1889763779527559, 'recall': 0.8571428571428571, 'f1': 0.3096774193548387, 'true_positive': 24, 'true_negative': 869, 'false_positive': 103, 'false_negative': 4, 'AUROC': 0.9279467960023515, 'AUPRC': 0.4073655902400343, 'MCC': 0.3721728959667892, 'ECE': 0.13361598551273346, 'MCE': 0.7592238187789917, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,



In [52]:
with open('results/cc_toxdec_roberta.pkl', 'wb') as f:
    pickle.dump(m, f)

# Jigsaw

In [31]:
from datasets import load_dataset

ds = load_dataset("tasksource/jigsaw_toxicity")
ds["train"][0]

{'id': '0000997932d777bf',
 'comment_text': "Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [36]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    return tokenizer(examples["comment_text"], truncation=True, padding=True)

# Load dataset
ds = load_dataset("tasksource/jigsaw_toxicity")

tokenized_test_dataset = ds["train"].map(preprocess_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.rename_column("toxic", "label")

tokenized_test_dataset

Dataset({
    features: ['id', 'comment_text', 'label', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate', 'input_ids', 'attention_mask'],
    num_rows: 159571
})

In [37]:
features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

tokenized_test_dataset = tokenized_test_dataset.select(range(5000))

In [21]:
from transformers import AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained("inxoy/my_awesome_model")
fine_tuned_model_metrics = evaluate_model(fine_tuned_model, tokenized_test_dataset, data_collator)
print("Fine-Tuned Model Metrics:", fine_tuned_model_metrics)

Evaluating:   0%|          | 0/313 [00:00<?, ?it/s]

                                                             

Fine-Tuned Model Metrics: {'accuracy': 0.8936, 'precision': 0.4774494556765163, 'recall': 0.610337972166998, 'f1': 0.5357766143106457, 'true_positive': 307, 'true_negative': 4161, 'false_positive': 336, 'false_negative': 196, 'AUROC': 0.8937895862538798, 'AUPRC': 0.5077198397167065, 'MCC': 0.4812870395437213, 'ECE': 0.05787792056798935, 'MCE': 0.46033790707588196, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1



In [22]:
import pickle
with open('results/jsaw_distilbert_ft.pkl', 'wb') as f:
    pickle.dump(fine_tuned_model_metrics, f)

In [23]:
base_model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

base_model_metrics = evaluate_model(base_model, tokenized_test_dataset, data_collator)
print("Base Model Metrics:", base_model_metrics)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
                                                             

Base Model Metrics: {'accuracy': 0.1006, 'precision': 0.1006, 'recall': 1.0, 'f1': 0.1828093767036162, 'true_positive': 503, 'true_negative': 0, 'false_positive': 4497, 'false_negative': 0, 'AUROC': 0.6804885607414001, 'AUPRC': 0.16581263445304922, 'MCC': 0.0, 'ECE': 0.052762191742658615, 'MCE': 0.06231088563799858, 'raw_data': {'predictions': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,



In [24]:
import pickle
with open('results/jsaw_distilbert_base.pkl', 'wb') as f:
    pickle.dump(base_model_metrics, f)

In [25]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("tensor-trek/distilbert-toxicity-classifier")
m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("Metrics:", m)

                                                             

Metrics: {'accuracy': 0.9734, 'precision': 0.8517110266159695, 'recall': 0.8906560636182903, 'f1': 0.8707482993197279, 'true_positive': 448, 'true_negative': 4419, 'false_positive': 78, 'false_negative': 55, 'AUROC': 0.9922028867488863, 'AUPRC': 0.9417863801043299, 'MCC': 0.8561967754038164, 'ECE': 0.020522495731711388, 'MCE': 0.12580782175064087, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 



In [26]:
with open('results/jsaw_distilbert_trek.pkl', 'wb') as f:
    pickle.dump(m, f)

In [43]:
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Xuhui/ToxDect-roberta-large")
model = AutoModelForSequenceClassification.from_pretrained("Xuhui/ToxDect-roberta-large")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


from datasets import load_dataset

def preprocess_function(examples):
    return tokenizer(examples["comment_text"], truncation=True, padding=True)

ds = load_dataset("tasksource/jigsaw_toxicity")

tokenized_test_dataset = ds["train"].map(preprocess_function, batched=True)
tokenized_test_dataset = tokenized_test_dataset.rename_column("toxic", "label")

features_to_keep = ["input_ids", "attention_mask", "label"]
tokenized_test_dataset = tokenized_test_dataset.remove_columns(
    [col for col in tokenized_test_dataset.column_names if col not in features_to_keep]
)

tokenized_test_dataset = tokenized_test_dataset.select(range(1000))

In [44]:
m = evaluate_model(model, tokenized_test_dataset, data_collator)
print("Metrics:", m)

                                                           

Metrics: {'accuracy': 0.95, 'precision': 0.7477477477477478, 'recall': 0.7904761904761904, 'f1': 0.7685185185185186, 'true_positive': 83, 'true_negative': 867, 'false_positive': 28, 'false_negative': 22, 'AUROC': 0.9634051609470604, 'AUPRC': 0.8650144045951857, 'MCC': 0.7408751108419478, 'ECE': 0.03772960603237152, 'MCE': 0.6593652367591858, 'raw_data': {'predictions': [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 



In [45]:
with open('results/jsaw_toxdec_roberta.pkl', 'wb') as f:
    pickle.dump(m, f)