In [None]:
!pip install evaluate
!pip install 'accelerate>=0.26.0

In [None]:
from huggingface_hub import login
import os

# Take the Hugging Face access token from the environment so it is never
# written into the notebook; `token` is None when HF_TOKEN is not set.
token = os.environ.get("HF_TOKEN")
login(token=token)

In [None]:
# Account name; not referenced by the cells below.
# NOTE(review): presumably the Hugging Face user the models belong to — confirm.
hf_username = "inxoy"

# Checkpoint name given to `AutoTokenizer.from_pretrained` and `get_trainer`
# in every section of this notebook.
base_model = "distilbert-base-uncased"

# RealToxicityPrompts

In [None]:
# Dataset identifier handed to `load_dataset` for this section.
dataset = "allenai/real-toxicity-prompts"

# Name passed to `get_trainer` as `output_model` for this section's run.
output_model = "distilbert-rtp"

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer
from dataset_processing import preprocess_realtoxicityprompts, split_and_subset
from finetune import remove_unused_features

# Tokenizer for the base checkpoint; it is forwarded to the preprocessing
# function here and reused by the training cell.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the dataset, apply the RealToxicityPrompts preprocessing to its "train"
# split in batches, then pass the result through `remove_unused_features`
# (presumably dropping columns the trainer does not need — see finetune.py).
toxic = remove_unused_features(
    load_dataset(dataset)["train"].map(
        preprocess_realtoxicityprompts,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )
)

# Seeded split/subsample; `fraction` and `test_size` are forwarded unchanged
# (their exact meaning is defined by dataset_processing.split_and_subset).
subset_tokenized_ds = split_and_subset(
    toxic,
    seed=1337,
    fraction=1/6,
    test_size=0.2,
)

# Names consumed by the training cell.
tokenized_train_subset, tokenized_test_subset = (
    subset_tokenized_ds["train"],
    subset_tokenized_ds["test"],
)

In [None]:
from finetune import get_trainer 

# Build the trainer for the RealToxicityPrompts run from the checkpoint name,
# output name, tokenizer and the train/test subsets prepared above.
trainer = get_trainer(
    base_model=base_model,
    output_model=output_model,
    tokenizer=tokenizer,
    train=tokenized_train_subset,
    test=tokenized_test_subset,
)

# Left as the cell's final expression so the notebook shows whatever
# `train()` returns.
trainer.train()

# Jigsaw

In [None]:
# Dataset identifier handed to `load_dataset` for the Jigsaw section.
dataset = "tasksource/jigsaw_toxicity"

# Name passed to `get_trainer` as `output_model` for the Jigsaw run.
output_model = "distilbert-jsaw"

In [None]:
from dataset_processing import preprocess_jigsawtoxicity

# Re-create the tokenizer for the base checkpoint so this section does not
# depend on the tokenizer object built in an earlier section.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load only the "train" split, run the Jigsaw preprocessing over it in
# batches, then hand the result to `remove_unused_features`
# (presumably dropping columns the trainer does not need — see finetune.py).
toxic = remove_unused_features(
    load_dataset(dataset, split="train").map(
        preprocess_jigsawtoxicity,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )
)

# Same seed, fraction and test size as the other 1/6-subsampled sections.
subset_tokenized_ds = split_and_subset(
    toxic,
    seed=1337,
    fraction=1/6,
    test_size=0.2,
)

# Names consumed by the training cell.
tokenized_train_subset, tokenized_test_subset = (
    subset_tokenized_ds["train"],
    subset_tokenized_ds["test"],
)

In [None]:
from finetune import get_trainer 

# Build the trainer for the Jigsaw run; every argument comes from the
# notebook-level names set by the preceding cells of this section.
trainer = get_trainer(
    train=tokenized_train_subset,
    test=tokenized_test_subset,
    tokenizer=tokenizer,
    base_model=base_model,
    output_model=output_model,
)

# Final expression of the cell, so the notebook displays the value returned
# by `train()`.
trainer.train()

# Civil Comments

In [None]:
# Dataset identifier handed to `load_dataset` for the Civil Comments section.
dataset = "google/civil_comments"

# Name passed to `get_trainer` as `output_model` for the Civil Comments run.
output_model = "distilbert-cc"

In [None]:
from dataset_processing import preprocess_civilcomments

# Fresh tokenizer for the base checkpoint, shared by preprocessing and by the
# training cell of this section.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# "train" split -> batched Civil Comments preprocessing ->
# `remove_unused_features` (presumably a column clean-up; see finetune.py).
toxic = remove_unused_features(
    load_dataset(dataset, split="train").map(
        preprocess_civilcomments,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )
)

# Seeded split/subsample with the same settings as the other 1/6 sections.
subset_tokenized_ds = split_and_subset(
    toxic,
    seed=1337,
    fraction=1/6,
    test_size=0.2,
)

# Names consumed by the training cell.
tokenized_train_subset, tokenized_test_subset = (
    subset_tokenized_ds["train"],
    subset_tokenized_ds["test"],
)

In [None]:
from finetune import get_trainer 

# Build the trainer for the Civil Comments run from this section's settings
# and the tokenized train/test subsets.
trainer = get_trainer(
    tokenizer=tokenizer,
    base_model=base_model,
    output_model=output_model,
    train=tokenized_train_subset,
    test=tokenized_test_subset,
)

# Kept as the last expression so the notebook renders the result of `train()`.
trainer.train()

# Toxic Chat

In [None]:
# Dataset identifier handed to `load_dataset` for the Toxic Chat section.
dataset = "lmsys/toxic-chat"

# Name passed to `get_trainer` as `output_model` for the Toxic Chat run.
output_model = "distilbert-tc"

In [None]:
from dataset_processing import preprocess_toxicchat

# Fresh tokenizer for the base checkpoint, shared by preprocessing and by the
# training cell of this section.
tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the "toxicchat0124" configuration, run the Toxic Chat preprocessing
# over its "train" split in batches, then apply `remove_unused_features`
# (presumably a column clean-up; see finetune.py).
toxic = remove_unused_features(
    load_dataset(dataset, "toxicchat0124")["train"].map(
        preprocess_toxicchat,
        batched=True,
        fn_kwargs={"tokenizer": tokenizer},
    )
)

# Unlike the other sections, `fraction=1` is passed here instead of 1/6.
subset_tokenized_ds = split_and_subset(
    toxic,
    seed=1337,
    fraction=1,
    test_size=0.2,
)

# Names consumed by the training cell.
tokenized_train_subset, tokenized_test_subset = (
    subset_tokenized_ds["train"],
    subset_tokenized_ds["test"],
)

In [None]:
from finetune import get_trainer 

# Build the trainer for the Toxic Chat run; all inputs are the notebook-level
# names assigned by the two preceding cells.
trainer = get_trainer(
    output_model=output_model,
    base_model=base_model,
    tokenizer=tokenizer,
    train=tokenized_train_subset,
    test=tokenized_test_subset,
)

# Last expression of the cell: the notebook shows what `train()` returns.
trainer.train()