# Generate Mixed Dataset

In [None]:
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
import numpy as np
from dataset_processing import datasets_config, sample_dataset
from transformers import AutoTokenizer

# Build the mixed dataset: ToxicChat keeps its own train/test split, while every
# other configured dataset is tokenized, sampled, pooled and then split in half.

TOXICCHAT_NAME = "lmsys/toxic-chat"  # config entry that gets the special-case handling
AUX_SAMPLE_SIZE = 3333  # passed as sample_size to sample_dataset for each other dataset
AUX_TEST_FRACTION = 0.5  # share of the pooled other datasets that goes to the test split
AUX_SPLIT_SEED = 1337  # fixed seed so the pooled split is reproducible

toxicchat_train = None
toxicchat_test = None
other_datasets = []
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
required_columns = ["input_ids", "attention_mask", "label"]


def _tokenize_and_prune(dataset, preprocess_function):
    """Apply *preprocess_function* in batches and keep only ``required_columns``.

    Args:
        dataset: A split exposing ``map``, ``column_names`` and ``remove_columns``.
        preprocess_function: Batched callable taken from the dataset's config;
            it receives the shared ``tokenizer`` as a keyword argument.

    Returns:
        The processed split with every column outside ``required_columns`` removed.
    """
    processed = dataset.map(
        preprocess_function,
        fn_kwargs={"tokenizer": tokenizer},
        batched=True,
    )
    # Column names are read after mapping so that columns introduced by the
    # preprocessing step are also considered for removal.
    extra_columns = [col for col in processed.column_names if col not in required_columns]
    return processed.remove_columns(extra_columns)


for config in datasets_config:
    print(f"Processing {config['name']}...")

    if config["name"] == TOXICCHAT_NAME:
        # Load the specific subset of ToxicChat and keep its train/test splits apart.
        raw_dataset = load_dataset(config["name"], config["subset"])
        toxicchat_train = _tokenize_and_prune(raw_dataset["train"], config["preprocess_function"])
        toxicchat_test = _tokenize_and_prune(raw_dataset["test"], config["preprocess_function"])
    else:
        raw_dataset = load_dataset(config["name"], split=config["split"])
        tokenized_dataset = _tokenize_and_prune(raw_dataset, config["preprocess_function"])
        sampled_dataset = sample_dataset(tokenized_dataset, sample_size=AUX_SAMPLE_SIZE)
        other_datasets.append(sampled_dataset)

# Fail early with an actionable message instead of an opaque error further down
# when the configuration is missing one of the two ingredients.
if toxicchat_train is None or toxicchat_test is None:
    raise ValueError(f"datasets_config contains no entry named {TOXICCHAT_NAME!r}")
if not other_datasets:
    raise ValueError(f"datasets_config contains no datasets besides {TOXICCHAT_NAME!r}")

combined_other_datasets = concatenate_datasets(other_datasets)
split_other_datasets = combined_other_datasets.train_test_split(
    test_size=AUX_TEST_FRACTION,
    seed=AUX_SPLIT_SEED,
)

final_train_dataset = concatenate_datasets([toxicchat_train, split_other_datasets["train"]])
final_test_dataset = concatenate_datasets([split_other_datasets["test"], toxicchat_test])

print(f"ToxicChat Train Dataset: {len(toxicchat_train)} examples")
print(f"ToxicChat Test Dataset: {len(toxicchat_test)} examples")
print(f"Final Train Dataset: {len(final_train_dataset)} examples")
print(f"Final Test Dataset: {len(final_test_dataset)} examples")

In [None]:
from datasets import DatasetDict

# Bundle both splits under the "train" / "test" keys so they travel together.
mixed_splits = {"train": final_train_dataset, "test": final_test_dataset}
final_dataset = DatasetDict(mixed_splits)

# Save to Hugging Face (uploaded with private=True).
final_dataset.push_to_hub("inxoy/toxicbench-mixed", private=True)

# Mixed-finetuning

In [None]:
# Run-level settings for the fine-tuning step.
hf_username: str = "inxoy"  # not referenced by the visible cells; kept for later use
base_model: str = "distilbert-base-uncased"  # handed to get_trainer as base_model
output_model: str = "distilbert-mixed"  # handed to get_trainer as output_model

In [None]:
from finetune import get_trainer 

# Gather the trainer arguments in one place, then build the trainer from them.
trainer_kwargs = {
    "base_model": base_model,
    "output_model": output_model,
    "tokenizer": tokenizer,
    "train": final_train_dataset,
    "test": final_test_dataset,
}
trainer = get_trainer(**trainer_kwargs)

# Original author's note: this pushes to the hub once, when training is done.
trainer.train()