In [1]:
from datasets import DatasetDict, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    T5Tokenizer,
    T5ForConditionalGeneration,
    T5Config,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
    pipeline,
)
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
import numpy as np
import time
import gc
import GPUtil
import evaluate
from numba import cuda
import wandb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
import wandb
import os
import pickle
import optuna
from typing import Dict, Union, Optional, Tuple, List, Any
import pandas as pd

2023-11-10 10:33:56.669582: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-10 10:33:56.669654: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-10 10:33:56.669683: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
wandb.login()
os.environ["WAND_NOTEBOOK_NAME"] = "w266_final_project_models"
os.environ["WANDB_DIR"] = "../models/wandb"
os.environ["WANDB_PROJECT"] = "w266_final_project"

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgarykong[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
# Random seed for reproducibility
RANDOM_SEED = 42

# Parameters for classification
BATCH_SIZE_EVAL = 32
BATCH_SIZE_TRAIN = 32

# Default parameters for T5 model fine-tuning
PER_DEVICE_TRAIN_BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH_SIZE = 128
LEARNING_RATE = 3e-4
NUM_TRAIN_EPOCHS = 20
EARLY_STOPPING_PATIENCE = 2
NUM_BEAMS = 4

# Setting the DEVICE to cuda
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set path for profane word list
PROFANE_WORD_PATH = "../data/raw/en.txt"

# Set path for raw dataset dictionary
RAW_DATASET_PATH = "../data/processed/raw_dataset.pkl"
AUG_DATASET_ALL_FILTERS_PATH = "../data/processed/aug_datasets_all_filters"
AUG_DATASET_NO_TOXICITY_FILTER_PATH = "../data/processed/aug_datasets_no_toxicity_filter"
AUG_DATASET_NO_SIMILARITY_FILTER_PATH = "../data/processed/aug_datasets_no_similarity_filter"
AUG_DATASET_NO_ACCEPTABILITY_FILTER_PATH = "../data/processed/aug_datasets_no_acceptability_filter"

# Set maximum length for input and output
MAX_INPUT_LENGTH = 64
MAX_OUTPUT_LENGTH = 64

In [4]:
# Load tokenizers and models
tokenizer_t5_base = T5Tokenizer.from_pretrained("t5-base")
model_t5_base = T5ForConditionalGeneration.from_pretrained("t5-base").to(DEVICE)
tokenizer_t5_small = T5Tokenizer.from_pretrained("t5-small")
model_t5_small = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)
tokenizer_toxicity = RobertaTokenizer.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier")
model_toxicity = RobertaForSequenceClassification.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier").to(DEVICE)
tokenizer_acceptability = AutoTokenizer.from_pretrained("iproskurina/tda-bert-en-cola")
model_acceptability = AutoModelForSequenceClassification.from_pretrained("iproskurina/tda-bert-en-cola").to(DEVICE)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenc

In [18]:
# Load datasets
raw_datasets = DatasetDict.load_from_disk(RAW_DATASET_PATH)
aug_datasets_all_filters = DatasetDict.load_from_disk(AUG_DATASET_ALL_FILTERS_PATH)
aug_datasets_no_acceptability_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_ACCEPTABILITY_FILTER_PATH)
aug_datasets_no_similarity_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_SIMILARITY_FILTER_PATH)
aug_datasets_no_toxicity_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_TOXICITY_FILTER_PATH)

## Debugging Functions

In [19]:
def measure_time(func, *args, **kwargs):
    """
    Calculates the time it takes to run a function.
    """
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function {func.__name__} took {elapsed_time:.2f} seconds to run.")
    return result

def get_gpu_memory():
    """
    Gets the GPU memory information.
    """
    gpus = GPUtil.getGPUs()
    gpu = gpus[0]
    print(f"Total GPU memory: {gpu.memoryTotal}MB")
    print(f"Free GPU memory: {gpu.memoryFree}MB")
    print(f"Used GPU memory: {gpu.memoryUsed}MB")

def force_clear_GPU_memory():
    """
    Force clears the GPU memory.
    """
    cuda.select_device(0)
    cuda.close()

def cleanup():
    """
    Cleans up the GPU memory.
    """
    gc.collect()
    torch.cuda.empty_cache()

## Evaluation Metrics

In [20]:
# Initialize model variables
model_bleurt = None
model_bertscore = None
model_sacrebleu = None

def calc_sacrebleu(refs, preds):
    """
    Calculates the SacreBLEU score.

    Args:
        refs (list): List of reference sentences
        preds (list): List of predicted sentences
    
    Returns:
        results (float): SacreBLEU score
    """
    global model_sacrebleu

    if model_sacrebleu is None:
        model_sacrebleu = evaluate.load("sacrebleu")

    results = model_sacrebleu.compute(predictions=preds, references=refs)["score"]
    results = results/100

    return results

def calc_bert_score(
    refs, preds, model_type="microsoft/deberta-large-mnli", output_mean=True
    ):
    """
    Calculates BERT score per line. Note: https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit#gid=0 lists the best performing models
    Args:
        refs (list): List of reference sentences.
        y_pred (list): List of predicted sentences.
        model_type (str): Type of BERT model to use.
        output_mean (bool): Whether to output the mean of the scores.

    Returns:
        list of precision, recall, f1 scores.

    """
    global model_bertscore

    if model_bertscore is None:
        model_bertscore = evaluate.load("bertscore")
        
    results = model_bertscore.compute(predictions=preds, references=refs, model_type=model_type)
    precision = np.array(results["precision"])
    recall = np.array(results["recall"])
    f1 = np.array(results["f1"])
    
    if output_mean:
        precision = precision.mean()
        recall = recall.mean()
        f1 = f1.mean()

    return precision, recall, f1

def calc_bleurt(refs, preds, checkpoint="BLEURT-20_D12", output_mean = True):
    """
    Calculates BLEURT score per line.

    Args:
        refs (list): List of reference sentences.
        preds (list): List of predicted sentences.
        output_type (str): Type of output to return. Either 'numpy' or 'list'.

    Returns:
        list/array of BLEURT scores.
    """
    global model_bleurt

    if model_bleurt is None:
        model_bleurt = evaluate.load("bleurt", module_type="metric", checkpoint=checkpoint)

    results = np.array(model_bleurt.compute(predictions=preds, references=refs)["scores"])

    if output_mean:
        results = results.mean()

    return results

def calc_tox_acceptability(
    data,
    tokenizer,
    model,
    output_score=True,
    output_mean=True):
    """
    Calculates toxicity and acceptability scores for a given dataset.

    Args:
        data = list of strings to be evaluated
        tokenizer = tokenizer for the model
        model = model to be used for evaluation
        output_score = whether to output the score or the label
        output_mean = whether to output the mean of the scores or the scores for each sentence
    
    Returns:
        array of toxicity and acceptability scores.
    """  
    inputs = tokenizer(data, return_tensors="pt", padding=True).to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs)["logits"]
        if output_score:
            result = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        else:
            result = logits.argmax(1).data
        result = result.cpu().numpy()

    if output_mean:
        result = result.mean()
        
    return result

def evaluate_metrics(
    refs,
    preds,
    tokenizer_toxicity=tokenizer_toxicity,
    model_toxicity=model_toxicity,
    tokenizer_acceptability=tokenizer_acceptability,
    model_acceptability=model_acceptability,
    to_neutral=True,
    weights={
        "BLEU": 0.2,
        "STA": 0.4,
        "Acceptability": 0.2,
        "BERT_Score": 0.2
    },
    include_bleurt=False
):
    """
    Calculates and returns a dictionary of evaluation metrics

    Args:
        refs (list): list of strings (reference)
        preds (list): list of strings (predictions)
        tokenizer_toxicity (tokenizer): tokenizer for toxicity model
        model_toxicity (model): toxicity model
        tokenizer_acceptability (tokenizer): tokenizer for acceptability model
        model_acceptability (model): acceptability model
        to_neutral (bool): whether the goal is to transfer to neutral (True) or to toxic (False)
        weights (dict): dictionary of weights for each metric
        include_bleurt (bool): whether to include BLEURT score in the output

    Returns:
        results (dict): dictionary of evaluation metrics
    """

    # Calculate BLEU score
    bleu = calc_sacrebleu(refs, preds)

    # Calculate toxicity classification
    # tox_ref = calc_tox_acceptability(refs, tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False)
    tox_pred = calc_tox_acceptability(preds, tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False)

    # Calculate style transfer accuracy as proportion of sentences that were correctly classified (as non-toxic / toxic)
    if to_neutral:
        sta_correct_label = 0
    else:
        sta_correct_label = 1

    # sta_ref = (tox_ref == sta_correct_label).sum() / len(tox_ref)
    sta_pred = (tox_pred == sta_correct_label).sum() / len(tox_pred)
    # sta_pct = sta_pred / sta_ref

    # Calculate acceptability scores
    # acc_ref = calc_tox_acceptability(refs, tokenizer_acceptability, model_acceptability)
    acc_pred = calc_tox_acceptability(preds, tokenizer_acceptability, model_acceptability)
    # acc_pct = acc_pred / acc_ref

    # Calculate similarity score
    bert_score_f1 = calc_bert_score(refs, preds, model_type="distilbert-base-uncased")[2]

    # Calculate BLEURT score if include_bleurt is True
    bleurt = None
    if include_bleurt:
        bleurt = calc_bleurt(refs, preds)

    # Calculate composite score
    composite_score = weights["BLEU"] * bleu + weights["STA"] * sta_pred + weights["Acceptability"] * acc_pred + weights["BERT_Score"] * bert_score_f1

    # Return a dictionary of metrics
    results = {
        "BLEU": bleu,
        "STA": sta_pred,
        # "STA_pct": sta_pct,
        "FLU": acc_pred,
        # "Acceptability_pct": acc_pct,
        "SEM": bert_score_f1,
        "Overall": composite_score,
    }
    if include_bleurt:
        results["BLEURT"] = bleurt
        
    return results


# Baseline Models

In [8]:
def baseline_detoxifier(text_list, profane_word_path=PROFANE_WORD_PATH):
    """
    Returns a detoxified version of the text by replacing toxic terms with blanks

    Args:
        text_list (list): list of strings to be detoxified
        toxic_list (list): list of toxic terms to be removed from text_list

    Returns:
        detoxified_text_list (list): list of detoxified strings
    """
    # Load list of profane words
    profane_words = []
    with open(profane_word_path, "r") as f:
        for line in f:
            profane_words.append(line.strip())

    # Detoxify text
    y_pred_delete = []
    for text in text_list:
        for term in profane_words:
            text = text.replace(term, "")
        y_pred_delete.append(text)

    return y_pred_delete

def bart_detoxifier(text_list):
    """
    Returns a detoxified version of the text using BART

    Args:
        text_list (list): list of strings to be detoxified

    Returns:
        detoxified_text_list (list): list of detoxified strings
    """
    # Create predictions using BART
    pipe_bart = pipeline("text2text-generation", model="s-nlp/bart-base-detox", device=DEVICE)

    # Create predictions using BART and show progress using tqdm
    y_pred_bart = pipe_bart(text_list, max_length=MAX_OUTPUT_LENGTH, truncation=True)

    # Convert to list of strings
    y_pred_bart = [x["generated_text"] for x in y_pred_bart]
    
    return y_pred_bart

In [9]:
# Evaluate DELETE model on validation set
delete_preds_val = baseline_detoxifier(raw_datasets["validation"]['source'])
delete_val_metrics = evaluate_metrics(raw_datasets["validation"]['target'], delete_preds_val)
delete_val_metrics

NameError: name 'y_pred_delete' is not defined

In [None]:
# Evaluate BART model on validation set
bart_preds_val = bart_detoxifier(raw_datasets["validation"]['source'])
bart_val_metrics = evaluate_metrics(raw_datasets["validation"]['target'], bart_preds_val)
bart_val_metrics

{'BLEU': 0.7015951162845684,
 'STA_preds': 0.9178541492036881,
 'Acceptability_preds': 0.71802455,
 'BERT_score_f1': 0.9451393306205379,
 'Overall': 0.8400934594361843}

# Helper Functions to Fine-tune T5 Models

In [21]:
def add_prefix(datasetdict, prefix="to_neutral: "):
    """Adds a prefix to the source sequence in the dataset."""
    datasetdict_copy = datasetdict.copy()
    datasetdict_copy["train"] = datasetdict_copy["train"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy["validation"] = datasetdict_copy["validation"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy["test"] = datasetdict_copy["test"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy = DatasetDict(datasetdict_copy)
    return datasetdict_copy

def preprocess_function(examples, tokenizer):
    """Preprocess function for T5."""
    model_inputs = tokenizer(
        examples["source"],
        text_target=examples["target"],
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    return model_inputs

def post_process(preds, refs, tokenizer):
    """
    Post-process function for T5.

    Args:
        preds (list): list of predicted sequences
        refs (list): list of reference sequences
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding

    Returns:
        decoded_preds (list): list of decoded predicted sequences
        decoded_refs (list): list of decoded reference sequences
    """
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    refs = np.where(refs != -100, refs, tokenizer.pad_token_id)
    decoded_refs = tokenizer.batch_decode(refs, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_refs = [ref.strip() for ref in decoded_refs]

    return decoded_preds, decoded_refs

def compute_metrics(eval_preds, tokenizer):
    """
    Function to calculate the metrics for trainer.evaluate().

    Args:
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding the predictions
        eval_preds (tuple): Tuple containing the predictions and references

    Returns:
        dict: Dictionary containing the metrics
    """
    preds, refs = eval_preds

    # Post-process the predictions and references
    decoded_preds, decoded_refs = post_process(preds, refs, tokenizer)
    
    # Evaluate metrics
    return evaluate_metrics(
        decoded_refs,
        decoded_preds,
        tokenizer_toxicity=tokenizer_toxicity,
        model_toxicity=model_toxicity,
        tokenizer_acceptability=tokenizer_acceptability,
        model_acceptability=model_acceptability,
        include_bleurt=False
    )

def setup_trainer(output_dir_name,
                train_dataset,
                eval_dataset,
                model_checkpoint="t5-small",
                per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
                per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                num_train_epochs=NUM_TRAIN_EPOCHS,
                max_length=MAX_OUTPUT_LENGTH,
                num_beams=NUM_BEAMS,
                early_stopping_patience=EARLY_STOPPING_PATIENCE,
                report_to="wandb",
                ):
    """
    Set up a Seq2SeqTrainer object for training a T5 model.

    Default parameters based on this: https://github.com/google-research/text-to-text-transfer-transformer/blob/main/t5/models/hf_model.py#L55

    Args:
        output_dir_name (str): What to name the model in the output directory.
        model_checkpoint (str): Name of the pre-trained model to use.
        train_dataset (Dataset): Training dataset.
        eval_dataset (Dataset): Validation/test dataset.
        per_device_train_batch_size (int): Batch size for training.
        per_device_eval_batch_size (int): Batch size for evaluation.
        learning_rate (float): Learning rate for optimizer.
        weight_decay (float): Weight decay for optimizer.
        num_train_epochs (int): Number of training epochs.
        max_length (int): Maximum length of generated sequences.
        num_beams (int): Number of beams for beam search.
        compute_metrics (function): Function to compute evaluation metrics.
        callbacks (list): List of callbacks to use.
        trainer_class (Seq2SeqTrainer): Trainer class to use.

    Returns:
        Seq2SeqTrainer: Trainer object for training the T5 model.
    """
    
    # Instantiate model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

    # Define the data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, model, return_tensors="pt", padding=True)

    # Define generation config
    generation_config = GenerationConfig(
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        eos_token_id=model.config.eos_token_id,
        bos_token_id=model.config.bos_token_id,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.pad_token_id
        )

    # Save the generation config
    gen_config_path = f"../models/{output_dir_name}/generation_config"
    generation_config.save_pretrained(gen_config_path)

    # Define the training arguments
    args = Seq2SeqTrainingArguments(
        output_dir=f'../models/{output_dir_name}',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate, 
        predict_with_generate=True,
        generation_config=gen_config_path,
        fp16=True,
        report_to=report_to,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="Overall",
        greater_is_better=True,
        generation_max_length=max_length,
    )

    # Create a partial function with the tokenizer argument included
    compute_metrics_with_tokenizer = partial(compute_metrics, tokenizer=tokenizer)
    
    # Instantiate the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics_with_tokenizer,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]

    )

    return trainer

def create_bidirectional_dataset(datasets, shuffle=True):
    """
    Creates a bi-directional dataset from the original dataset.

    Args:
        datasets (DatasetDict): DatasetDict object containing the original dataset.
        shuffle (bool): Whether to shuffle the dataset or not.
    
    Returns:
        extended_datasets (DatasetDict): DatasetDict object containing the bi-directional dataset.
    """

    def bidirectional_extension(dataset):
        new_data = {
            "source": [],
            "target": []
        }
        for src, tgt in zip(dataset['source'], dataset['target']):
            new_data['source'].extend([f'to_neutral: {src}', f'to_toxic: {tgt}'])
            new_data['target'].extend([tgt, src])
        return new_data

    extended_train_data = bidirectional_extension(datasets["train"])
    extended_validation_data = bidirectional_extension(datasets["validation"])
    extended_test_data = bidirectional_extension(datasets["test"])

    extended_datasets = DatasetDict({
        "train": Dataset.from_dict(extended_train_data),
        "validation": Dataset.from_dict(extended_validation_data),
        "test": Dataset.from_dict(extended_test_data)
    })

    if shuffle:
        extended_datasets["train"] = extended_datasets["train"].shuffle(seed=RANDOM_SEED)
        
    return extended_datasets

def compute_metrics_bd(eval_preds, tokenizer, bd_dataset, shuffled_data=False):
    """
    Function to calculate the metrics for trainer.evaluate().
    This function is for the bi-directional model.
    
    Args:
        eval_preds (tuple): Tuple containing the predictions and references
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding the predictions
        shuffled_data (bool): Whether the data is shuffled or not
        bd_dataset (DatasetDict): Bidirectional dataset to use for testing created using create_bidirectional_datasets
                                  For example, raw_datasets_bd["validation"] or raw_datasets_bd["test"]

    Returns:
        dict: Dictionary containing the metrics
    """
    preds, refs = eval_preds

    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    refs = np.where(refs != -100, refs, tokenizer.pad_token_id)
    decoded_refs = tokenizer.batch_decode(refs, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_refs = [ref.strip() for ref in decoded_refs]
    
    # If shuffled data is false, have to_neutral_preds and to_neutral_refs just be predictions and refs with even indices
    if not shuffled_data:
        to_neutral_preds = decoded_preds[::2]
        to_neutral_refs = decoded_refs[::2]
    # Otherwise, get the indices to use when splitting predictions and refs to to_neutral and to_toxic
    else:
        # Get the indices to use when splitting predictions and refs to to_neutral and to_toxic
        to_neutral_idx = [i for i, input_sentence in enumerate(bd_dataset['source']) if input_sentence.startswith("to_neutral")]

        # Retrieve based on the indices
        to_neutral_preds = [decoded_preds[i] for i in to_neutral_idx]
        to_neutral_refs = [decoded_refs[i] for i in to_neutral_idx]
    
    # Evaluate metrics for to_neutral
    to_neutral_metrics = evaluate_metrics(
        to_neutral_refs,
        to_neutral_preds,
        to_neutral=True
    )

    # Return dictionary of to_neutral metrics
    return to_neutral_metrics

# Fine-tune T5 (Unidirectional)

In [10]:
prefixed_datasets = add_prefix(raw_datasets)

tokenized_datasets_t5_small = prefixed_datasets.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small = setup_trainer(
    output_dir_name="t5-small-detoxify",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_t5_small["train"],
    eval_dataset=tokenized_datasets_t5_small["validation"],
)

wandb.init(project="w266_final_project", name="t5-small-detoxify")
trainer_t5_small.train()
wandb.finish()

print(f"Best model checkpoint: {trainer_t5_small.state.best_model_checkpoint}")

NameError: name 'add_prefix' is not defined

# Fine-tune T5 Model (Bi-directional, No custom loss)

### Trial without shuffled data

In [103]:
raw_datasets_bd_noshuffle = create_bidirectional_dataset(raw_datasets, shuffle=False)

tokenized_datasets_bd_noshuffle_t5_small = raw_datasets_bd_noshuffle.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_bd_noshuffle = setup_trainer(
    output_dir_name="t5-small-detoxify-bd-noshuffle",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_bd_noshuffle_t5_small["train"],
    eval_dataset=tokenized_datasets_bd_noshuffle_t5_small["validation"],
    compute_metrics=partial(compute_metrics_bd, bd_dataset=raw_datasets_bd_noshuffle["validation"], shuffled_data=False)
    )

wandb.init(project="w266_final_project", name="t5-small-bd-noshuffle-detoxify")
trainer_t5_small_bd_noshuffle.train()
wandb.finish()

print(f"Best checkpoint path: {trainer_t5_small_bd_noshuffle.state.best_model_checkpoint}")

Map:   0%|          | 0/21466 [00:00<?, ? examples/s]

Map:   0%|          | 0/2386 [00:00<?, ? examples/s]

Map:   0%|          | 0/1342 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Sta Preds,Acceptability Preds,Bert Score F1,Overall
1,1.4877,1.235429,0.589117,0.784577,0.678174,0.923197,0.751928
2,1.2571,1.167136,0.599725,0.836547,0.69129,0.925403,0.777902
3,1.1718,1.151006,0.600607,0.860855,0.689214,0.92596,0.787498
4,1.1104,1.143344,0.605002,0.892707,0.69252,0.926318,0.801851
5,1.0634,1.128787,0.604396,0.903604,0.704711,0.926202,0.808504
6,1.0244,1.140509,0.602471,0.898575,0.700976,0.925787,0.805277
7,0.9896,1.132926,0.601382,0.909472,0.705822,0.925794,0.810388
8,0.9564,1.134517,0.60077,0.900251,0.702148,0.925799,0.805844
9,0.9305,1.137347,0.604801,0.90109,0.699633,0.926039,0.806531




0,1
eval/Acceptability_preds,▁▄▄▅█▇█▇▆
eval/BERT_score_f1,▁▆▇██▇▇▇▇
eval/BLEU,▁▆▆██▇▆▆█
eval/Overall,▁▄▅▇█▇█▇█
eval/STA_preds,▁▄▅▇█▇█▇█
eval/loss,█▄▂▂▁▂▁▁▂
eval/runtime,▇█▄▇▆▁▆▃▃
eval/samples_per_second,▂▁▄▂▃█▃▆▆
eval/steps_per_second,▂▁▄▂▃█▃▆▆
train/epoch,▁▁▂▂▃▃▄▄▅▅▅▅▆▆▇▇███

0,1
eval/Acceptability_preds,0.69963
eval/BERT_score_f1,0.92604
eval/BLEU,0.6048
eval/Overall,0.80653
eval/STA_preds,0.90109
eval/loss,1.13735
eval/runtime,86.6586
eval/samples_per_second,27.533
eval/steps_per_second,0.219
train/epoch,9.0


### Trial with shuffled data

In [102]:
raw_datasets_bd_shuffle = create_bidirectional_dataset(raw_datasets, shuffle=True)

tokenized_datasets_bd_shuffle_t5_small = raw_datasets_bd_shuffle.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_bd_shuffle = setup_trainer(
    output_dir_name="t5-small-detoxify-bd-shuffle",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_bd_shuffle_t5_small["train"],
    eval_dataset=tokenized_datasets_bd_shuffle_t5_small["validation"],
    compute_metrics=partial(compute_metrics_bd, bd_dataset=raw_datasets_bd_shuffle["validation"], shuffled_data=True)
    )

wandb.init(project="w266_final_project", name="t5-small-bd-shuffle-detoxify")
trainer_t5_small_bd_shuffle.train()
wandb.finish()

print(f"Best checkpoint path: {trainer_t5_small_bd_shuffle.state.best_model_checkpoint}")

Map:   0%|          | 0/21466 [00:00<?, ? examples/s]

Map:   0%|          | 0/2386 [00:00<?, ? examples/s]

Map:   0%|          | 0/1342 [00:00<?, ? examples/s]



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112585377779599, max=1.0…

Epoch,Training Loss,Validation Loss,Bleu,Sta Preds,Acceptability Preds,Bert Score F1,Overall
1,1.4953,1.230033,0.596087,0.780386,0.690938,0.922194,0.753998
2,1.2579,1.177033,0.598875,0.854987,0.690168,0.923962,0.784596
3,1.1712,1.142365,0.599957,0.891869,0.720199,0.923604,0.8055
4,1.1105,1.137581,0.606112,0.881811,0.699919,0.925512,0.799033
5,1.0613,1.134472,0.603866,0.89606,0.701756,0.925853,0.804719




0,1
eval/Acceptability_preds,▁▁█▃▄
eval/BERT_score_f1,▁▄▄▇█
eval/BLEU,▁▃▄█▆
eval/Overall,▁▅█▇█
eval/STA_preds,▁▆█▇█
eval/loss,█▄▂▁▁
eval/runtime,█▂▆▅▁
eval/samples_per_second,▁▇▃▄█
eval/steps_per_second,▁▆▃▄█
train/epoch,▁▁▃▃▅▅▆▆███

0,1
eval/Acceptability_preds,0.70176
eval/BERT_score_f1,0.92585
eval/BLEU,0.60387
eval/Overall,0.80472
eval/STA_preds,0.89606
eval/loss,1.13447
eval/runtime,86.1316
eval/samples_per_second,27.702
eval/steps_per_second,0.221
train/epoch,5.0


# Fine-tune T5 Model (Data Augmentation)

## All filters

In [23]:
prefixed_datasets_all_filters = add_prefix(aug_datasets_all_filters)

tokenized_datasets_all_filters_t5_small = prefixed_datasets_all_filters.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_aug = setup_trainer(
    output_dir_name="t5-small-detoxify-aug-all-filters",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_all_filters_t5_small["train"],
    eval_dataset=tokenized_datasets_all_filters_t5_small["validation"],
)

wandb.init(project="w266_final_project", name="t5-small-detoxify-aug-all-filters")
trainer_t5_small_aug.train()
wandb.finish()

# Print path to the best checkpoint
print(trainer_t5_small_aug.state.best_model_checkpoint)





VBox(children=(Label(value='0.002 MB of 0.026 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.072580…

0,1
train/epoch,▁
train/global_step,▁
train/learning_rate,▁
train/loss,▁

0,1
train/epoch,1.0
train/global_step,324.0
train/learning_rate,0.00028
train/loss,1.0328


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112927599990345, max=1.0…

Epoch,Training Loss,Validation Loss,Bleu,Sta,Flu,Sem,Overall
1,1.0328,0.949773,0.590953,0.870075,0.695258,0.924241,0.790121
2,0.8784,0.931161,0.593519,0.867561,0.702935,0.925273,0.79137
3,0.8034,0.93569,0.596343,0.899413,0.70491,0.925343,0.805084
4,0.7445,0.935064,0.595158,0.897737,0.710406,0.925786,0.805365
5,0.6962,0.937661,0.587387,0.902766,0.710375,0.925517,0.805762
6,0.6562,0.950577,0.585614,0.907795,0.705161,0.924991,0.806271
7,0.622,0.959308,0.593118,0.911148,0.715915,0.924456,0.811157
8,0.5907,0.976965,0.590641,0.912825,0.708718,0.925663,0.810134
9,0.5637,0.984873,0.587993,0.918692,0.715805,0.92541,0.813319
10,0.541,0.994545,0.586801,0.908634,0.715339,0.924616,0.808805




VBox(children=(Label(value='0.002 MB of 0.027 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.072055…

0,1
eval/BLEU,▄▆█▇▂▁▆▄▃▂▃
eval/FLU,▁▄▄▆▆▄█▆██▇
eval/Overall,▁▁▆▆▆▆▇▇█▇▆
eval/SEM,▁▆▆█▇▄▂▇▆▃▂
eval/STA,▁▁▅▅▆▇▇▇█▇▇
eval/loss,▃▁▁▁▂▃▄▅▆▇█
eval/runtime,▅▅▄▁▇▆▅▃█▃▅
eval/samples_per_second,▄▄▅█▂▃▄▆▁▆▄
eval/steps_per_second,▄▄▅█▂▃▄▆▁▆▄
train/epoch,▁▁▂▂▂▂▃▃▄▄▅▅▅▅▆▆▇▇▇▇███

0,1
eval/BLEU,0.58829
eval/FLU,0.71272
eval/Overall,0.80822
eval/SEM,0.92449
eval/STA,0.9078
eval/loss,1.0067
eval/runtime,44.763
eval/samples_per_second,26.651
eval/steps_per_second,0.223
train/epoch,11.0


NameError: name 'trainer_t5_small' is not defined

../models/t5-small-detoxify-aug-all-filters/checkpoint-2916


## No acceptability filter

In [26]:
tokenized_datasets_no_acceptability_filter_t5_small = add_prefix(aug_datasets_no_acceptability_filter).map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_aug_no_acceptability_filter = setup_trainer(
    output_dir_name="t5-small-detoxify-aug-no-acceptability-filter",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_no_acceptability_filter_t5_small["train"],
    eval_dataset=tokenized_datasets_no_acceptability_filter_t5_small["validation"],
)

wandb.init(project="w266_final_project", name="t5-small-detoxify-aug-no-acceptability-filter")
trainer_t5_small_aug_no_acceptability_filter.train()
wandb.finish()

# Print path to the best checkpoint
print(f"Best model checkpoint: {trainer_t5_small_aug_no_acceptability_filter.state.best_model_checkpoint}")

Map:   0%|          | 0/20711 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Map:   0%|          | 0/20711 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Sta,Flu,Sem,Overall
1,1.0768,0.949558,0.589193,0.874267,0.699519,0.924891,0.792427
2,0.9161,0.936198,0.588487,0.884325,0.701398,0.925719,0.796851
3,0.8399,0.929998,0.595734,0.897737,0.708047,0.925757,0.805002
4,0.7822,0.937293,0.594991,0.90109,0.708011,0.92454,0.805944
5,0.7345,0.944445,0.595025,0.903604,0.709992,0.925311,0.807508
6,0.6927,0.948201,0.586651,0.899413,0.713622,0.925139,0.804848
7,0.6572,0.964795,0.587344,0.900251,0.714377,0.925392,0.805523




0,1
eval/BLEU,▃▂█▇▇▁▂
eval/FLU,▁▂▅▅▆██
eval/Overall,▁▃▇▇█▇▇
eval/SEM,▃██▁▅▄▆
eval/STA,▁▃▇▇█▇▇
eval/loss,▅▂▁▂▄▅█
eval/runtime,▄█▄▅▄▁▅
eval/samples_per_second,▅▁▅▄▅█▄
eval/steps_per_second,▅▁▅▄▅█▄
train/epoch,▁▁▂▂▃▃▅▅▆▆▇▇███

0,1
eval/BLEU,0.58734
eval/FLU,0.71438
eval/Overall,0.80552
eval/SEM,0.92539
eval/STA,0.90025
eval/loss,0.9648
eval/runtime,45.3975
eval/samples_per_second,26.279
eval/steps_per_second,0.22
train/epoch,7.0


Best model checkpoint: ../models/t5-small-detoxify-aug-no-acceptability-filter/checkpoint-1620


## No Similarity Filter

In [27]:
prefixed_datasets_no_similarity_filter = add_prefix(aug_datasets_no_similarity_filter)

tokenized_datasets_no_similarity_filter_t5_small = prefixed_datasets_no_similarity_filter.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_aug_no_similarity_filter = setup_trainer(
    output_dir_name="t5-small-detoxify-aug-no-similarity-filter",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_no_similarity_filter_t5_small["train"],
    eval_dataset=tokenized_datasets_no_similarity_filter_t5_small["validation"],
)

wandb.init(project="w266_final_project", name="t5-small-detoxify-aug-no-similarity-filter")
trainer_t5_small_aug_no_similarity_filter.train()
wandb.finish()

print(f"Best model checkpoint: {trainer_t5_small_aug_no_similarity_filter.state.best_model_checkpoint}")

Map:   0%|          | 0/20711 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Map:   0%|          | 0/20711 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Bleu,Sta,Flu,Sem,Overall
1,1.311,0.954922,0.588395,0.84912,0.704302,0.92319,0.782825
2,1.1376,0.933897,0.590768,0.877619,0.705733,0.924761,0.7953
3,1.0509,0.926761,0.596456,0.906119,0.707856,0.924799,0.80827
4,0.9836,0.928941,0.597504,0.904443,0.70353,0.924731,0.80693
5,0.9275,0.939296,0.59253,0.908634,0.707966,0.925006,0.808554
6,0.8771,0.94986,0.593862,0.917016,0.715056,0.925457,0.813681
7,0.8341,0.964241,0.59272,0.903604,0.706382,0.926139,0.80649
8,0.796,0.974892,0.591599,0.917854,0.719671,0.924972,0.81439


## No toxicity filter

In [None]:
prefixed_datasets_no_toxicity_filter = add_prefix(aug_datasets_no_toxicity_filter)

tokenized_datasets_no_toxicity_filter_t5_small = prefixed_datasets_no_toxicity_filter.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_aug_no_toxicity_filter = setup_trainer(
    output_dir_name="t5-small-detoxify-aug-no-toxicity-filter",
    model_checkpoint="t5-small",
    train_dataset=tokenized_datasets_no_toxicity_filter_t5_small["train"],
    eval_dataset=tokenized_datasets_no_toxicity_filter_t5_small["validation"],
)

wandb.init(project="w266_final_project", name="t5-small-detoxify-aug-no-toxicity-filter")
trainer_t5_small_aug_no_toxicity_filter.train()
wandb.finish()

print(f"Best model checkpoint: {trainer_t5_small_aug_no_toxicity_filter.state.best_model_checkpoint}")

# Evaluation

In [22]:
# Create a pandas dataframe to store evaluation metrics for each model configuration
eval_metrics_df = pd.DataFrame(columns=["Model", "BLEURT", "BLEU", "STA", "FLU", "SEM", "Overall"])
eval_metrics_df

Unnamed: 0,Model,BLEURT,BLEU,STA,FLU,SEM,Overall


In [24]:
# Helper function to add metrics to the dataframe
def add_metrics_to_df(df, model_name, metrics):
    """
    Add model metrics to a pandas dataframe
    
    Args:
    - df: pandas dataframe to add metrics to
    - model_name: name of the model
    - metrics: dictionary of evaluation metrics
    
    Returns:
    - updated pandas dataframe
    """

    # Check if the model name already exists in the dataframe
    if model_name in df["Model"].values:
        print(f"Model {model_name} already exists in the dataframe.")
        return df
    
    # Add the new row to the dataframe
    model_metrics_df = pd.DataFrame({
        "Model": [model_name],
        "BLEURT": [metrics["BLEURT"]],
        "BLEU": [metrics["BLEU"]],
        "STA": [metrics["STA"]],
        "FLU": [metrics["FLU"]],
        "SEM": [metrics["SEM"]],
        "Overall": [metrics["Overall"]]
    })
    
    return pd.concat([df, model_metrics_df], ignore_index=True)

DELETE

In [25]:
delete_eval_preds = baseline_detoxifier(raw_datasets["test"]["source"])
delete_eval_metrics = evaluate_metrics(raw_datasets["test"]["target"],
                               delete_eval_preds,
                               include_bleurt=True)
print(delete_eval_metrics)
eval_metrics_df = add_metrics_to_df(eval_metrics_df, "DELETE", delete_eval_metrics)

{'BLEU': 0.5036836653565926, 'STA': 0.6184798807749627, 'FLU': 0.5259524, 'SEM': 0.9074216935389443, 'Overall': 0.6348035038444941, 'BLEURT': -0.23698500004748296}


  return pd.concat([df, model_metrics_df], ignore_index=True)


BART

In [26]:
bart_eval_preds = bart_detoxifier(raw_datasets["test"]["source"])
bart_eval_metrics = evaluate_metrics(raw_datasets["test"]["target"],
                                     bart_eval_preds,
                                     include_bleurt=True)
print(bart_eval_metrics)
eval_metrics_df = add_metrics_to_df(eval_metrics_df, "BART", bart_eval_metrics)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'BLEU': 0.5618694307428573, 'STA': 0.8926974664679582, 'FLU': 0.7746871, 'SEM': 0.9232455269945716, 'Overall': 0.8090394004102123, 'BLEURT': 0.2596495511168517}


T5 Small (Unidirectional)

In [30]:
prefixed_datasets = add_prefix(raw_datasets)

tokenized_datasets_t5_small = prefixed_datasets.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_best = setup_trainer(
    output_dir_name="t5-small-detoxify-2",
    model_checkpoint="../models/t5-small-detoxify-2/checkpoint-840",
    train_dataset=tokenized_datasets_t5_small["train"],
    eval_dataset=tokenized_datasets_t5_small["test"],
    report_to=None,
)

t5_small_eval_preds = trainer_t5_small_best.predict(tokenized_datasets_t5_small["test"]).predictions
t5_small_eval_preds = tokenizer_t5_small.batch_decode(t5_small_eval_preds, skip_special_tokens=True)
t5_small_eval_preds = [pred.strip() for pred in t5_small_eval_preds]

t5_small_eval_metrics = evaluate_metrics(raw_datasets["test"]["target"],
                                            t5_small_eval_preds,
                                            include_bleurt=True)

print(t5_small_eval_metrics)

eval_metrics_df = add_metrics_to_df(eval_metrics_df, "T5-UD", t5_small_eval_metrics)



Map:   0%|          | 0/10733 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


OutOfMemoryError: CUDA out of memory. Tried to allocate 32.00 MiB (GPU 0; 21.99 GiB total capacity; 4.57 GiB already allocated; 27.69 MiB free; 5.18 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

T5 Small (Bidirectional)

In [None]:
raw_datasets_bd = create_bidirectional_dataset(raw_datasets, shuffle=False)

tokenized_datasets_bd_t5_small = raw_datasets_bd.map(
    preprocess_function,
    fn_kwargs={'tokenizer': tokenizer_t5_small},
    batched=True,
    remove_columns=["source", "target"],
)

trainer_t5_small_bd_best = setup_trainer(
    output_dir_name="t5-small-detoxify-bd-noshuffle-2",
    model_checkpoint="../models/t5-small-detoxify-bd-noshuffle-2/checkpoint-2352",
    train_dataset=tokenized_datasets_bd_t5_small["train"],
    eval_dataset=tokenized_datasets_bd_t5_small["test"],
    compute_metrics=partial(compute_metrics_bd, bd_dataset=raw_datasets_bd["test"], shuffled_data=False)
    report_to=None,
    )

t5_small_bd_eval_preds = trainer_t5_small_bd_best.predict(tokenized_datasets_t5_small["test"]).predictions
t5_small_bd_eval_preds = tokenizer_t5_small.batch_decode(t5_small_bd_eval_preds, skip_special_tokens=True)
t5_small_bd_eval_preds = [pred.strip() for pred in t5_small_bd_eval_preds]

t5_small_bd_eval_metrics = evaluate_metrics(raw_datasets["test"]["target"],
                                            t5_small_bd_eval_preds,
                                            include_bleurt=True)

print(t5_small_bd_eval_metrics)

eval_metrics_df = add_metrics_to_df(eval_metrics_df, "T5-BD", t5_small_bd_eval_metrics)