In [1]:
from datasets import DatasetDict, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    T5Tokenizer,
    T5ForConditionalGeneration,
    T5Config,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    EarlyStoppingCallback,
    pipeline,
)
from sentence_transformers import SentenceTransformer
import torch
from torch import nn
import numpy as np
import time
import gc
import GPUtil
import evaluate
from numba import cuda
import wandb
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
import wandb
import os
import pickle
import optuna
from typing import Dict, Union, Optional, Tuple, List, Any
import pandas as pd
import string
from tqdm import tqdm

2023-11-23 12:07:50.901659: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-23 12:07:50.901718: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-23 12:07:50.901763: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Random seed for reproducibility
RANDOM_SEED = 42

# Default parameters for T5 model fine-tuning
PER_DEVICE_TRAIN_BATCH_SIZE = 64
PER_DEVICE_EVAL_BATCH_SIZE = 128
LEARNING_RATE = 3e-4
NUM_TRAIN_EPOCHS = 20
EARLY_STOPPING_PATIENCE = 2
NUM_BEAMS = 4

# Include BLEURT score in evaluation
INCLUDE_BLEURT = True

# Setting the DEVICE to cuda
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Set path for profane word list
PROFANE_WORD_PATH = "../data/raw/en.txt"

# Set path for raw dataset dictionary
RAW_DATASET_PATH = "../data/processed/raw_dataset.pkl"
AUG_DATASET_ALL_FILTERS_PATH = "../data/processed/aug_datasets_all_filters"
AUG_DATASET_NO_TOXICITY_FILTER_PATH = "../data/processed/aug_datasets_no_toxicity_filter"
AUG_DATASET_NO_SIMILARITY_FILTER_PATH = "../data/processed/aug_datasets_no_similarity_filter"
AUG_DATASET_NO_ACCEPTABILITY_FILTER_PATH = "../data/processed/aug_datasets_no_acceptability_filter"

# Set path for txt file containing best model checkpoints
BEST_MODEL_CHECKPOINT_PATH = "../models/best_model_checkpoints.txt"

# Set path to save evaluation outputs to
VAL_PREDS_PATH = "../data/interim/val_preds.csv"
VAL_METRICS_PATH = "../data/interim/val_metrics.csv"
TEST_PREDS_PATH = "../data/final/test_preds.csv"
TEST_METRICS_PATH = "../data/final/test_metrics.csv"

# Set maximum length for input and output
MAX_INPUT_LENGTH = 64
MAX_OUTPUT_LENGTH = 64

In [3]:
# Load tokenizers and models
tokenizer_t5_small = T5Tokenizer.from_pretrained("t5-small")
model_t5_small = T5ForConditionalGeneration.from_pretrained("t5-small").to(DEVICE)
tokenizer_toxicity = RobertaTokenizer.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier")
model_toxicity = RobertaForSequenceClassification.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier").to(DEVICE)
tokenizer_acceptability = AutoTokenizer.from_pretrained("iproskurina/tda-bert-en-cola")
model_acceptability = AutoModelForSequenceClassification.from_pretrained("iproskurina/tda-bert-en-cola").to(DEVICE)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassifi

In [4]:
# Load datasets
raw_datasets = DatasetDict.load_from_disk(RAW_DATASET_PATH)
aug_datasets_all_filters = DatasetDict.load_from_disk(AUG_DATASET_ALL_FILTERS_PATH)
aug_datasets_no_acceptability_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_ACCEPTABILITY_FILTER_PATH)
aug_datasets_no_similarity_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_SIMILARITY_FILTER_PATH)
aug_datasets_no_toxicity_filter = DatasetDict.load_from_disk(AUG_DATASET_NO_TOXICITY_FILTER_PATH)

In [5]:
# Load df_val_preds and df_val_metrics
df_val_preds = pd.read_csv(VAL_PREDS_PATH)
df_val_metrics = pd.read_csv(VAL_METRICS_PATH)

# Functions

## Debugging

In [6]:
def measure_time(func, *args, **kwargs):
    """
    Calculates the time it takes to run a function.
    """
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Function {func.__name__} took {elapsed_time:.2f} seconds to run.")
    return result

def get_gpu_memory():
    """
    Gets the GPU memory information.
    """
    gpus = GPUtil.getGPUs()
    gpu = gpus[0]
    print(f"Total GPU memory: {gpu.memoryTotal}MB")
    print(f"Free GPU memory: {gpu.memoryFree}MB")
    print(f"Used GPU memory: {gpu.memoryUsed}MB")
    return gpu.memoryUsed

def force_clear_GPU_memory():
    """
    Force clears the GPU memory.
    """
    cuda.select_device(0)
    cuda.close()

def cleanup():
    """
    Cleans up the GPU memory.
    """
    gc.collect()
    torch.cuda.empty_cache()

## Baseline Models

In [8]:
# Baseline model functions
def baseline_detoxifier(text_list, profane_word_path=PROFANE_WORD_PATH):
    """
    Returns a detoxified version of the text by replacing toxic terms with blanks

    Args:
        text_list (list): list of strings to be detoxified
        toxic_list (list): list of toxic terms to be removed from text_list

    Returns:
        detoxified_text_list (list): list of detoxified strings
    """
    # Load list of profane words
    profane_words = []
    with open(profane_word_path, "r") as f:
        for line in f:
            profane_words.append(line.strip())

    # Detoxify text
    y_pred_delete = []
    for text in text_list:
        for term in profane_words:
            text = text.replace(term, "")
        y_pred_delete.append(text)

    return y_pred_delete

def bart_detoxifier(text_list):
    """
    Returns a detoxified version of the text using BART

    Args:
        text_list (list): list of strings to be detoxified

    Returns:
        detoxified_text_list (list): list of detoxified strings
    """
    pipe_bart = pipeline("text2text-generation", model="s-nlp/bart-base-detox", device=DEVICE)
    y_pred_bart = pipe_bart(text_list, max_length=MAX_OUTPUT_LENGTH, truncation=True)
    y_pred_bart = [x["generated_text"] for x in y_pred_bart]
    
    return y_pred_bart

# Helper function to add metrics to the dataframe
def add_metrics_to_df(df, model_name, metrics, save_path="../data/processed/model_metrics.csv"):
    """
    Add model metrics to a pandas dataframe
    
    Args:
    - df: pandas dataframe to add metrics to
    - model_name: name of the model
    - metrics: dictionary of evaluation metrics
    
    Returns:
    - updated pandas dataframe
    """

    # Create a df if the input df is empty
    if df is None:
        df = pd.DataFrame(columns=["Model", "BLEURT", "BLEU", "STA", "FLU", "SEM", "Overall"])

    # Check if the model name already exists in the dataframe
    if model_name in df["Model"].values:
        print(f"Model {model_name} already exists in the dataframe.")
        return df
    
    # Add the new row to the dataframe
    model_metrics_df = pd.DataFrame({
        "Model": [model_name],
        "BLEURT": [metrics["BLEURT"]],
        "BLEU": [metrics["BLEU"]],
        "STA": [metrics["STA"]],
        "FLU": [metrics["FLU"]],
        "SEM": [metrics["SEM"]],
        "Overall": [metrics["Overall"]]
    })

    # Save the dataframe to a csv file
    df = pd.concat([df, model_metrics_df], ignore_index=True)
    df.to_csv(save_path, index=False)
    
    return df


## Evaluation

In [9]:
# Initialize model variables
model_bleurt = None
model_bertscore = None
model_sacrebleu = None

def calc_sacrebleu(refs, preds):
    """
    Calculates the SacreBLEU score.

    Args:
        refs (list): List of reference sentences
        preds (list): List of predicted sentences
    
    Returns:
        results (float): SacreBLEU score
    """
    global model_sacrebleu

    if model_sacrebleu is None:
        model_sacrebleu = evaluate.load("sacrebleu")

    results = model_sacrebleu.compute(predictions=preds, references=refs)["score"]
    results = results/100

    return results

def calc_bert_score(
    refs, preds, model_type="microsoft/deberta-large-mnli", output_mean=True
    ):
    """
    Calculates BERT score per line. Note: https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit#gid=0 lists the best performing models
    Args:
        refs (list): List of reference sentences.
        y_pred (list): List of predicted sentences.
        model_type (str): Type of BERT model to use.
        output_mean (bool): Whether to output the mean of the scores.

    Returns:
        list of precision, recall, f1 scores.

    """
    global model_bertscore

    if model_bertscore is None:
        model_bertscore = evaluate.load("bertscore")
        
    results = model_bertscore.compute(predictions=preds, references=refs, model_type=model_type)
    precision = np.array(results["precision"])
    recall = np.array(results["recall"])
    f1 = np.array(results["f1"])
    
    if output_mean:
        precision = precision.mean()
        recall = recall.mean()
        f1 = f1.mean()

    return precision, recall, f1

def calc_bleurt(refs, preds, checkpoint="BLEURT-20_D12", output_mean = True):
    """
    Calculates BLEURT score per line.

    Args:
        refs (list): List of reference sentences.
        preds (list): List of predicted sentences.
        output_type (str): Type of output to return. Either 'numpy' or 'list'.

    Returns:
        list/array of BLEURT scores.
    """
    global model_bleurt

    if model_bleurt is None:
        model_bleurt = evaluate.load("bleurt", module_type="metric", checkpoint=checkpoint)

    results = np.array(model_bleurt.compute(predictions=preds, references=refs)["scores"])

    if output_mean:
        results = results.mean()

    return results

def calc_tox_acceptability(
    data,
    tokenizer,
    model,
    output_score=True,
    output_mean=True):
    """
    Calculates toxicity and acceptability scores for a given dataset.

    Args:
        data = list of strings to be evaluated
        tokenizer = tokenizer for the model
        model = model to be used for evaluation
        output_score = whether to output the score or the label
        output_mean = whether to output the mean of the scores or the scores for each sentence
    
    Returns:
        array of toxicity and acceptability scores.
    """  
    inputs = tokenizer(data, return_tensors="pt", padding=True).to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs)["logits"]
        if output_score:
            result = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        else:
            result = logits.argmax(1).data
        result = result.cpu().numpy()

    if output_mean:
        result = result.mean()
        
    return result

def evaluate_metrics(
    refs,
    preds,
    tokenizer_toxicity=tokenizer_toxicity,
    model_toxicity=model_toxicity,
    tokenizer_acceptability=tokenizer_acceptability,
    model_acceptability=model_acceptability,
    to_neutral=True,
    weights={
        "BLEU": 0.2,
        "STA": 0.4,
        "Acceptability": 0.2,
        "BERT_Score": 0.2
    },
    include_bleurt=INCLUDE_BLEURT
):
    """
    Calculates and returns a dictionary of evaluation metrics

    Args:
        refs (list): list of strings (reference)
        preds (list): list of strings (predictions)
        tokenizer_toxicity (tokenizer): tokenizer for toxicity model
        model_toxicity (model): toxicity model
        tokenizer_acceptability (tokenizer): tokenizer for acceptability model
        model_acceptability (model): acceptability model
        to_neutral (bool): whether the goal is to transfer to neutral (True) or to toxic (False)
        weights (dict): dictionary of weights for each metric
        include_bleurt (bool): whether to include BLEURT score in the output

    Returns:
        results (dict): dictionary of evaluation metrics
    """
    # Calculate BLEU score
    bleu = calc_sacrebleu(refs, preds)

    # Calculate toxicity classification
    tox_pred = calc_tox_acceptability(preds, tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False)

    # Calculate style transfer accuracy as proportion of sentences that were correctly classified (as non-toxic / toxic)
    if to_neutral:
        sta_correct_label = 0
    else:
        sta_correct_label = 1

    sta_pred = (tox_pred == sta_correct_label).sum() / len(tox_pred)

    # Calculate acceptability scores
    acc_pred = calc_tox_acceptability(preds, tokenizer_acceptability, model_acceptability)

    # Calculate similarity score
    bert_score_f1 = calc_bert_score(refs, preds, model_type="distilbert-base-uncased")[2]

    # Calculate BLEURT score if include_bleurt is True
    bleurt = None
    if include_bleurt:
        bleurt = calc_bleurt(refs, preds)

    # Calculate composite score
    composite_score = weights["BLEU"] * bleu + weights["STA"] * sta_pred + weights["Acceptability"] * acc_pred + weights["BERT_Score"] * bert_score_f1

    # Return a dictionary of metrics
    results = {
        "BLEU": bleu,
        "STA": sta_pred,
        "FLU": acc_pred,
        "SEM": bert_score_f1,
        "J": composite_score,
    }
    if include_bleurt:
        results["BLEURT"] = bleurt
        
    return results

In [10]:
def add_preds_to_df(model_name, preds, raw_datasets=raw_datasets, use_validation=True, load_csv=True, replace_existing=True):
    """
    Add model predictions to a pandas dataframe

    Args:
    - model_name: name of the model
    - preds: list of predictions
    - test_data: whether the data is test data or validation data (True for test data, False for validation data)
    - load_csv: whether to load the existing csv file. If False, a new dataframe will be created.
    - replace_existing: whether to replace an existing column if it already exists

    Returns:
    - updated pandas dataframe
    """

    # Set save path
    if use_validation:
        save_path = VAL_PREDS_PATH
        source = raw_datasets["validation"]["source"]
        target = raw_datasets["validation"]["target"]
    else:
        save_path = TEST_PREDS_PATH
        source = raw_datasets["test"]["source"]
        target = raw_datasets["test"]["target"]

    if load_csv:
        df = pd.read_csv(save_path)
    else:
        df = pd.DataFrame({
                "source": source,
                "target": target,
            })
    
    # If replace existing, remove the existing column with the same model name
    if f"{model_name}_preds" in df.columns and replace_existing:
        df = df.drop(columns=[f"{model_name}_preds"])
    
    df[f"{model_name}_preds"] = preds
    df.to_csv(save_path, index=False)

    return df

def add_metric_cols_to_preds(preds_col_name, use_validation=True, replace_existing=True):
    """
    Add metric columns to the dataframe

    Args:
    - preds_col_name: name of the column containing the predictions
    - use_validation: whether to use validation data or test data

    Returns:
    - updated dataframe
    """
    # Set save path
    if use_validation:
        save_path = VAL_PREDS_PATH
    else:
        save_path = TEST_PREDS_PATH

    # Load CSV
    df = pd.read_csv(save_path)

    # Dynamically create column names
    model_name = preds_col_name.split("_")[0]
    bleu_col_name = f"{model_name}_BLEU"
    bleurt_col_name = f"{model_name}_BLEURT"
    sta_col_name = f"{model_name}_STA"
    flu_col_name = f"{model_name}_FLU"
    sem_col_name = f"{model_name}_SEM"

    # If replace existing, remove the existing columns
    if bleu_col_name in df.columns and replace_existing:
        df = df.drop(columns=[bleu_col_name])
    if bleurt_col_name in df.columns and replace_existing:
        df = df.drop(columns=[bleurt_col_name])
    if sta_col_name in df.columns and replace_existing:
        df = df.drop(columns=[sta_col_name])
    if flu_col_name in df.columns and replace_existing:
        df = df.drop(columns=[flu_col_name])
    if sem_col_name in df.columns and replace_existing:
        df = df.drop(columns=[sem_col_name])

    # Calculate metrics
    df[bleu_col_name] = df.apply(lambda row: calc_sacrebleu([row["target"]], [row[preds_col_name]]), axis=1)
    df[bleurt_col_name] = calc_bleurt(df["target"], df[preds_col_name], output_mean=False)
    df[sta_col_name] = 1 - calc_tox_acceptability(df[preds_col_name].tolist(), tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False)
    df[flu_col_name] = calc_tox_acceptability(df[preds_col_name].tolist(), tokenizer_acceptability, model_acceptability, output_score=True, output_mean=False)
    df[sem_col_name] = calc_bert_score(df["target"], df[preds_col_name], model_type="distilbert-base-uncased", output_mean=False)[2]

    df.to_csv(save_path, index=False)

    return df

# Helper function to add metrics to the dataframe
def add_metrics_to_df(model_name, metrics, use_validation=True, load_csv=True, replace_existing=True):
    """
    Add model metrics to a pandas dataframe
    
    Args:
    - df: pandas dataframe to add metrics to
    - model_name: name of the model
    - metrics: dictionary of evaluation metrics
    - test_data: whether the data is test data or validation data (True for test data, False for validation data)
    - load_csv: whether to load the existing csv file. If False, a new dataframe will be created.
    - replace_existing: whether to replace an existing column if it already exists
    
    Returns:
    - updated pandas dataframe
    """

    # Set save path
    if use_validation:
        save_path = VAL_METRICS_PATH
    else:
        save_path = TEST_METRICS_PATH
        
    # Load the existing dataframe if it exists
    if load_csv:
        df = pd.read_csv(save_path)
    else:
        df = pd.DataFrame(columns=["Model", "BLEURT", "BLEU", "STA", "FLU", "SEM", "J"])

    # Check if the model name already exists in the dataframe
    if model_name in df["Model"].values and not replace_existing:
        print(f"Model {model_name} already exists in the dataframe.")
        return df

    # If replace existing, remove the existing row with the same model name
    if model_name in df["Model"].values and replace_existing:
        df = df[df["Model"] != model_name]

    # Add the new row to the dataframe
    model_metrics_df = pd.DataFrame({
        "Model": [model_name],
        "BLEURT": [metrics["BLEURT"]],
        "BLEU": [metrics["BLEU"]],
        "STA": [metrics["STA"]],
        "FLU": [metrics["FLU"]],
        "SEM": [metrics["SEM"]],
        "J": [metrics["J"]]
    })

    # Save the dataframe to a csv file
    df = pd.concat([df, model_metrics_df], ignore_index=True)
    df.to_csv(save_path, index=False)
    
    return df


## Trainer Object Setup

In [11]:
def add_prefix(datasetdict, prefix="to_neutral: "):
    """Adds a prefix to the source sequence in the dataset."""
    datasetdict_copy = datasetdict.copy()
    datasetdict_copy["train"] = datasetdict_copy["train"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy["validation"] = datasetdict_copy["validation"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy["test"] = datasetdict_copy["test"].map(lambda x: {"source": prefix + x["source"]})
    datasetdict_copy = DatasetDict(datasetdict_copy)
    return datasetdict_copy

def create_bidirectional_dataset(datasets, shuffle=True):
    """
    Creates a bi-directional dataset from the original dataset.

    Args:
        datasets (DatasetDict): DatasetDict object containing the original dataset.
        shuffle (bool): Whether to shuffle the dataset or not.
    
    Returns:
        extended_datasets (DatasetDict): DatasetDict object containing the bi-directional dataset.
    """

    def bidirectional_extension(dataset):
        new_data = {
            "source": [],
            "target": []
        }
        for src, tgt in zip(dataset['source'], dataset['target']):
            new_data['source'].extend([f'to_neutral: {src}', f'to_toxic: {tgt}'])
            new_data['target'].extend([tgt, src])
        return new_data

    extended_train_data = bidirectional_extension(datasets["train"])
    extended_validation_data = bidirectional_extension(datasets["validation"])
    extended_test_data = bidirectional_extension(datasets["test"])

    extended_datasets = DatasetDict({
        "train": Dataset.from_dict(extended_train_data),
        "validation": Dataset.from_dict(extended_validation_data),
        "test": Dataset.from_dict(extended_test_data)
    })

    if shuffle:
        extended_datasets["train"] = extended_datasets["train"].shuffle(seed=RANDOM_SEED)
        
    return extended_datasets

def preprocess_dataset(dataset, tokenizer):
    """Preprocesses a dataset using a tokenizer."""
    def preprocess_function(examples, tokenizer):
        """Preprocess function for T5."""
        model_inputs = tokenizer(
            examples["source"],
            text_target=examples["target"],
            max_length=MAX_INPUT_LENGTH,
            truncation=True,
        )
        return model_inputs

    return dataset.map(
        preprocess_function,
        fn_kwargs={'tokenizer': tokenizer},
        batched=True,
        remove_columns=["source", "target"],
    )

def post_process(preds, refs, tokenizer):
    """
    Post-process function for T5.

    Args:
        preds (list): list of predicted sequences
        refs (list): list of reference sequences
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding

    Returns:
        decoded_preds (list): list of decoded predicted sequences
        decoded_refs (list): list of decoded reference sequences
    """
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    refs = np.where(refs != -100, refs, tokenizer.pad_token_id)
    decoded_refs = tokenizer.batch_decode(refs, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_refs = [ref.strip() for ref in decoded_refs]

    return decoded_preds, decoded_refs

def post_process_preds(preds, tokenizer):
    """
    Post-process function for T5 (only for predictions)

    Args:
        preds (list): list of predicted sequences
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding

    Returns:
        decoded_preds (list): list of decoded predicted sequences
    """
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds]

    return decoded_preds

def compute_metrics(eval_preds, tokenizer):
    """
    Function to calculate the metrics for trainer.evaluate().

    Args:
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding the predictions
        eval_preds (tuple): Tuple containing the predictions and references

    Returns:
        dict: Dictionary containing the metrics
    """
    preds, refs = eval_preds

    # Post-process the predictions and references
    decoded_preds, decoded_refs = post_process(preds, refs, tokenizer)
    
    # Evaluate metrics
    return evaluate_metrics(
        decoded_refs,
        decoded_preds,
        tokenizer_toxicity=tokenizer_toxicity,
        model_toxicity=model_toxicity,
        tokenizer_acceptability=tokenizer_acceptability,
        model_acceptability=model_acceptability,
        include_bleurt=INCLUDE_BLEURT
    )

def compute_metrics_bd(eval_preds, tokenizer, bd_dataset, shuffled_data=False):
    """
    Function to calculate the metrics for trainer.evaluate().
    This function is for the bi-directional model.
    
    Args:
        eval_preds (tuple): Tuple containing the predictions and references
        tokenizer (PreTrainedTokenizer): tokenizer to use for decoding the predictions
        shuffled_data (bool): Whether the data is shuffled or not
        bd_dataset (DatasetDict): Bidirectional dataset to use for testing created using create_bidirectional_datasets
                                  For example, raw_datasets_bd["validation"] or raw_datasets_bd["test"]

    Returns:
        dict: Dictionary containing the metrics
    """
    preds, refs = eval_preds

    # Post-process the predictions and references
    decoded_preds, decoded_refs = post_process(preds, refs, tokenizer)
    
    # If shuffled data is false, have to_neutral_preds and to_neutral_refs just be predictions and refs with even indices
    if not shuffled_data:
        to_neutral_preds = decoded_preds[::2]
        to_neutral_refs = decoded_refs[::2]
    # Otherwise, get the indices to use when splitting predictions and refs to to_neutral and to_toxic
    else:
        # Get the indices to use when splitting predictions and refs to to_neutral and to_toxic
        to_neutral_idx = [i for i, input_sentence in enumerate(bd_dataset['source']) if input_sentence.startswith("to_neutral")]

        # Retrieve based on the indices
        to_neutral_preds = [decoded_preds[i] for i in to_neutral_idx]
        to_neutral_refs = [decoded_refs[i] for i in to_neutral_idx]
    
    # Evaluate metrics for to_neutral
    to_neutral_metrics = evaluate_metrics(
        to_neutral_refs,
        to_neutral_preds,
        include_bleurt=INCLUDE_BLEURT
    )

    # Return dictionary of to_neutral metrics
    return to_neutral_metrics

def setup_trainer(output_dir_name,
                train_dataset,
                eval_dataset,
                compute_metrics,
                model_checkpoint="t5-small",
                per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
                per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
                learning_rate=LEARNING_RATE,
                num_train_epochs=NUM_TRAIN_EPOCHS,
                max_length=MAX_OUTPUT_LENGTH,
                num_beams=NUM_BEAMS,
                early_stopping_patience=EARLY_STOPPING_PATIENCE,
                report_to="wandb",
                ):
    """
    Set up a Seq2SeqTrainer object for training a T5 model.

    Default parameters based on this: https://github.com/google-research/text-to-text-transfer-transformer/blob/main/t5/models/hf_model.py#L55

    Args:
        output_dir_name (str): What to name the model in the output directory.
        train_dataset (Dataset): Training dataset.
        eval_dataset (Dataset): Evaluation dataset.
        compute_metrics (function): Function to compute metrics. Change this to compute_metrics_bd if using a bi-directional model.
        model_checkpoint (str): Model checkpoint to use.
        per_device_train_batch_size (int): Batch size for training.
        per_device_eval_batch_size (int): Batch size for evaluation.
        learning_rate (float): Learning rate.
        num_train_epochs (int): Number of training epochs.
        max_length (int): Maximum length of the output sequence.
        num_beams (int): Number of beams for beam search.
        early_stopping_patience (int): Number of epochs to wait before early stopping.
        report_to (str): Where to report results to. Either "wandb" or "none".

    Returns:
        Seq2SeqTrainer: Trainer object for training the T5 model.
    """
    
    # Instantiate model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

    # Define the data collator
    data_collator = DataCollatorForSeq2Seq(tokenizer, model, return_tensors="pt", padding=True)

    # Define generation config
    generation_config = GenerationConfig(
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
        eos_token_id=model.config.eos_token_id,
        bos_token_id=model.config.bos_token_id,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.pad_token_id
        )

    # Save the generation config
    gen_config_path = f"../models/{output_dir_name}/generation_config"
    generation_config.save_pretrained(gen_config_path)

    # Define the training arguments
    args = Seq2SeqTrainingArguments(
        output_dir=f'../models/{output_dir_name}',
        evaluation_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=per_device_eval_batch_size,
        learning_rate=learning_rate, 
        predict_with_generate=True,
        generation_config=gen_config_path,
        fp16=True,
        report_to=report_to,
        logging_steps=100,
        load_best_model_at_end=True,
        metric_for_best_model="Overall",
        greater_is_better=True,
        generation_max_length=max_length,
    )
   
    # Instantiate the trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=partial(compute_metrics, tokenizer=tokenizer),
        callbacks=[EarlyStoppingCallback(early_stopping_patience=early_stopping_patience)]
    )

    return trainer

def training_pipeline(model_name, project_name="t5-detox", model_checkpoint="t5-small", use_validation=True, raw_datasets=raw_datasets, bidirectional=False, shuffle=False, do_train=True):
    """
    Pipeline for training a T5 model. Saves the best model checkpoint to a txt file. Can also be used for evaluating a model (use test set instead of validation set).

    Args:
        model_name (str): Name of the model to name the output directory and wandb run.
        project_name (str): Name of the wandb project.
        model_checkpoint (str): Model checkpoint to use.
        use_validation (bool): Whether to use the validation set or not.
        raw_datasets (DatasetDict): DatasetDict object containing the original dataset.
        bidirectional (bool): Whether to use a bi-directional model or not.
        shuffle (bool): Whether to shuffle the dataset or not.
        do_train (bool): Whether to train the model or not.

    Returns:
        trainer (Seq2SeqTrainer): Trainer object for training the T5 model.
    """
    
    # Preprocess dataset (add prefixes / make bidirectional)
    if bidirectional:
        raw_datasets = create_bidirectional_dataset(raw_datasets, shuffle=shuffle)
    else:
        raw_datasets = add_prefix(raw_datasets)

    # Tokenize dataset
    tokenized_datasets = preprocess_dataset(raw_datasets, tokenizer_t5_small)

    # Define compute_metrics function depending on bidirectional or not
    if bidirectional and use_validation:
        bd_dataset = raw_datasets["validation"]
    elif bidirectional and not use_validation:
        bd_dataset = raw_datasets["test"]
    else:
        bd_dataset = None

    compute_metrics_fn = partial(compute_metrics_bd, bd_dataset=bd_dataset, shuffled_data=shuffle) if bd_dataset else compute_metrics

    # Setup trainer
    trainer = setup_trainer(
        output_dir_name=model_name,
        model_checkpoint=model_checkpoint,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"] if use_validation else tokenized_datasets["test"],
        compute_metrics=compute_metrics_fn
    )

    if do_train:
        # Initialize wandb
        wandb.init(project=project_name, name=model_name)
        trainer.train()
        wandb.finish()

        # Get the best checkpoint path for the model
        checkpoint_path = trainer.state.best_model_checkpoint

        # Save the checkpoint path for the best model
        with open(BEST_MODEL_CHECKPOINT_PATH, "a") as file:
            file.write(f"{model_name}: {checkpoint_path}\n")

    return trainer, tokenized_datasets

## Negative Lexically Constrained Decoding

In [12]:
# Note this documentation for bad_word_ids: https://github.com/huggingface/transformers/issues/14206

def get_bad_words_list(dataset, tokenizer=tokenizer_toxicity, model=model_toxicity, num_layers=3, top_k=3):
    """
    Gets the top k bad words for each sentence in the dataset.

    Args:
        dataset (list): List of sentences.
        tokenizer (PreTrainedTokenizer): Tokenizer to use (toxicity classifier).
        model (PreTrainedModel): Model to use (toxicity classifier).
        num_layers (int): Number of layers to use.
        top_k (int): Number of bad words to return.

    Returns:
        bad_words_list (list): List of lists of bad words.
    """    
    bad_words_list = []

    for sentence in dataset:
        # Tokenize sentence
        inputs = tokenizer(sentence, return_tensors="pt").to(DEVICE)
        input_ids = inputs["input_ids"]

        # Get attention scores
        attention = model(input_ids, output_attentions=True)['attentions']

        # Get the last num_layers layer attention scores and average them
        attention = torch.stack(attention[-num_layers:]).mean(0)

        # Average across each head
        attention = attention.mean(1)

        # Sum each row to get the attention score for each token
        attention = attention.mean(1)

        # Exclude separator tokens and punctuation
        token_list = input_ids.squeeze().tolist()
        punctuation_ids = {tokenizer.convert_tokens_to_ids(token) for token in string.punctuation}
        exclude_ids = set([tokenizer.bos_token_id, tokenizer.eos_token_id]) | punctuation_ids

        valid_indices = [i for i, token_id in enumerate(token_list) if token_id not in exclude_ids]

        # Filter out the valid indices from the attention scores
        valid_attention = attention.squeeze()[valid_indices]

        # Get the indices of the top k tokens with the highest attention scores among valid tokens
        top_indices = valid_attention.topk(top_k).indices.tolist()
        top_token_indices = [valid_indices[i] for i in top_indices]

        # Decode the tokens
        bad_words = [tokenizer.decode(token_list[index]).strip() for index in top_token_indices]

        bad_words_list.append(bad_words)

    return bad_words_list

def get_bad_word_ids(dataset,
                     tokenizer_toxicity=tokenizer_toxicity,
                     model_toxicity=model_toxicity,
                     tokenizer_t5=tokenizer_t5_small,
                     num_layers=3,
                     top_k=3):
    """
    Get the bad word IDs for a given dataset using the toxicity classifier.

    Args:
    - dataset: The dataset to get the bad word IDs for.
    - tokenizer_toxicity: The tokenizer for the toxicity classifier.
    - model_toxicity: The toxicity classifier model.
    - tokenizer_t5: The tokenizer for the T5 model.
    - num_layers: The number of layers to use for the attention-based bad word identification.
    - top_k: The number of top words to select from each layer.

    Returns:
    - bad_word_ids: A list of lists, where each inner list contains the bad word IDs for a sentence in the dataset.
    """

    # Get list of bad words as identified using attention from toxicity classifier
    bad_words_list = get_bad_words_list(dataset, tokenizer_toxicity, model_toxicity, num_layers, top_k)

    # Convert each list of bad words to a string
    bad_words_str_list = [" ".join(bad_words) for bad_words in bad_words_list]

    # Encode the bad words using the T5 tokenizer encode
    bad_word_ids = [tokenizer_t5.encode(bad_words, add_special_tokens=False) for bad_words in bad_words_str_list]

    return bad_word_ids

def get_preds_nlcd(use_validation,
                   model_checkpoint,
                   raw_datasets=raw_datasets,
                   batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
                   num_beams=NUM_BEAMS,
                   max_length=MAX_OUTPUT_LENGTH,
                   num_batches=None,
                   num_bw_layers=3,
                   bw_top_k=3):
    """
    Generate predictions using Negative Lexially Constrained Decoding.

    Args:
    - use_validation: Whether to use the validation dataset or the test dataset.
    - model_checkpoint: The checkpoint of the T5 model to use.
    - raw_datasets: The raw datasets dictionary object containing the source sentences.
    - batch_size: The batch size for processing the data.
    - num_beams: The number of beams for beam search.
    - max_length: The maximum length of the generated output.
    - num_batches: The number of batches to process. If None, it will be calculated based on the dataset size.
    - num_bw_layers: The number of layers to use for the attention-based bad word identification.
    - bw_top_k: The number of top words to select from each layer.

    Returns:
    - all_preds: A list of predicted sentences for the NLCD task.
    """

    # Load model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint).to(DEVICE)
    tokenizer = T5Tokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

    # Prefix the dataset
    prefixed_datasets = add_prefix(raw_datasets)

    # Get the raw sentences
    raw_sentences = prefixed_datasets["validation"]["source"] if use_validation else prefixed_datasets["test"]["source"]

    # Tokenize the raw sentences
    input_ids = tokenizer(raw_sentences, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LENGTH, add_special_tokens=False)['input_ids'].to(DEVICE)

    # Define bad word ids
    bad_word_ids = get_bad_word_ids(raw_sentences, num_layers=num_bw_layers, top_k=bw_top_k)

    # Initialize a list to store all predictions
    all_preds = []

    # Determine the number of batches
    if num_batches is None:
        num_batches = (len(raw_sentences) + batch_size - 1) // batch_size

    # Process data in batches
    for batch_idx in tqdm(range(num_batches)):
        start_idx = batch_idx * batch_size
        end_idx = (batch_idx + 1) * batch_size

        # Extract a batch of tokenized sentences
        batch_input_ids = input_ids[start_idx:end_idx]

        # Generate predictions for the batch
        encoded_preds = model.generate(
            inputs=batch_input_ids,
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            eos_token_id=model.config.eos_token_id,
            bos_token_id=model.config.bos_token_id,
            pad_token_id=model.config.pad_token_id,
            decoder_start_token_id=model.config.pad_token_id,
            bad_words_ids=bad_word_ids
        )

        # Decode the predictions for the batch
        decoded_preds = tokenizer.batch_decode(encoded_preds, skip_special_tokens=True)

        # Post-process the predictions for the batch
        decoded_preds = [pred.strip() for pred in decoded_preds]

        # Append the batch predictions to the list of all predictions
        all_preds.extend(decoded_preds)

    return all_preds

## T5 Evaluation Helper Functions

In [13]:
def get_model_checkpoints():
    # Get checkpoint values for the best models
    with open(BEST_MODEL_CHECKPOINT_PATH, "r") as f:
        best_model_checkpoints = f.readlines()

    # Convert to a dictionary
    best_model_checkpoints_dict = {}
    for line in best_model_checkpoints:
        model_name, checkpoint_path = line.split(": ")
        best_model_checkpoints_dict[model_name] = checkpoint_path.strip()

    return best_model_checkpoints_dict

model_checkpoints = get_model_checkpoints()

def get_t5_preds_metrics(model_checkpoint,
                         raw_datasets=raw_datasets,
                         bidirectional=False,
                         shuffle=False,
                         use_validation=True,
                         do_train=False,
                         tokenizer=tokenizer_t5_small
                         ):
    """
    Returns the predictions and metrics for a fine-tuned T5 model.
    """
    
    # Setup training pipeline
    trainer, trainer_tokenized_ds = training_pipeline(
        model_name="n/a",
        project_name="n/a",
        model_checkpoint=model_checkpoint,
        use_validation=use_validation,
        raw_datasets=raw_datasets,
        bidirectional=bidirectional,
        shuffle=shuffle,
        do_train=do_train
    )

    # Get raw predictions
    if use_validation:
        trainer_preds_raw = trainer.predict(trainer_tokenized_ds["validation"])
    else:
        trainer_preds_raw = trainer.predict(trainer_tokenized_ds["test"])

    # Get encoded predictions and metrics
    trainer_preds_encoded, trainer_metrics = trainer_preds_raw.predictions, trainer_preds_raw.metrics

    # Post-process predictions
    if isinstance(trainer_preds_encoded, tuple):
        trainer_preds_encoded = trainer_preds_encoded[0]

    trainer_preds_decoded = tokenizer.batch_decode(trainer_preds_encoded, skip_special_tokens=True)
    trainer_preds_decoded = [pred.strip() for pred in trainer_preds_decoded]

    #Return trainer metrics in the same format as evaluate_metrics
    trainer_metrics = {
        "BLEU": trainer_metrics["test_BLEU"],
        "BLEURT": trainer_metrics["test_BLEURT"],
        "STA": trainer_metrics["test_STA"],
        "FLU": trainer_metrics["test_FLU"],
        "SEM": trainer_metrics["test_SEM"],
        "J": trainer_metrics["test_J"]
    }
        
    # Return predictions and metrics
    return trainer_preds_decoded, trainer_metrics

def compare_outputs(
    df,
    cols_to_compare = ["source", "target", "BART_preds", "T5-UD-DA_preds", "T5-UD-DA-MinLoss_preds"],
    bad_words_cols = ['source', 'T5-UD_preds'],
    num_examples = 5,
    num_layers = 5,
    top_k = 3,
    print_toxic_words = True,
    random_state = RANDOM_SEED,
):
    """"""
    # Randomly sample the filtered dataset
    df_sample = df.sample(n=num_examples, random_state=random_state)

    # Get toxic words for in bad_words_col_1, bad_words_col_2, and bad_words_col_3
    if print_toxic_words and bad_words_cols is not None:
        ## Initialize a dictionary of bad words
        bad_words = {}

        ## Get bad words for each model
        for bad_words_col in bad_words_cols:
            bad_words[bad_words_col] = get_bad_words_list(df_sample[bad_words_col], num_layers=num_layers, top_k=top_k)

    # Print every line for each col in cols_to_compare, and the bad words for each col in bad_words_cols
    for i in range(num_examples):
        if cols_to_compare is not None:
            for col in cols_to_compare:
                # Format the column name to remove _preds and capitalise first letter
                if col in ['source', 'target']:
                    col_name = col.capitalize()
                elif col.endswith("_preds"):
                    col_name = col[:-len("_preds")]
                print(f"{col_name}: {df_sample[col].iloc[i]}")
        if bad_words_cols is not None:
            for bad_words_col in bad_words_cols:
                # Format the column name to remove _preds and capitalise first letter
                if bad_words_col in ['source', 'target']:
                    bad_words_col_name = bad_words_col.capitalize()
                elif bad_words_col.endswith("preds"):
                    bad_words_col_name = bad_words_col[:-len("_preds")]
                print(f"'Toxic' words in {bad_words_col_name}: {bad_words[bad_words_col][i]}")
        print()

# Evaluation Using Validation Set

## Human

In [15]:
human_val_metrics = evaluate_metrics(raw_datasets["validation"]["target"], raw_datasets["validation"]["target"])
human_val_metrics



{'BLEU': 1.0000000000000004,
 'STA': 0.9538977367979883,
 'FLU': 0.7160852,
 'SEM': 0.9999999969023571,
 'J': 0.9247761332079433,
 'BLEURT': 0.9892257596401237}

## Baseline: DELETE

In [22]:
delete_val_preds = baseline_detoxifier(raw_datasets["validation"]["source"])
df_val_preds = add_preds_to_df("DELETE", delete_val_preds, use_validation=True, load_csv=False)

delete_val_metrics = evaluate_metrics(raw_datasets["validation"]["target"], delete_val_preds)
df_val_metrics = add_metrics_to_df("DELETE", delete_val_metrics, use_validation=True, load_csv=False)

  df = pd.concat([df, model_metrics_df], ignore_index=True)


## Baseline: BART

In [24]:
bart_val_preds = bart_detoxifier(raw_datasets["validation"]["source"])
df_val_preds = add_preds_to_df("BART", bart_val_preds, use_validation=True, load_csv=True)

bart_val_metrics = evaluate_metrics(raw_datasets["validation"]["target"], bart_val_preds)
df_val_metrics = add_metrics_to_df("BART", bart_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
df_val_preds = add_metric_cols_to_preds("BART_preds", use_validation=True)

In [26]:
df_val_metrics

Unnamed: 0,Model,BLEURT,BLEU,STA,FLU,SEM,J
0,DELETE,-0.22743,0.529101,0.659681,0.478651,0.911821,0.647787
1,BART,0.466564,0.701595,0.917854,0.718025,0.945139,0.840093


## T5-Small (Unidirectional)

In [63]:
# Get predictions and metrics
t5_ud_val_preds, t5_ud_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_unidir"],
    use_validation=True)

# Add predictions to dataframe and per-row metrics
df_val_preds = add_preds_to_df("T5-UD", t5_ud_val_preds, use_validation=True, load_csv=True)
df_val_preds = add_metric_cols_to_preds("T5-UD_preds", use_validation=True)

# Add aggregated metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD", t5_ud_val_metrics, use_validation=True, load_csv=True)



Map:   0%|          | 0/10733 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




Based on this example, the largest differences between BART and T5-Small-UD are in relation to BLEURT.

Comparison of BLEURT

In [24]:
# Filter the dataset to rows where BART has better BLEURT score than T5-UD
df_BLEURT_BART_vs_T5_UD = df_val_preds[(df_val_preds["BART_BLEURT"] >= df_val_preds["T5-UD_BLEURT"] * 1.2) & (df_val_preds["BART_BLEURT"] != df_val_preds["T5-UD_BLEURT"])]

# Compmare outputs
compare_outputs(
    df=df_BLEURT_BART_vs_T5_UD,
    cols_to_compare=['source', 'target', 'BART_preds', 'T5-UD_preds'],
    bad_words_cols=None,
)

Source: Most people understand trumps comments for what they were the whining of a sore loser.
Target: Most people understand trumps comments were result of his defeat.
BART: Most people understand trumps comments for what they were.
T5-UD: Most people understand trumps comments for what they were the bad of a loser.

Source: But then the incredibly stupid can continue to ignore the obvious.
Target: But then those who don't know anything can continue to not pay attention to the obvious.
BART: But then they can continue to ignore the obvious.
T5-UD: But then the incredibly bad can continue to ignore the obvious.

Source: Rt ima agree with you on that his shit is so bland and shallow
Target: I agree with you that his performance is bland and shallow
BART: I agree with you that his performance is bland and shallow
T5-UD: Rt ima agree with you on that his thing is so bland and shallow

Source: No bitch i miss being 80 you on the other hand are still a " teen
Target: No girl i miss being 80

Comparison of STA

In [65]:
# Filter the dataset to rows where BART is better than T5-Small-UD in terms of STA
df_STA_BART_vs_T5_UD = df_val_preds[(df_val_preds["BART_STA"] == 1) & (df_val_preds["T5-UD_STA"] == 0)]

# Compare outputs
compare_outputs(
    df=df_BLEURT_BART_vs_T5_UD,
    cols_to_compare=['source', 'target', 'BART_preds', 'T5-UD_preds'],
    bad_words_cols=['source', 'T5-UD_preds'],
)

Source: Who the fuck cuts out part of the state of the union?
Target: Who cuts out part of the state of tge union.
BART: Who cuts out part of the state of the union?
T5-UD: Who cuts out part of the state of the union?
'Toxic' words in ourc: ['fuck', 'the', 'part']
'Toxic' words in T5-UD: ['state', 'cuts', 'part']

Source: Nah yu seen how i act n public sober n shit
Target: Nah you’ve seen how i act in public, sober and all.
BART: Nah yu seen how i act in public sober
T5-UD: Nah you have seen how i act n public sober
'Toxic' words in ourc: ['shit', 'i', 'u']
'Toxic' words in T5-UD: ['i', 'sober', 'n']

Source: Shit chillen fresh out the shower
Target: Chillen fresh out of the shower
BART: Chillen fresh out of the shower
T5-UD: Coolen fresh out the shower
'Toxic' words in ourc: ['it', 'Sh', 'chill']
'Toxic' words in T5-UD: ['Cool', 'en', 'shower']

Source: These repubicans are like cannibalistic pirañas.
Target: These republicans are taking advantage.
BART: These republicans are not good

## T5-Small (Bidirectional)

In [36]:
model_checkpoints

{'t5_small_unidir': '../models/t5_small_unidir/checkpoint-840',
 't5_small_bidir_noshuf': '../models/t5_small_bidir_noshuf/checkpoint-2352',
 't5_small_bidir_shuf': '../models/t5_small_bidir_shuf/checkpoint-3024',
 't5_small_aug_all': '../models/t5_small_aug_all/checkpoint-2592',
 't5_small_aug_noaccept': '../models/t5_small_aug_noaccept/checkpoint-1620',
 't5_small_aug_nosim': '../models/t5_small_aug_nosim/checkpoint-2592',
 't5_small_aug_notox': '../models/t5_small_aug_notox/checkpoint-1944'}

In [37]:
# Get predictions and metrics
t5_bd_val_preds, t5_bd_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_bidir_shuf"],
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-BD", t5_bd_val_preds, use_validation=True, load_csv=True)

# Add per-row metrics to dataframe
df_val_preds = add_metric_cols_to_preds("T5-BD_preds", use_validation=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-BD", t5_bd_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [38]:
df_val_metrics

Unnamed: 0,Model,BLEURT,BLEU,STA,FLU,SEM,J
0,DELETE,-0.22743,0.529101,0.659681,0.478651,0.911821,0.647787
1,BART,0.466564,0.701595,0.917854,0.718025,0.945139,0.840093
2,T5-UD,0.206119,0.60615,0.902766,0.707668,0.925923,0.809055
3,T5-BD,0.217048,0.611682,0.912825,0.711991,0.926827,0.81523


Comparison of BLEURT vs. T5-Small-UD

In [81]:
# Filter the dataset to rows where BLEURT of T5-Small-BD exceeds T5-Small-UD by 20% and are not identical
df_BLEURT_T5_BD_vs_T5_UD = df_val_preds[(df_val_preds["T5-BD_BLEURT"] >= df_val_preds["T5-UD_BLEURT"] * 1.2) & (df_val_preds["T5-BD_BLEURT"] != df_val_preds["T5-UD_BLEURT"])]

compare_outputs(
    df=df_BLEURT_T5_BD_vs_T5_UD,
    cols_to_compare=['source','target','T5-UD_preds','T5-BD_preds'],
    bad_words_cols=None,
)


Source: Lol fuck im ever doing that formsprings, perfect exampleas why not to do it
Target: Lol im ever doing that formsprings, perfect exampleas why not to do it
T5-UD: I'm ever doing that formsprings, perfect exampleas why not to do it
T5-BD: Lol im ever doing that formsprings, perfect exampleas why not to do it

Source: Hey, respect to op for sticking around and not giving a shit
Target: Hey, respect to op for sticking around and not caring about it.
T5-UD: Hey, respect to op for sticking around and not giving a thing
T5-BD: Hey, respect to op for sticking around and not caring

Source: Im having the hardest time accepting that shit man!!! irefuse to believe this shit
Target: I'm having the hardest time accepting that man, I refuse to believe this.
T5-UD: Im having the hardest time accepting that man!!! irefuse to believe this man
T5-BD: I am having the hardest time accepting that man!!! irefuse to believe this

Source: Icantdateyou if ur hairy af - _ - fucks noo! lol
Target: I cant

Comparison of STA vs. T5-Small-UD

In [82]:
# Filter the dataset to rows where BART is better than T5-Small-UD in terms of STA
df_STA_T5_BD_vs_T5_UD = df_val_preds[(df_val_preds["T5-BD_STA"] == 1) & (df_val_preds["T5-UD_STA"] == 0)]

# Compare outputs
compare_outputs(
    df=df_STA_T5_BD_vs_T5_UD,
    cols_to_compare=['source','target','T5-UD_preds','T5-BD_preds'],
    bad_words_cols=['source','T5-UD_preds'],
    num_examples=5,
    num_layers=5,
    top_k=2,
)


Source: You already burned down to a crisp in fort mac by your own stupidity, already.
Target: You already lost in fort mac.
T5-UD: You already burned down to a crisp in fort mac by your own ignorance, already.
T5-BD: You already burned down to a crisp in fort mac by yourself, already.
'Toxic' words in Source: ['stupidity', 'burned']
'Toxic' words in T5-UD: ['You', 'burned']

Source: True conservatives, that are not white racists, will have to find a new home.
Target: Correct
T5-UD: True conservatives, that are not white, will have to find a new home.
T5-BD: True conservatives will have to find a new home.
'Toxic' words in Source: ['white', 'racists']
'Toxic' words in T5-UD: ['white', 'True']

Source: So you would love to see the commys win over a duly elected democratic president, man you are sick
Target: So you would love to see the communist win over a duly elected democratic president
T5-UD: So you would love to see the commys win over a duly elected democratic president, man you a

## T5-Small (Unidirectional, Data Augmentation)

In [23]:
model_checkpoints

{'t5_small_unidir': '../models/t5_small_unidir/checkpoint-840',
 't5_small_bidir_noshuf': '../models/t5_small_bidir_noshuf/checkpoint-2352',
 't5_small_bidir_shuf': '../models/t5_small_bidir_shuf/checkpoint-3024',
 't5_small_aug_all': '../models/t5_small_aug_all/checkpoint-2592',
 't5_small_aug_noaccept': '../models/t5_small_aug_noaccept/checkpoint-1620',
 't5_small_aug_nosim': '../models/t5_small_aug_nosim/checkpoint-2592',
 't5_small_aug_notox': '../models/t5_small_aug_notox/checkpoint-1944'}

### All Filters

In [43]:
# Get predictions and metrics
t5_ud_da_val_preds, t5_ud_da_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_aug_all"],
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-DA", t5_ud_da_val_preds, use_validation=True, load_csv=True)

# Add row metrics to dataframe
df_val_preds = add_metric_cols_to_preds("T5-UD-DA_preds", use_validation=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD-DA", t5_ud_da_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### All Filters (Minimum Loss)

In [13]:
# Get predictions and metrics
t5_ud_da_minloss_val_preds, t5_ud_da_minloss_val_metrics = get_t5_preds_metrics(
    model_checkpoint="../models/t5_small_aug_all/checkpoint-972",
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-DA-MinLoss", t5_ud_da_minloss_val_preds, use_validation=True, load_csv=True)

# Add row metrics to dataframe
df_val_preds = add_metric_cols_to_preds("T5-UD-DA-MinLoss_preds", use_validation=True)

# Add metrics to dataframe
# df_val_metrics = add_metrics_to_df("T5-Small-Aug-All-MinLoss", t5_ud_da_minloss_val_metrics, use_validation=True, load_csv=True)



Map:   0%|          | 0/10733 [00:00<?, ? examples/s]

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Map:   0%|          | 0/671 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.




### Compare Outputs of All Filter vs. All Filter (Minimum Loss)

In [56]:
# Filter to rows where T5-UD-DA-MinLoss has STA == 0 and T5-UD-DA has STA == 1
df_STA_T5_UD_DA_MinLoss_vs_T5_UD_DA = df_val_preds[(df_val_preds["T5-UD-DA-MinLoss_STA"] == 0) & (df_val_preds["T5-UD-DA_STA"] == 1)]

# Compare outputs
compare_outputs(
    df=df_STA_T5_UD_DA_MinLoss_vs_T5_UD_DA,
    cols_to_compare=['source','target','T5-UD-DA-MinLoss_preds', 'T5-UD-DA_preds'],
    bad_words_cols=['source','T5-UD-DA-MinLoss_preds'],
    random_state=42
)

Source: Bitch was so mad lmfao!! smh
Target: She was so mad!
T5-UD-DA-MinLoss: Bitch was so mad
T5-UD-DA: He was so mad.
'Toxic' words in Source: ['itch', 'B', 'was']
'Toxic' words in T5-UD-DA-MinLoss: ['itch', 'was', 'B']

Source: Pitiful choice scalia was a hypocrite!
Target: Scalia was a bad person
T5-UD-DA-MinLoss: Choice scalia was a hypocrite!
T5-UD-DA: Choice scalia was not a good person!
'Toxic' words in Source: ['sc', 'was', 'hypocr']
'Toxic' words in T5-UD-DA-MinLoss: ['was', 'sc', 'hypocr']

Source: He's a fucking street thug.
Target: He's a street gang member.
T5-UD-DA-MinLoss: He's a street thug
T5-UD-DA: He's a street guy.
'Toxic' words in Source: ['fucking', 'He', "'s"]
'Toxic' words in T5-UD-DA-MinLoss: ['He', 'thug', "'s"]

Source: Doosh doosh doosh doosh doosh fucking cock sucker queir bate ass wipe dont tell me what the fuck to write
Target: Don't tell me what to write.
T5-UD-DA-MinLoss: Doosh doosh doosh doosh cock sucker queir bate wipe dont tell me what to write
T

### No Toxicity Filter

In [106]:
# Get predictions and metrics
t5_ud_da_notox_val_preds, t5_ud_da_notox_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_aug_notox"],
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-DA-NoTOX", t5_ud_da_notox_val_preds, use_validation=True, load_csv=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD-DA-NoTOX", t5_ud_da_notox_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### No Semantic Similarity Filter

In [107]:
# Get predictions and metrics
t5_ud_da_nosem_val_preds, t5_ud_da_nosem_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_aug_nosim"],
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-DA-NoSEM", t5_ud_da_nosem_val_preds, use_validation=True, load_csv=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD-DA-NoSEM", t5_ud_da_nosem_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### No Fluency Filter

In [108]:
# Get predictions and metrics
t5_ud_da_noflu_val_preds, t5_ud_da_noflu_val_metrics = get_t5_preds_metrics(
    model_checkpoint=model_checkpoints["t5_small_aug_noaccept"],
    use_validation=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-DA-NoFLU", t5_ud_da_noflu_val_preds, use_validation=True, load_csv=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD-DA-NoFLU", t5_ud_da_noflu_val_metrics, use_validation=True, load_csv=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [27]:
df_val_metrics

Unnamed: 0,Model,BLEURT,BLEU,STA,FLU,SEM,J
0,DELETE,-0.22743,0.529101,0.659681,0.478651,0.911821,0.647787
1,BART,0.466564,0.701595,0.917854,0.718025,0.945139,0.840093
2,T5-BD,0.217048,0.611682,0.912825,0.711991,0.926827,0.81523
3,T5-UD-DA,0.204206,0.593218,0.916178,0.714662,0.925547,0.813157
4,T5-UD-DA-NoTOX,0.208252,0.598625,0.892707,0.713698,0.925555,0.804659
5,T5-UD-DA-NoSEM,0.211811,0.591599,0.917854,0.719671,0.924972,0.81439
6,T5-UD-DA-NoFLU,0.192281,0.595025,0.903604,0.709992,0.925311,0.807508
7,T5-UD,0.206119,0.60615,0.902766,0.707668,0.925923,0.809055


# T5-Small (UD, NLCD)

In [80]:
def generate_pred_nlcd(raw_sentence, model, tokenizer, tokenizer_prefix_space, num_layers=3, top_k=3):
    """
    Generate predictions for a single raw sentence using the NLCD (Negative Lexically Constrained Decoding) method.

    Args:
        raw_sentence (str): The raw sentence to generate predictions for.
        model (T5ForConditionalGeneration): The T5 model for generation.
        tokenizer (T5Tokenizer): The tokenizer for tokenizing the input.
        tokenizer_prefix_space (T5Tokenizer): The tokenizer for tokenizing the input with prefix space.
        num_layers (int): The number of layers to consider for generating bad words.
        top_k (int): The number of top bad words to consider.

    Returns:
        str: The generated prediction for the input sentence.
    """

    # Tokenize inputs
    inputs = tokenizer([raw_sentence], return_tensors="pt")

    def get_tokens_as_list(word_list):
        "Converts a sequence of words into a list of tokens"
        tokens_list = []
        for word in word_list:
            tokenized_word = tokenizer_prefix_space([word], add_special_tokens=False).input_ids[0]
            tokens_list.append(tokenized_word)
        return tokens_list

    # Get toxic words
    bad_words = get_bad_words_list([raw_sentence], tokenizer_toxicity, model_toxicity, num_layers=num_layers, top_k=top_k)[0]
    bad_words_ids = get_tokens_as_list(word_list=bad_words)

    # Generate text while enforcing the bad words to not appear
    encoded_preds = model.generate(
        inputs["input_ids"],
        bad_words_ids=bad_words_ids,
        max_length=MAX_OUTPUT_LENGTH,
        num_beams=NUM_BEAMS,
        early_stopping=True,
        eos_token_id=model.config.eos_token_id,
        bos_token_id=model.config.bos_token_id,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.pad_token_id
    )
    
    # Decode the generated text
    decoded_preds = tokenizer.batch_decode(encoded_preds, skip_special_tokens=True)
    decoded_preds = [pred.strip() for pred in decoded_preds][0]

    return decoded_preds

def generate_preds_nlcd(raw_sentences,
                        model_checkpoint,
                        num_layers=3,
                        top_k=3):
    """
    Generate predictions for a list of raw sentences using the NLCD (Negative Lexically Constrained Decoding) method.

    Args:
        raw_sentences (list): A list of raw sentences to generate predictions for.
        model (T5ForConditionalGeneration): The T5 model for generation.
        tokenizer (T5Tokenizer): The tokenizer for tokenizing the input.
        tokenizer_prefix_space (T5Tokenizer): The tokenizer for tokenizing the bad words with prefix space.
        num_layers (int): The number of layers to consider for generating bad words.
        top_k (int): The number of top bad words to consider.

    Returns:
        list: A list of generated predictions for the input sentences.
    """

    # Load model and tokenizers based on the checkpoint
    model = T5ForConditionalGeneration.from_pretrained(model_checkpoint)
    tokenizer = T5Tokenizer.from_pretrained(model_checkpoint, add_prefix_space=False)
    tokenizer_prefix_space = T5Tokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

    # Create a list to store the predictions
    preds = []

    # Iterate through each raw sentence
    for raw_sentence in tqdm(raw_sentences):
        # Generate predictions with NLCD
        decoded_preds = generate_pred_nlcd(raw_sentence, model, tokenizer, tokenizer_prefix_space, num_layers, top_k)

        # Append the predictions to the list of predictions
        preds.append(decoded_preds)

    return preds

Let's test that this works by filtering to cases where T5-UD_STA == 0

In [83]:
def nlcd_test():
    # Prefix the dataset
    prefixed_datasets = add_prefix(raw_datasets)

    # Get the raw sentences
    ## Get indices of rows where T5-UD has STA == 0
    t5_ud_sta_0_indices = df_val_preds[df_val_preds["T5-UD_STA"] == 0].index.tolist()

    ## Get predictions from T5-UD where STA == 0
    preds_no_nlcd = df_val_preds[df_val_preds["T5-UD_STA"] == 0]["T5-UD_preds"].tolist()

    ## Filter the raw sentences to just the ones where T5-UD has STA == 0
    input_sentences = [prefixed_datasets["validation"]["source"][i] for i in t5_ud_sta_0_indices]

    ## Get the references
    refs = [prefixed_datasets["validation"]["target"][i] for i in t5_ud_sta_0_indices]

    # Get toxic words for each sentence
    toxic_words_per_sentence = get_bad_words_list(input_sentences, tokenizer_toxicity, model_toxicity, num_layers=3, top_k=2)

    # Get predictions using NLCD
    preds_nlcd = generate_preds_nlcd(input_sentences, model_checkpoints["t5_small_unidir"], num_layers=3, top_k=2)

    # Count the number of cases where preds_no_nlcd contains a toxic word and calculate success rate
    num_toxic_words_no_nlcd = 0
    for i in range(len(preds_no_nlcd)):
        if any(toxic_word in preds_no_nlcd[i] for toxic_word in toxic_words_per_sentence[i]):
            num_toxic_words_no_nlcd += 1
    success_rate_no_nlcd = (len(preds_no_nlcd) - num_toxic_words_no_nlcd) / len(preds_no_nlcd)
    print(f"Count of sentences with toxic words in preds_no_nlcd: {num_toxic_words_no_nlcd}")
    print(f"Success rate of no_nlcd: {success_rate_no_nlcd}")

    # Count the number of cases where preds_nlcd contains a toxic word and calculate success rate
    num_toxic_words_nlcd = 0
    for i in range(len(preds_nlcd)):
        if any(toxic_word in preds_nlcd[i] for toxic_word in toxic_words_per_sentence[i]):
            num_toxic_words_nlcd += 1
    success_rate_nlcd = (len(preds_nlcd) - num_toxic_words_nlcd) / len(preds_nlcd)
    print(f"Count of sentences with toxic words in preds_nlcd: {num_toxic_words_nlcd}")
    print(f"Success rate of nlcd: {success_rate_nlcd}")

    # Print out the first 5 predictions
    for i in range(5):
        print()
        print(f"Source: {input_sentences[i]}")
        print(f"Target: {refs[i]}")
        print(f"Bad words: {toxic_words_per_sentence[i]}")
        print(f"Prediction without NLCD: {preds_no_nlcd[i]}")
        print(f"Prediction with NLCD: {preds_nlcd[i]}")

nlcd_test()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|██████████| 116/116 [00:38<00:00,  3.02it/s]

Count of sentences with toxic words in preds_no_nlcd: 105
Success rate of no_nlcd: 0.09482758620689655
Count of sentences with toxic words in preds_nlcd: 32
Success rate of nlcd: 0.7241379310344828

Source: to_neutral: You straighten out that chicago problem and then well tell our cops to stop killing unarmed blacks.
Target: There are problems in Chicago that need to be dealt with as well as police brutality.
Bad words: ['blacks', 'killing']
Prediction without NLCD: You straighten out that chicago problem and then well tell our cops to stop killing unarmed blacks
Prediction with NLCD: You straighten out that chicago problem and then well tell our cops to stop shooting unarmed people.

Source: to_neutral: You have cold dead hands.
Target: You have cold hands.
Bad words: ['You', 'have']
Prediction without NLCD: You have cold dead hands
Prediction with NLCD: Your hands are not good.

Source: to_neutral: This is ridiculous and looks terrible if true.
Target: It's ridiculously terrible if t




Next we work on the full validation data and perform hyperparameter optimization to find optimal number of layers and top k

In [85]:
def nlcd_hyperparameter_optimization(raw_datasets=raw_datasets,
                                     model_checkpoint=model_checkpoints["t5_small_unidir"],
                                     num_bw_layers_list=[2, 3, 4],
                                     bw_top_k_list=[1, 2, 3, 4, 5],
                                     default_bw_top_k=3,
                                     include_bleurt=False):
    """
    Perform hyperparameter optimization for NLCD by trying different values for num_bw_layers and bw_top_k.

    Args:
    - model_checkpoint: The checkpoint of the T5 model to use.
    - raw_datasets: The raw datasets dictionary object containing the source and target sentences.
    - num_bw_layers_list: A list of values to try for num_bw_layers.
    - bw_top_k_list: A list of values to try for bw_top_k.

    Returns:
    - metrics_df: A DataFrame containing the evaluation metrics for different hyperparameter combinations.
    """

    # Initialize an empty DataFrame
    metrics_df = pd.DataFrame(columns=["num_layers", "top_k", "BLEU", "STA", "FLU", "SEM", "J"])
    best_J_score = -1
    best_num_layers = None
    best_top_k = None

    # Try different values for num_bw_layers
    for num_bw_layers in num_bw_layers_list:
        print(f"Number of Attention Layers: {num_bw_layers}, Top K: {default_bw_top_k}")
        t5_ud_nlcd_preds = generate_preds_nlcd(raw_sentences=raw_datasets["validation"]["source"],
                                                model_checkpoint=model_checkpoint,
                                                num_layers=num_bw_layers,
                                                top_k=default_bw_top_k)
        t5_ud_nlcd_metrics = evaluate_metrics(raw_datasets["validation"]["target"], t5_ud_nlcd_preds, include_bleurt=include_bleurt)
        print(t5_ud_nlcd_metrics)
        
        J_score = t5_ud_nlcd_metrics["J"]

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            "num_layers": [num_bw_layers],
            "top_k": [default_bw_top_k],
            "BLEU": [t5_ud_nlcd_metrics["BLEU"]],
            "STA": [t5_ud_nlcd_metrics["STA"]],
            "FLU": [t5_ud_nlcd_metrics["FLU"]],
            "SEM": [t5_ud_nlcd_metrics["SEM"]],
            "J": [J_score]
        })], ignore_index=True)

        if J_score > best_J_score:
            best_J_score = J_score
            best_num_layers = num_bw_layers

    # Iterate through bw_top_k while keeping the best num_bw_layers
    for bw_top_k in bw_top_k_list:
        print(f"Number of Attention Layers: {best_num_layers}, Top K: {bw_top_k}")
        t5_ud_nlcd_preds = generate_preds_nlcd(raw_sentences=raw_datasets["validation"]["source"],
                                                model_checkpoint=model_checkpoint,
                                                num_layers=best_num_layers,
                                                top_k=bw_top_k)
        t5_ud_nlcd_metrics = evaluate_metrics(raw_datasets["validation"]["target"], t5_ud_nlcd_preds, include_bleurt=include_bleurt)
        print(t5_ud_nlcd_metrics)
        
        J_score = t5_ud_nlcd_metrics["J"]

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            "num_layers": [best_num_layers],
            "top_k": [bw_top_k],
            "BLEU": [t5_ud_nlcd_metrics["BLEU"]],
            "STA": [t5_ud_nlcd_metrics["STA"]],
            "FLU": [t5_ud_nlcd_metrics["FLU"]],
            "SEM": [t5_ud_nlcd_metrics["SEM"]],
            "J": [t5_ud_nlcd_metrics["J"]]
        })], ignore_index=True)

        if J_score > best_J_score:
            best_J_score = J_score
            best_top_k = bw_top_k

    return metrics_df, best_num_layers, best_top_k

nlcd_metrics_df, best_num_layers, best_top_k = nlcd_hyperparameter_optimization()
print(nlcd_metrics_df)

Number of Attention Layers: 2, Top K: 3


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
 16%|█▌        | 193/1193 [01:04<07:07,  2.34it/s]

## T5-Small (Unidirectional, Negative Lexically Constrained Decoding)

Get predictions

Here we perform a greedy search to identify the optimal number of attention layers to average over and the top k tokens in identifying bad words

In [14]:
def nlcd_hyperparameter_optimization(model_checkpoint=model_checkpoints["t5_small_unidir"],
                                     raw_datasets=raw_datasets,
                                     num_bw_layers_list=[2, 3, 4],
                                     bw_top_k_list=[1, 2, 3, 4, 5],
                                     default_bw_top_k=3,
                                     include_bleurt=False):
    """
    Perform hyperparameter optimization for NLCD by trying different values for num_bw_layers and bw_top_k.

    Args:
    - model_checkpoint: The checkpoint of the T5 model to use.
    - raw_datasets: The raw datasets dictionary object containing the source and target sentences.
    - num_bw_layers_list: A list of values to try for num_bw_layers.
    - bw_top_k_list: A list of values to try for bw_top_k.

    Returns:
    - metrics_df: A DataFrame containing the evaluation metrics for different hyperparameter combinations.
    """

    # Initialize an empty DataFrame
    metrics_df = pd.DataFrame(columns=["num_layers", "top_k", "BLEU", "STA", "FLU", "SEM", "J"])
    best_J_score = -1
    best_num_layers = None
    best_top_k = None

    # Try different values for num_bw_layers
    for num_bw_layers in num_bw_layers_list:
        print(f"Number of Attention Layers: {num_bw_layers}, Top K: {default_bw_top_k}")
        t5_ud_nlcd_preds = get_preds_nlcd(use_validation=True,
                                          model_checkpoint=model_checkpoint,
                                          num_bw_layers=num_bw_layers,
                                          bw_top_k=default_bw_top_k)
        t5_ud_nlcd_metrics = evaluate_metrics(raw_datasets["validation"]["target"], t5_ud_nlcd_preds, include_bleurt=include_bleurt)
        print(t5_ud_nlcd_metrics)
        
        J_score = t5_ud_nlcd_metrics["J"]

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            "num_layers": [num_bw_layers],
            "top_k": [default_bw_top_k],
            "BLEU": [t5_ud_nlcd_metrics["BLEU"]],
            "STA": [t5_ud_nlcd_metrics["STA"]],
            "FLU": [t5_ud_nlcd_metrics["FLU"]],
            "SEM": [t5_ud_nlcd_metrics["SEM"]],
            "J": [J_score]
        })], ignore_index=True)

        if J_score > best_J_score:
            best_J_score = J_score
            best_num_layers = num_bw_layers

    # Iterate through bw_top_k while keeping the best num_bw_layers
    for bw_top_k in bw_top_k_list:
        print(f"Number of Attention Layers: {best_num_layers}, Top K: {bw_top_k}")
        t5_ud_nlcd_preds = get_preds_nlcd(use_validation=True,
                                          model_checkpoint=model_checkpoint,
                                          num_bw_layers=best_num_layers,
                                          bw_top_k=bw_top_k)
        t5_ud_nlcd_metrics = evaluate_metrics(raw_datasets["validation"]["target"], t5_ud_nlcd_preds, include_bleurt=include_bleurt)
        print(t5_ud_nlcd_metrics)
        
        J_score = t5_ud_nlcd_metrics["J"]

        metrics_df = pd.concat([metrics_df, pd.DataFrame({
            "num_layers": [best_num_layers],
            "top_k": [bw_top_k],
            "BLEU": [t5_ud_nlcd_metrics["BLEU"]],
            "STA": [t5_ud_nlcd_metrics["STA"]],
            "FLU": [t5_ud_nlcd_metrics["FLU"]],
            "SEM": [t5_ud_nlcd_metrics["SEM"]],
            "J": [t5_ud_nlcd_metrics["J"]]
        })], ignore_index=True)

        if J_score > best_J_score:
            best_J_score = J_score
            best_top_k = bw_top_k

    return metrics_df, best_num_layers, best_top_k

nlcd_metrics_df, best_num_layers, best_top_k = nlcd_hyperparameter_optimization()
print(nlcd_metrics_df)

Number of Attention Layers: 2, Top K: 3


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


KeyboardInterrupt: 

Based on the above the optimal number of attention layers = 1, and top-k = 3

In [None]:
# Get predictions based on optimal hyperparameters
t5_ud_nlcd_val_preds = generate_preds_nlcd(raw_sentences=raw_datasets["validation"]["source"],
                                             model_checkpoint=model_checkpoints["t5_small_unidir"],
                                             num_layers=best_num_layers,
                                             top_k=best_top_k)

t5_ud_nlcd_val_metrics = evaluate_metrics(refs=raw_datasets["validation"]["target"],
                                          preds=t5_ud_nlcd_val_preds,
                                          include_bleurt=True)

# Add predictions to dataframe
df_val_preds = add_preds_to_df("T5-UD-NLCD", t5_ud_nlcd_val_preds, use_validation=True, load_csv=True, replace_existing=True)

# Calculate per-row metrics
df_val_preds = add_metric_cols_to_preds("T5-UD-NLCD_preds", use_validation=True, replace_existing=True)

# Add metrics to dataframe
df_val_metrics = add_metrics_to_df("T5-UD-NLCD", t5_ud_nlcd_val_metrics, use_validation=True, load_csv=True, replace_existing=True)

In [115]:
# Filter to rows where T5-UD-NLCD has STA == 1 and T5-UD-DA has STA == 0
df_STA_T5_UD_NLCD_vs_T5_UD_DA = df_val_preds[(df_val_preds["T5-UD-NLCD_STA"] == 1) & (df_val_preds["T5-UD-DA_STA"] == 0)]

# Compare outputs
compare_outputs(
    df=df_STA_T5_UD_NLCD_vs_T5_UD_DA,
    cols_to_compare=['source','target','T5-UD-NLCD_preds', 'T5-UD-DA_preds'],
    bad_words_cols=['source'],
    random_state=1
)

Source: Should have thought out a few more moves ahead of their arrogant bully tactics.
Target: Should have thought out a few more moves ahead of their bully tactics.
T5-UD-NLCD: Should have thought out a few more moves ahead of their tactics.
T5-UD-DA: Should have thought a few more moves ahead of their arrogant tactics.
'Toxic' words in Source: ['their', 'arrogant', 'bully']

Source: Your post is no less trollish than old soul.
Target: Your post is not good enough than vintage soul.
T5-UD-NLCD: Your post is no less bad
T5-UD-DA: Your post is no less bad than old soul.
'Toxic' words in Source: ['troll', 'Your', 'post']

Source: Lola is a fucking beast ok
Target: Lola is a beast ok
T5-UD-NLCD: Lola is ok
T5-UD-DA: Lola is a beast
'Toxic' words in Source: ['fucking', 'ola', 'is']

Source: Pence is a despicable toady who fails every time to stand up to trump.
Target: Pence can’t stand up to Trump
T5-UD-NLCD: Pence fails every time to stand up to Trump.
T5-UD-DA: Pence is a naive person w

In [156]:
model = T5ForConditionalGeneration.from_pretrained(model_checkpoints["t5_small_unidir"]).to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoints["t5_small_unidir"], add_prefix_space=True)

# Prefix the dataset
prefixed_datasets = add_prefix(raw_datasets)

# Get the raw sentences
raw_sentences = prefixed_datasets["validation"]["source"][:10]

# Tokenize the raw sentences
input_ids = tokenizer(raw_sentences, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LENGTH, add_special_tokens=False)['input_ids'].to(DEVICE)

# Get list of bad words as identified using attention from toxicity classifier
bad_words_list = get_bad_words_list(raw_sentences, num_layers=3, top_k=3)

# Convert each list of bad words to a string
bad_words_str_list = [" ".join(bad_words) for bad_words in bad_words_list]

# Get tokens of words that should not be generated
bad_word_ids = [tokenizer(bad_words, add_special_tokens=False).input_ids for bad_words in bad_words_str_list]

# Decode the bad word ids
bad_word_ids_decoded = [tokenizer.decode(bad_word_id) for bad_word_id in bad_word_ids]

# Generate predictions
encoded_preds = model.generate(
    inputs=input_ids,
    max_length=MAX_OUTPUT_LENGTH,
    num_beams=NUM_BEAMS,
    early_stopping=True,
    eos_token_id=model.config.eos_token_id,
    bos_token_id=model.config.bos_token_id,
    pad_token_id=model.config.pad_token_id,
    decoder_start_token_id=model.config.pad_token_id,
    bad_words_ids=bad_word_ids
)

# Decode the predictions
decoded_preds = tokenizer.batch_decode(encoded_preds, skip_special_tokens=True)

# Post-process the predictions
decoded_preds = [pred.strip() for pred in decoded_preds]

# Print bad words list and bad word ids
for i in range(len(raw_sentences)):
    print(f"Source: {raw_sentences[i]}")
    print(f"Bad words: {bad_words_list[i]}")
    print(f"Bad word ids: {bad_word_ids[i]}")
    print(f"Bad word ids decoded: {bad_word_ids_decoded[i]}")
    print(f"Prediction: {decoded_preds[i]}")
    print()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Source: to_neutral: Rt my bitch got fans give em an autograph smh i love it tho
Bad words: ['bitch', 'tho', 't']
Bad word ids: [720, 524, 3, 189, 32, 3, 17]
Bad word ids decoded: bitch tho t
Prediction: Rt my girl got fans give em an autograph smh i love it thoo thoo i love it tho

Source: to_neutral: " redditor for 26 minutes " fuck off
Bad words: ['fuck', '"', '"']
Bad word ids: [3, 89, 4636, 96, 96]
Bad word ids decoded: fuck " "
Prediction: Redditor for 26 minutes " fuck off off"

Source: to_neutral: Yup then i'll unfollow you, just to be a bitch
Bad words: ['bitch', 'you', 'a']
Bad word ids: [720, 524, 25, 3, 9]
Bad word ids decoded: bitch you a
Prediction: Yup then i'll unfollow you, just to be a bitchch.

Source: to_neutral: And don't say he can't fucking do it.
Bad words: ['fucking', 'don', 'it']
Bad word ids: [3, 89, 4636, 53, 278, 34]
Bad word ids decoded: fucking don it
Prediction: And don't say he can't do it. And don't say he can't do it.

Source: to_neutral: In the words 

In [142]:
# Debug

# Load model and tokenizer
model = T5ForConditionalGeneration.from_pretrained(model_checkpoints["t5_small_unidir"]).to(DEVICE)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoints["t5_small_unidir"], add_prefix_space=True)

# Prefix the dataset
prefixed_datasets = add_prefix(raw_datasets)

# Get the raw sentences
raw_sentences = prefixed_datasets["validation"]["source"][:50]


# Tokenize the raw sentences
input_ids = tokenizer(raw_sentences, return_tensors="pt", padding=True, truncation=True, max_length=MAX_INPUT_LENGTH)['input_ids'].to(DEVICE)

# Get toxic words for each sentence
bad_words_list = get_bad_words_list(raw_sentences, num_layers=3, top_k=3)

# # Define bad word ids
# bad_word_ids = get_bad_word_ids(raw_sentences, num_layers=3, top_k=3)

# # Generate predictions for the batch
# encoded_preds = model.generate(
#     inputs=input_ids,
#     max_length=MAX_OUTPUT_LENGTH,
#     num_beams=5,
#     early_stopping=True,
#     eos_token_id=model.config.eos_token_id,
#     bos_token_id=model.config.bos_token_id,
#     pad_token_id=model.config.pad_token_id,
#     decoder_start_token_id=model.config.pad_token_id,
#     bad_words_ids=bad_word_ids
# )

# # Decode the predictions for the batch
# decoded_preds = tokenizer.batch_decode(encoded_preds, skip_special_tokens=True)

# # Post-process the predictions for the batch
# decoded_preds = [pred.strip() for pred in decoded_preds]

# # Consolidate variables into a dataframe
# df = pd.DataFrame({
#     "source": raw_sentences,
#     "bad_words": bad_words_list,
#     "decoded_preds": decoded_preds
# })

# # Filter to rows where the decoded_preds contain at least one bad word
# df_bad_words = df[df["decoded_preds"].apply(lambda x: any([bad_word in x for bad_word in bad_words]))]
# df_bad_words

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
