In [1]:
from datasets import DatasetDict, Dataset
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    MarianMTModel,
    MarianTokenizer,
)
import evaluate
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from functools import partial
import pandas as pd
from tqdm import tqdm
import gc
import GPUtil

2023-11-09 13:18:57.908478: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-09 13:18:57.908536: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-09 13:18:57.908576: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Path of the raw dataset dictionary (read below with DatasetDict.load_from_disk)
RAW_DATASET_PATH = "../data/processed/raw_dataset.pkl"

# Load the tokenizer/model pairs of the toxicity and acceptability classifiers and move the models to DEVICE
tokenizer_toxicity = RobertaTokenizer.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier")
model_toxicity = RobertaForSequenceClassification.from_pretrained("SkolkovoInstitute/roberta_toxicity_classifier").to(DEVICE)
tokenizer_acceptability = AutoTokenizer.from_pretrained("iproskurina/tda-bert-en-cola")
model_acceptability = AutoModelForSequenceClassification.from_pretrained("iproskurina/tda-bert-en-cola").to(DEVICE)

# Load dataset
raw_datasets = DatasetDict.load_from_disk(RAW_DATASET_PATH)

# Default batch size for back_translate
BATCH_SIZE = 32

# Default random seed for the sampling done in combine_data
RANDOM_SEED = 42

# Thresholds for acceptability (source / target) and semantic similarity; default cut-offs of calc_filters
# NOTE(review): presumably the means printed by the commented-out EDA cell further down — confirm
ACCEPTABILITY_THRESHOLD_SOURCE = 0.6541633009910583
ACCEPTABILITY_THRESHOLD_TARGET = 0.7226778864860535
SIMILARITY_THRESHOLD = 0.9040371657728449

Some weights of the model checkpoint at SkolkovoInstitute/roberta_toxicity_classifier were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_gpu_memory():
    """
    Prints the total, free and used memory of the first GPU reported by GPUtil.

    When GPUtil reports no GPU (for example on a CPU-only machine, which the
    DEVICE fallback explicitly allows) a short notice is printed instead of
    raising an IndexError.

    Returns:
        None. The information is only printed.
    """
    gpus = GPUtil.getGPUs()
    if not gpus:
        print("No GPU detected.")
        return

    # Only the first GPU is reported
    gpu = gpus[0]
    print(f"Total GPU memory: {gpu.memoryTotal}MB")
    print(f"Free GPU memory: {gpu.memoryFree}MB")
    print(f"Used GPU memory: {gpu.memoryUsed}MB")

def cleanup():
    """
    Cleans up the GPU memory.

    Runs Python's garbage collector and then asks PyTorch to empty its CUDA cache.
    Returns nothing.
    """
    # NOTE(review): collecting first presumably lets unreachable tensors be released before the cache is emptied — confirm
    gc.collect()
    torch.cuda.empty_cache()

In [4]:
# Print the current GPU memory usage
get_gpu_memory()

Total GPU memory: 23034.0MB
Free GPU memory: 20429.0MB
Used GPU memory: 2088.0MB


# Evaluation Functions

Not batched

In [26]:
model_bertscore = None

def calc_bert_score_nobatch(
    refs, preds, model_type="microsoft/deberta-large-mnli", output_mean=True
):
    """
    Computes BERTScore precision, recall and F1 for all sentence pairs in a single call.
    Note: https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit#gid=0 lists the best performing models

    Args:
        refs (list): List of reference sentences.
        preds (list): List of predicted sentences.
        model_type (str): Name of the model BERTScore should use.
        output_mean (bool): If True, each metric is averaged over all pairs;
            otherwise one value per pair is returned.

    Returns:
        Tuple (precision, recall, f1). Each element is a numpy array when
        output_mean is False and the mean of that array when it is True.
    """
    global model_bertscore

    # The metric object is loaded on first use and cached in the module-level variable
    if model_bertscore is None:
        model_bertscore = evaluate.load("bertscore")

    scores = model_bertscore.compute(predictions=preds, references=refs, model_type=model_type)
    metrics = tuple(np.array(scores[key]) for key in ("precision", "recall", "f1"))

    if not output_mean:
        return metrics

    return tuple(metric.mean() for metric in metrics)

def calc_tox_acceptability_nobatch(
    data,
    tokenizer,
    model,
    output_score=True,
    output_mean=True):
    """
    Scores all sentences with a sequence classifier in one forward pass.

    The same function serves the toxicity and the acceptability classifier;
    which one is computed depends on the tokenizer/model passed in.

    Args:
        data = list of strings to be evaluated
        tokenizer = tokenizer for the model
        model = model to be used for evaluation
        output_score = if True, output the softmax probability of class index 1;
            if False, output the argmax class label
        output_mean = whether to output the mean of the scores or the scores for each sentence

    Returns:
        numpy array with one value per sentence, or its mean when output_mean is True.
    """
    # truncation=True cuts inputs that exceed the tokenizer's maximum length instead of
    # handing over-long sequences to the model (translate() tokenizes the same way)
    inputs = tokenizer(data, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
    with torch.no_grad():
        logits = model(**inputs)["logits"]
        if output_score:
            result = torch.nn.functional.softmax(logits, dim=1)[:, 1]
        else:
            result = logits.argmax(1)
        result = result.cpu().numpy()

    if output_mean:
        result = result.mean()

    return result

Batched form

In [24]:
# Module-level cache for the BERTScore metric object; calc_bert_score loads it on first use.
# Re-running this cell resets the cache to None, so the metric is loaded again on the next call.
model_bertscore = None

def calc_bert_score(
    refs, preds, model_type="microsoft/deberta-large-mnli", batch_size=32, output_mean=True
):
    """
    Computes BERTScore precision, recall and F1, feeding the sentence pairs to the metric in batches.

    Args:
        refs (list): List of reference sentences.
        preds (list): List of predicted sentences.
        model_type (str): Name of the model BERTScore should use.
        batch_size (int): Number of sentence pairs passed to the metric per call.
        output_mean (bool): Whether to average each metric over all pairs.

    Returns:
        (precision, recall, f1) as three lists with one value per pair if output_mean=False.
        (precision, recall, f1) as three means if output_mean=True.
    """
    global model_bertscore

    # The metric object is loaded on first use and cached in the module-level variable
    if model_bertscore is None:
        model_bertscore = evaluate.load("bertscore")

    collected = {"precision": [], "recall": [], "f1": []}
    for start in range(0, len(refs), batch_size):
        stop = start + batch_size
        scores = model_bertscore.compute(
            predictions=preds[start:stop],
            references=refs[start:stop],
            model_type=model_type,
        )
        for key, bucket in collected.items():
            bucket.extend(scores[key])

    all_precision = collected["precision"]
    all_recall = collected["recall"]
    all_f1 = collected["f1"]

    if not output_mean:
        return all_precision, all_recall, all_f1

    return np.mean(all_precision), np.mean(all_recall), np.mean(all_f1)

def calc_tox_acceptability(
    data,
    tokenizer,
    model,
    batch_size=32,
    output_score=True,
    output_mean=True):
    """
    Scores sentences with a sequence classifier, processing them in batches.

    The same function serves the toxicity and the acceptability classifier;
    which one is computed depends on the tokenizer/model passed in.

    Args:
        data: list of strings to be evaluated
        tokenizer: tokenizer for the model
        model: model to be used for evaluation
        batch_size: size of the batch for processing
        output_score: if True, output the softmax probability of class index 1;
            if False, output the argmax class label
        output_mean: whether to output the mean of the scores or the scores for each sentence

    Returns:
        numpy array with one value per sentence, or its mean when output_mean is True.
        For empty input: an empty array, or NaN when output_mean is True.
    """
    # Nothing to score: without this guard np.concatenate([]) below raises ValueError
    if len(data) == 0:
        if output_mean:
            return np.nan
        return np.array([], dtype=np.float32 if output_score else np.int64)

    all_results = []
    for i in range(0, len(data), batch_size):
        batch_data = data[i:i+batch_size]
        # truncation=True cuts inputs that exceed the tokenizer's maximum length instead of
        # handing over-long sequences to the model (translate() tokenizes the same way)
        inputs = tokenizer(batch_data, return_tensors="pt", padding=True, truncation=True).to(DEVICE)
        with torch.no_grad():
            logits = model(**inputs)["logits"]
            if output_score:
                batch_results = torch.nn.functional.softmax(logits, dim=1)[:, 1]
            else:
                batch_results = logits.argmax(1)
            all_results.append(batch_results.cpu().numpy())

        # Drop the per-batch tensors before the next batch is tokenized
        del inputs
        del logits
        torch.cuda.empty_cache()

    scores = np.concatenate(all_results)
    if not output_mean:
        return scores

    return np.mean(scores)


In [29]:
# Example reference sentences
refs = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence has transformed many technological domains.",
    "London is known for its rich history and cultural heritage."
]

# Example predicted sentences
preds = [
    "A fast dark-colored fox leaps above the inactive canine.",
    "AI has revolutionized multiple aspects of technology.",
    "London is renowned for its historical and cultural legacy."
]

# The batched helpers must reproduce the non-batched ones. Floating-point results are
# compared with a tolerance rather than ==, so the check does not depend on bit-identical
# arithmetic between the two code paths.
np.testing.assert_allclose(
    calc_bert_score(refs=refs, preds=preds, model_type="microsoft/deberta-large-mnli", output_mean=True),
    calc_bert_score_nobatch(refs=refs, preds=preds, model_type="microsoft/deberta-large-mnli", output_mean=True),
    rtol=1e-5,
)
np.testing.assert_allclose(
    calc_tox_acceptability(data=refs, tokenizer=tokenizer_toxicity, model=model_toxicity, output_score=True, output_mean=True),
    calc_tox_acceptability_nobatch(data=refs, tokenizer=tokenizer_toxicity, model=model_toxicity, output_score=True, output_mean=True),
    rtol=1e-5,
)

# EDA to set thresholds for filters

In [5]:
# Print the current GPU memory usage
get_gpu_memory()

Total GPU memory: 23034.0MB
Free GPU memory: 20429.0MB
Used GPU memory: 2088.0MB


In [6]:
# # Get acceptability scores for raw source and target sentences
# raw_src_acceptability = calc_tox_acceptability(raw_datasets["train"]["source"], tokenizer_acceptability, model_acceptability, output_score=True, output_mean=False)
# raw_tgt_acceptability = calc_tox_acceptability(raw_datasets["train"]["target"], tokenizer_acceptability, model_acceptability, output_score=True, output_mean=False)

# # Calculate semantic similarity scores for raw source and target sentences
# raw_similarity = calc_bert_score(raw_datasets["train"]["source"], raw_datasets["train"]["target"], model_type="distilbert-base-uncased", output_mean=False)[2]

# # Plot in density plots
# fig, axs = plt.subplots(1, 2, figsize=(15, 5))

# axs[0].set_title("Acceptability scores for raw source and target sentences")
# axs[0].set_xlabel("Acceptability score")
# axs[0].set_ylabel("Density")
# axs[0].hist(raw_src_acceptability, bins=20, alpha=0.5, label="Source")
# axs[0].hist(raw_tgt_acceptability, bins=20, alpha=0.5, label="Target")
# axs[0].legend()

# axs[1].set_title("Semantic similarity scores for raw source and target sentences")
# axs[1].set_xlabel("Semantic similarity score")
# axs[1].set_ylabel("Density")
# axs[1].hist(raw_similarity, bins=20, alpha=0.5)

# plt.show()

# # Calculate acceptability thresholds
# ACCEPTABILITY_THRESHOLD_SOURCE = np.mean(raw_src_acceptability)
# ACCEPTABILITY_THRESHOLD_TARGET = np.mean(raw_tgt_acceptability)

# print(f"acceptability_threshold_source: {ACCEPTABILITY_THRESHOLD_SOURCE}")
# print(f"acceptability_threshold_target: {ACCEPTABILITY_THRESHOLD_TARGET}")

# # Calculate semantic similarity threshold
# SIMILARITY_THRESHOLD = np.mean(raw_similarity)
# print(f"similarity_threshold: {SIMILARITY_THRESHOLD}")

# Back-Translation Implementation

## 1. Make back-translated dataframe

In [57]:
# Helper function to download data for a language
def download(model_name):
    """
    Loads the Marian tokenizer and translation model for a checkpoint name.

    Args:
        model_name (str): checkpoint identifier passed to from_pretrained

    Returns:
        tuple (tokenizer, model)
    """
    return (
        MarianTokenizer.from_pretrained(model_name),
        MarianMTModel.from_pretrained(model_name),
    )

# download model for English -> Romance (back_translate uses it for the outbound translation)
tmp_lang_tokenizer, tmp_lang_model = download('Helsinki-NLP/opus-mt-en-ROMANCE')

# download model for Romance -> English (back_translate uses it to translate back)
src_lang_tokenizer, src_lang_model = download('Helsinki-NLP/opus-mt-ROMANCE-en')

# Move models to DEVICE (GPU when available, otherwise CPU)
src_lang_model.to(DEVICE)
tmp_lang_model.to(DEVICE)

def translate(batch_texts, model, tokenizer, language):
    """
    Translates a batch of texts into a target language.

    Unless the target language is "en", every text is prefixed with a
    ">>{language}<<" tag before tokenization. Generation requests two
    sequences per input (num_return_sequences=2).

    Args:
        batch_texts (list): list of texts to be translated
        model (model): MarianMTModel
        tokenizer (tokenizer): MarianTokenizer
        language (str): target language

    Returns:
        list of decoded translations, special tokens removed
    """
    if language == "en":
        prepared = [text for text in batch_texts]
    else:
        prefix = f">>{language}<<"
        prepared = [prefix + text for text in batch_texts]

    encoded = tokenizer(prepared, return_tensors="pt", padding=True, truncation=True).to(DEVICE)

    with torch.no_grad():
        generated = model.generate(**encoded, num_return_sequences=2)

    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)

    # Drop the tensors of this batch and release memory before the next call
    del encoded, generated
    cleanup()

    return decoded

def back_translate(texts, language_src, language_dst, batch_size=BATCH_SIZE):
    """
    Back-translates texts batch by batch through every pivot language.

    Each batch is translated into a pivot language with the module-level
    tmp_lang_model and then back into language_src with src_lang_model.

    Args:
        texts (list): list of texts to be back translated
        language_src (str): source language
        language_dst (list): list of target (pivot) languages
        batch_size (int): batch size

    Returns:
        list of back translated texts, ordered batch by batch and, within a
        batch, pivot language by pivot language
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)

    for start in tqdm(batch_starts):
        chunk = texts[start:start + batch_size]
        for pivot in language_dst:
            outbound = translate(chunk, tmp_lang_model, tmp_lang_tokenizer, pivot)
            collected += translate(outbound, src_lang_model, src_lang_tokenizer, language_src)

    return collected

def make_back_translate_df(raw_datasets,
                           language_src = "en",
                           language_dst = ['fr', 'es', 'it']):
    """
    Builds a dataframe of back-translated training sentences.

    Args:
        raw_datasets: dataset dictionary whose 'train' split has 'source' and 'target' columns
        language_src (str): language of the sentences
        language_dst (list): pivot languages used for the back translation

    Returns:
        dataframe with columns 'source_bt' and 'target_bt', without rows whose
        back translation is duplicated or already present in the training split
    """
    # Back translate both sides; the two result lists are paired row by row
    candidates = pd.DataFrame({
        'source_bt': back_translate(raw_datasets['train']['source'], language_src, language_dst),
        'target_bt': back_translate(raw_datasets['train']['target'], language_src, language_dst),
    })

    # Keep only the first occurrence of every back-translated source, then of every target
    candidates = (
        candidates
        .drop_duplicates(subset=['source_bt'])
        .drop_duplicates(subset=['target_bt'])
    )

    # Discard rows whose back translation already exists in the original training data
    is_new_source = ~candidates['source_bt'].isin(raw_datasets['train']['source'])
    candidates = candidates[is_new_source]
    is_new_target = ~candidates['target_bt'].isin(raw_datasets['train']['target'])

    return candidates[is_new_target]

# Create a dataframe for back translated sentences
# (raw_datasets is a required argument of make_back_translate_df)
df_backtranslated = make_back_translate_df(raw_datasets)
print(f"Length of df_backtranslated: {len(df_backtranslated)}")

# Delete models and clear GPU memory
# (the translation models are unavailable after this point; re-run the download lines before translating again)
del tmp_lang_model
del tmp_lang_tokenizer
del src_lang_model
del src_lang_tokenizer
cleanup()

100%|██████████| 16/16 [02:59<00:00, 11.20s/it]
100%|██████████| 16/16 [01:53<00:00,  7.07s/it]


In [78]:
# Export df_backtranslated to pickle (read back with pd.read_pickle before the filters are computed)
df_backtranslated.to_pickle("../data/processed/df_backtranslated.pkl")

## 2. Add filters to the back-translated dataframe

In [29]:
def calc_filters(df, acceptability_threshold_source=ACCEPTABILITY_THRESHOLD_SOURCE, acceptability_threshold_target=ACCEPTABILITY_THRESHOLD_TARGET, similarity_threshold=SIMILARITY_THRESHOLD):
    """
    Adds the boolean filter columns 'f_toxicity', 'f_acceptability' and 'f_similarity' to a copy of df.

    Args:
        df (dataframe): dataframe with 'source_bt' and 'target_bt' columns; it is not modified
        acceptability_threshold_source (float): minimum acceptability score of the source sentence
        acceptability_threshold_target (float): minimum acceptability score of the target sentence
        similarity_threshold (float): minimum BERTScore F1 between source and target

    Returns:
        copy of df with the three filter columns added
    """
    # Work on a copy so the caller's dataframe does not gain columns as a side effect
    df = df.copy()

    # Convert source_bt and target_bt to lists
    source_bt = df["source_bt"].tolist()
    target_bt = df["target_bt"].tolist()

    # Toxicity labels (argmax class) for the candidate sentence pairs
    source_toxicity = np.asarray(calc_tox_acceptability(source_bt, tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False))
    target_toxicity = np.asarray(calc_tox_acceptability(target_bt, tokenizer_toxicity, model_toxicity, output_score=False, output_mean=False))

    # Acceptability scores for the candidate sentence pairs
    source_acceptability = np.asarray(calc_tox_acceptability(source_bt, tokenizer_acceptability, model_acceptability, output_score=True, output_mean=False))
    target_acceptability = np.asarray(calc_tox_acceptability(target_bt, tokenizer_acceptability, model_acceptability, output_score=True, output_mean=False))

    # Similarity scores for the candidate sentence pairs - element [2] of the result is the F1 score
    similarity = np.asarray(calc_bert_score(source_bt, target_bt, model_type="distilbert-base-uncased", output_mean=False)[2])

    # Create filters for the candidate sentence pairs
    ## Filter 1: Toxicity - source labelled 1 and target labelled 0
    df['f_toxicity'] = (source_toxicity == 1) & (target_toxicity == 0)

    ## Filter 2: Acceptability - both sides reach their threshold
    df['f_acceptability'] = (source_acceptability >= acceptability_threshold_source) & (target_acceptability >= acceptability_threshold_target)

    ## Filter 3: Similarity
    df['f_similarity'] = (similarity >= similarity_threshold)

    return df

# Reload the back-translated pairs from the pickle, compute the filter columns and save the result
df_backtranslated = pd.read_pickle("../data/processed/df_backtranslated.pkl")
df_backtranslated_with_filters = calc_filters(df_backtranslated)
df_backtranslated_with_filters.to_pickle("../data/processed/df_backtranslated_with_filters.pkl")
print(f"Length of df_backtranslated: {len(df_backtranslated)}")

# Delete models and clear GPU memory
# (the classifiers are unavailable after this point; reload them before calling calc_filters again)
del tokenizer_toxicity
del model_toxicity
del tokenizer_acceptability
del model_acceptability
cleanup()

Length of df_backtranslated: 3150


## 3. Create dataframes with filters applied

In [32]:
def create_filtered_df(df, f_toxicity, f_acceptability, f_similarity, raw_train_dataset=None):
    """
    Keeps the rows that pass the requested filters and returns them as a 'source'/'target' dataframe.

    Args:
        df (dataframe): dataframe with 'source_bt', 'target_bt' and the boolean columns
            'f_toxicity', 'f_acceptability', 'f_similarity'; it is not modified
        f_toxicity (bool): whether to filter based on toxicity
        f_acceptability (bool): whether to filter based on acceptability
        f_similarity (bool): whether to filter based on similarity
        raw_train_dataset: unused; accepted only so existing calls that pass it keep working

    Returns:
        filtered dataframe without the filter columns, with 'source_bt'/'target_bt'
        renamed to 'source'/'target' and a fresh 0-based index
    """
    # Create a copy of the dataframe
    df = df.copy()

    # Apply the filters that were requested, one after the other
    requested = {
        'f_toxicity': f_toxicity,
        'f_acceptability': f_acceptability,
        'f_similarity': f_similarity,
    }
    for column, enabled in requested.items():
        if enabled:
            df = df[df[column].eq(True)]

    # Drop filter columns
    df = df.drop(columns=list(requested))

    # Rename columns
    df = df.rename(columns={'source_bt': 'source', 'target_bt': 'target'})

    # Reset index
    df = df.reset_index(drop=True)

    return df

# Build four variants of the back-translated data: all filters on, then each single filter switched off in turn

# Create a dataframe applying all filters
df_all_filters = create_filtered_df(df=df_backtranslated_with_filters, f_toxicity=True, f_acceptability=True, f_similarity=True)
print(f"Number of rows with all filters: {len(df_all_filters)}")

# Create a dataframe without toxicity filter
df_no_toxicity_filter = create_filtered_df(df=df_backtranslated_with_filters, f_toxicity=False, f_acceptability=True, f_similarity=True)
print("Number of rows with no toxicity filter: ", len(df_no_toxicity_filter))

# Create a dataframe without acceptability filter
df_no_acceptability_filter = create_filtered_df(df=df_backtranslated_with_filters, f_toxicity=True, f_acceptability=False, f_similarity=True)
print("Number of rows with no acceptability filter: ", len(df_no_acceptability_filter))

# Create a dataframe without similarity filter
df_no_similarity_filter = create_filtered_df(df=df_backtranslated_with_filters, f_toxicity=True, f_acceptability=True, f_similarity=False)
print("Number of rows with no similarity filter: ", len(df_no_similarity_filter))

Number of rows with all filters: 636
Number of rows with no toxicity filter:  1002
Number of rows with no acceptability filter:  911
Number of rows with no similarity filter:  1606


In [33]:
# Save the length of df_all_filters to use as the default number of augmented rows sampled by combine_data
MAX_SAMPLE_SIZE = len(df_all_filters)

## 4. Add original data to augmented data and create dataset dictionaries

In [43]:
def combine_data(df_aug, raw_datasets, sample_size=MAX_SAMPLE_SIZE, random_state=RANDOM_SEED):
    """
    Samples augmented rows, puts the original training data on top of them and returns a dataset dictionary.

    Args:
        df_aug (dataframe): augmented dataframe with 'source' and 'target' columns; it is not modified
        raw_datasets (dataset dictionary): original dataset dictionary with 'train', 'validation' and 'test' splits
        sample_size (int): number of augmented rows to sample; capped at the number of rows available in df_aug
        random_state (int): seed for the sampling

    Returns:
        DatasetDict whose 'train' split holds the original plus the sampled augmented rows
        (duplicates on 'source' or 'target' removed); 'validation' and 'test' are taken
        unchanged from raw_datasets
    """
    # Create a copy of the dataframe
    df_aug = df_aug.copy()
    print(f"Length of augmented dataframe: {len(df_aug)}")

    # DataFrame.sample draws without replacement and raises ValueError when n exceeds
    # the number of rows, so cap the request at what is available
    n_samples = sample_size if sample_size is None else min(sample_size, len(df_aug))

    # Randomly sample from the augmented dataframe, setting a seed for reproducibility
    df_aug = df_aug.sample(n=n_samples, random_state=random_state)
    print(f"Length of augmented dataframe after sampling: {len(df_aug)}")

    # Create a dataframe with original source and target sentences
    df_orig = pd.DataFrame({'source': raw_datasets['train']['source'], 'target': raw_datasets['train']['target']})
    print(f"Length of original dataframe: {len(df_orig)}")

    # Put the original rows first, followed by the sampled augmented rows
    df_aug = pd.concat([df_orig, df_aug], axis=0)
    print(f"Length of augmented dataframe after concatenation: {len(df_aug)}")

    # Drop duplicates; the first occurrence is kept, so original rows win over augmented ones
    df_aug = df_aug.drop_duplicates(subset=['source'])
    df_aug = df_aug.drop_duplicates(subset=['target'])
    print(f"Length of augmented dataframe after dropping duplicates: {len(df_aug)}")

    # Reset index
    df_aug = df_aug.reset_index(drop=True)

    # Create a dataset dictionary
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(df_aug),
        "validation": raw_datasets["validation"],
        "test": raw_datasets["test"],
    })

    return dataset_dict

# Combine each filtered variant with the original training data
# (combine_data samples MAX_SAMPLE_SIZE augmented rows per variant by default)
aug_datasets_all_filters = combine_data(df_all_filters, raw_datasets)
aug_datasets_no_toxicity_filter = combine_data(df_no_toxicity_filter, raw_datasets)
aug_datasets_no_acceptability_filter = combine_data(df_no_acceptability_filter, raw_datasets)
aug_datasets_no_similarity_filter = combine_data(df_no_similarity_filter, raw_datasets)

# Save the augmented dataset dictionaries to disk (DatasetDict.save_to_disk, one directory per variant)
aug_datasets_all_filters.save_to_disk("../data/processed/aug_datasets_all_filters")
aug_datasets_no_toxicity_filter.save_to_disk("../data/processed/aug_datasets_no_toxicity_filter")
aug_datasets_no_acceptability_filter.save_to_disk("../data/processed/aug_datasets_no_acceptability_filter")
aug_datasets_no_similarity_filter.save_to_disk("../data/processed/aug_datasets_no_similarity_filter")

Length of augmented dataframe: 636
Length of augmented dataframe after sampling: 636
Length of original dataframe: 10733
Length of augmented dataframe after concatenation: 11369
Length of augmented dataframe after dropping duplicates: 11347


DatasetDict({
    train: Dataset({
        features: ['source', 'target'],
        num_rows: 11347
    })
    validation: Dataset({
        features: ['source', 'target'],
        num_rows: 1193
    })
    test: Dataset({
        features: ['source', 'target'],
        num_rows: 671
    })
})