# Fine-tuning BART

In [1]:
import os

from transformers import (
    DataCollatorForSeq2Seq,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoTokenizer,
    AutoModel,
)
import evaluate
from datasets import Dataset
import numpy as np
import pandas as pd
import torch
from torch.utils.data import random_split

import warnings

In [2]:
# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Seed used to make the random dataset split reproducible.
MANUAL_SEED = 42

## Data loading and preprocessing

In [3]:
# CSV holding the parallel `toxic` / `nontoxic` sentence columns.
RAW_DATA_PATH = "../data/raw/dataset_xs.csv"

df = pd.read_csv(RAW_DATA_PATH)
print(f"{len(df)=}")
df.head()

len(df)=9462


Unnamed: 0,toxic,nontoxic
0,I like that shit.,I love it.
1,"Now, I understand you got your grievances with...","I understand you don't have to cut your bills,..."
2,Damn It!,"oh, my God."
3,"Help me, you cunt!","Aitchi, help me!"
4,Look at that shit.,look at this.


## Model building

In [4]:
# Pretrained BART paraphrasing checkpoint used as the starting point for fine-tuning.
checkpoint = "eugenesiow/bart-paraphrase"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
# Pass the model instance (not the checkpoint name): the collator only builds
# `decoder_input_ids` from the labels when it can call
# `model.prepare_decoder_input_ids_from_labels`, which a plain string lacks.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [5]:
# Instruction prepended to every toxic source sentence in `preprocess_function`.
PREFIX = "paraphrase following to be nontoxic: \n"

# Per-device batch size, used for both training and evaluation.
BATCH_SIZE = 8

training_args = Seq2SeqTrainingArguments(
    output_dir="../models/bart",
    evaluation_strategy="epoch",  # run validation at the end of every epoch
    learning_rate=1e-4,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=2,
    predict_with_generate=True,  # evaluate on generated sequences (needed for BLEU)
    report_to="none",  # do not log to external experiment trackers
    # Tie the training seed to the notebook-wide seed instead of relying on the
    # library default, so the split and the training run stay in sync.
    seed=MANUAL_SEED,
)

In [6]:
def preprocess_function(examples):
    """Tokenize one batch of toxic / non-toxic sentence pairs.

    Every entry of ``examples["toxic"]`` gets ``PREFIX`` prepended and becomes
    the source text; ``examples["nontoxic"]`` is passed as ``text_target``.
    Both sides are truncated to ``max_length=128``.

    Returns the object produced by the tokenizer call.
    """
    sources = []
    for toxic_text in examples["toxic"]:
        sources.append(PREFIX + toxic_text)

    return tokenizer(
        sources,
        text_target=examples["nontoxic"],
        max_length=128,
        truncation=True,
    )


def postprocess_text(preds, labels):
    """Strip leading/trailing whitespace from decoded predictions and labels.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple of new lists with every string stripped.
    """

    def _strip_all(texts):
        # Build a fresh list; the input sequence is left untouched.
        return [text.strip() for text in texts]

    return _strip_all(preds), _strip_all(labels)


# BLEU metric object; its `compute` method is called inside `compute_metrics`.
bleu_metric = evaluate.load("bleu")


def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: a ``(preds, labels)`` pair of token-id arrays. If ``preds``
            is a tuple, only its first element is used.

    Returns:
        dict with ``"bleu"`` (BLEU score of the decoded predictions against the
        decoded labels) and ``"gen_len"`` (mean number of non-pad tokens per
        prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" marker, not a real token id, so it cannot be decoded.
    # It can appear in the predictions as well as in the labels, so both arrays
    # are mapped back to the pad token before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = bleu_metric.compute(
        predictions=decoded_preds, references=decoded_labels
    )["bleu"]

    # Length is counted after the -100 replacement so padding is never counted
    # as generated tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu, "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [7]:
# Seeded 85% / 10% / 5% split over the row positions of `df`.
split_generator = torch.Generator().manual_seed(MANUAL_SEED)
train_indices, val_indices, test_indices = (
    subset.indices
    for subset in random_split(
        range(len(df)), [0.85, 0.1, 0.05], generator=split_generator
    )
)

print(f"{len(train_indices)=}")
print(f"{len(val_indices)=}")
print(f"{len(test_indices)=}")

len(train_indices)=8043
len(val_indices)=946
len(test_indices)=473


In [8]:
def build_tokenized_dataset(row_indices):
    """Select rows of `df` by position and tokenize them with `preprocess_function`."""
    subset = Dataset.from_pandas(df.iloc[row_indices])
    return subset.map(preprocess_function, batched=True)


train_dataset = build_tokenized_dataset(train_indices)
val_dataset = build_tokenized_dataset(val_indices)

Map:   0%|          | 0/8043 [00:00<?, ? examples/s]

Map:   0%|          | 0/946 [00:00<?, ? examples/s]

In [9]:
# Wire the model, the tokenized datasets and the metric function into one trainer.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [10]:
# Run fine-tuning with the trainer configured above.
# NOTE(review): the output recorded for this cell shows the call failing with
# a CUDA out-of-memory error on a 4 GiB GPU.
trainer.train()

  0%|          | 0/2012 [00:00<?, ?it/s]

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 198.00 MiB. GPU 0 has a total capacty of 4.00 GiB of which 0 bytes is free. Of the allocated memory 3.24 GiB is allocated by PyTorch, and 191.07 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Save the trainer's model under "bart_model"; the next cell reloads it from there.
# NOTE(review): this path is relative to the working directory, not under the
# trainer's `output_dir` ("../models/bart") - confirm this is intended.
trainer.save_model("bart_model")

In [None]:
# Reload the saved model from disk and switch it to evaluation mode.
model = AutoModelForSeq2SeqLM.from_pretrained("bart_model").eval()
model.config.use_cache = False

In [None]:
def translate(model, inference_request, tokenizer=tokenizer):
    """Generate a rewrite of `inference_request` and print it.

    Args:
        model: model exposing ``generate``.
        inference_request: input text (the caller prepends any prompt prefix).
        tokenizer: tokenizer used to encode the request and decode the output;
            defaults to the notebook-level ``tokenizer``.

    Only the first generated sequence is decoded and printed; nothing is returned.
    """
    input_ids = tokenizer(inference_request, return_tensors="pt").input_ids
    # Deterministic decoding is requested on `generate`, where it takes effect.
    # A `temperature` argument passed to `tokenizer.decode` does not influence
    # generation at all.
    outputs = model.generate(input_ids=input_ids, do_sample=False)
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
# Smoke test: prepend the same PREFIX used in preprocessing to a sample
# sentence and print the model's rewrite.
inference_request = PREFIX + "you can t talk to these old ass ladies like that"
translate(model, inference_request, tokenizer)