In [None]:
# Install the Hugging Face libraries used below: `datasets` (data handling and
# metric loading), `transformers` with the sentencepiece extra (tokenizer and
# model) and `sacrebleu` (backend of the BLEU metric loaded later).
!pip install datasets transformers[sentencepiece] sacrebleu

# Pin numpy to a fixed release.
# NOTE(review): the reason for this exact pin is not recorded in the notebook —
# presumably an environment compatibility constraint; confirm before changing it.
!pip install numpy==1.24.3

In [None]:
# Upgrade `datasets` and `transformers` (with the torch extra) to their latest
# releases; doing so resolved otherwise unexplained errors in the original environment.
!pip install datasets -U
!pip install transformers[torch] -U

In [1]:
# necessary imports
import warnings

from datasets import load_dataset, load_metric
import transformers
import datasets
import random
import numpy as np
import pandas as pd
from IPython.display import display, HTML

# Silence every Python warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries used below).
warnings.filterwarnings('ignore')

## Dataset loading

In [5]:
import pandas as pd

# Directory holding the pre-split CSV files.
dir_path = 'data/interim/'

# The first CSV column is consumed as the index (`index_col=0`) and then replaced by
# a fresh 0..n-1 index. Chaining `reset_index` returns a new frame instead of
# mutating in place, so re-running the cell always yields the same result.
train_df = pd.read_csv(dir_path + 'train.csv', index_col=0).reset_index(drop=True)
val_df = pd.read_csv(dir_path + 'validate.csv', index_col=0).reset_index(drop=True)

# Only the last expression of a cell is rendered automatically, so the training
# preview has to be displayed explicitly; the validation preview stays the cell value.
display(train_df.head())
val_df.head()

Unnamed: 0,source,target
0,take your fucking hands off my foot!,take your manoos of my shoes!
1,you think you're a dope boy?,you think you're a diler?
2,she has a broken nose vaginal tears,he has a broken nose and a torn scabbard.
3,bella the only thing that can hurt me is you.,bella you can only hurt me.
4,let's kill your friend see how you feel!,if she killed your friend how would you feel?


## T5 tuning

In [4]:
# Identifier of the pretrained checkpoint to fine-tune; both the tokenizer and the
# seq2seq model further down are loaded from this value.
model_checkpoint = "t5-small"

In [7]:
# Fix the random seed through the transformers helper so runs are repeatable
transformers.set_seed(42)

# Load the BLEU metric (sacrebleu implementation) consumed by compute_metrics
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in favour of
# the separate `evaluate` package — confirm it still exists in the installed version.
metric = load_metric("sacrebleu")

Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [5]:
from transformers import AutoTokenizer

# Load the tokenizer that belongs to the selected checkpoint; AutoTokenizer resolves
# the concrete tokenizer class from the checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [22]:
# Task prefix prepended to every source sentence, in preprocessing and at inference.
# NOTE(review): there is no trailing space, so the prefix is glued directly onto the
# sentence. The fine-tuned checkpoint was presumably trained with exactly this
# format, so do not "fix" the string without retraining.
prefix = "make sentence neutral:"

In [10]:
# Maximum number of tokens kept per input / target; longer sequences are truncated.
max_input_length = 128
max_target_length = 128

# Column names of the parallel corpus.
target = "target"
source = "source"

def preprocess_function(example):
    """Tokenize a batch of source/target sentence pairs for seq2seq training.

    Args:
        example: A batch as passed by `Dataset.map(..., batched=True)`: a mapping
            whose `source` and `target` entries are lists of strings.

    Returns:
        The tokenizer output for the prefixed source sentences, extended with a
        "labels" entry that holds the token ids of the target sentences.
    """
    inputs = [prefix + sentence for sentence in example[source]]
    targets = list(example[target])

    # Deliberately no padding here. DataCollatorForSeq2Seq pads every training batch
    # dynamically and fills the labels with -100 so the loss ignores padding; padding
    # at this stage would bake pad-token ids into the labels (they would be trained
    # on) and stretch every sequence to the longest one of the whole map batch.
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
# Wrap the pandas frames as Hugging Face datasets, tagging each one with the name of
# the split it actually holds.
train_dataset = datasets.Dataset.from_pandas(train_df, split="train")
val_dataset = datasets.Dataset.from_pandas(val_df, split="validation")

In [12]:
# Tokenize both splits in batches: training split first, validation split second.
train_dataset_map, val_dataset_map = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, val_dataset)
)

Map:   0%|          | 0/34666 [00:00<?, ? examples/s]

Map:   0%|          | 0/11556 [00:00<?, ? examples/s]

In [6]:
# Running this cell for the first time on a machine sometimes fails.
# If that happens to you as well, simply run the cell again.
# The root cause is unknown: the error occurs only once and disappears without any further action.
# This instability is the reason the solution was trained only once:
# a training run of several hours was hard to manage on Kaggle (idle errors) and on Colab (the whole runtime is removed after exit).

from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq, Seq2SeqTrainer

# Instantiate the seq2seq model from the pretrained checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [17]:
# Hyperparameters of the fine-tuning run
batch_size = 32
# Drop an optional "organisation/" part of the checkpoint id for the output directory name
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source}-to-{target}",  # output directory
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    # Evaluate with real generation so that BLEU is computed on decoded text.
    predict_with_generate=True,
    # Let evaluation generate up to the length the targets were tokenized to.
    # Without an explicit value the trainer falls back to the model config's
    # max_length (presumably the library default of 20 tokens for this checkpoint),
    # which truncates longer outputs and distorts BLEU and gen_len.
    generation_max_length=max_target_length,
    report_to='tensorboard',
)

In [18]:
# Instead of writing a custom collate_fn, DataCollatorForSeq2Seq is used:
# it takes care of assembling the batches for training.

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [19]:
import numpy as np

# Whitespace clean-up applied to decoded text before scoring
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Returns a pair `(preds, labels)`: `preds` is a flat list of stripped strings,
    while each stripped label is wrapped in its own one-element list.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    """Compute BLEU and the mean generated length for one evaluation pass.

    Args:
        eval_preds: Pair `(predictions, label_ids)` handed over by the trainer;
            both are arrays of token ids in which -100 may mark padding.

    Returns:
        Dict with "bleu" (sacreBLEU score) and "gen_len" (mean number of non-pad
        tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    # Predictions may arrive as a tuple with extra outputs; the token ids come first.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding marker, not a real token id, and cannot be decoded. It can
    # show up in the predictions as well as in the labels, so both are mapped back
    # to the pad token before decoding.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Strip whitespace and wrap every reference in a list, as sacreBLEU expects.
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    bleu = metric.compute(predictions=decoded_preds, references=decoded_labels)
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]

    result = {"bleu": bleu["score"], "gen_len": np.mean(prediction_lens)}
    return {k: round(v, 4) for k, v in result.items()}

In [20]:
# Seq2SeqTrainer provides the training and evaluation loops, so none is hand-written
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset_map,
    eval_dataset=val_dataset_map,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [21]:
# Run the fine-tuning; the training arguments request an evaluation after every epoch.
trainer.train()

  0%|          | 0/10840 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [None]:
# Save the trainer's current model into the "best" directory.
# NOTE(review): the training arguments do not enable `load_best_model_at_end`, so
# this is presumably the model from the end of training rather than the
# best-scoring checkpoint — confirm before relying on the name.
trainer.save_model("best")

In [19]:
# Load a fine-tuned checkpoint from disk and prepare it for inference.
# Forward slashes keep the path portable (they are accepted on Windows as well) and
# avoid the invalid "\c" escape sequence of a backslash-separated string literal.
model = AutoModelForSeq2SeqLM.from_pretrained('checkpoint-10000-20231031T145513Z-001/checkpoint-10000')
# Switch to evaluation mode (turns off dropout).
model.eval()
# NOTE(review): the key/value cache is disabled here; this presumably only makes
# generation slower — confirm whether it is actually needed for inference.
model.config.use_cache = False

loading configuration file checkpoint-10000-20231031T145513Z-001\checkpoint-10000\config.json
Model config T5Config {
  "_name_or_path": "checkpoint-10000-20231031T145513Z-001\\checkpoint-10000",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size"

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at checkpoint-10000-20231031T145513Z-001\checkpoint-10000.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.
loading configuration file checkpoint-10000-20231031T145513Z-001\checkpoint-10000\generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}



In [20]:
def translate(model, inference_request, tokenizer=tokenizer, max_new_tokens=128):
    """Generate the model's rewrite of `inference_request` and print it.

    Args:
        model: Seq2seq model exposing `generate`.
        inference_request: Input text, already carrying the task prefix.
        tokenizer: Tokenizer used to encode the request and decode the output.
        max_new_tokens: Upper bound on the number of generated tokens. Without an
            explicit bound `generate` uses the generation config's default length
            (20 tokens in transformers), which cuts longer rewrites short.

    Returns:
        None. The decoded text is printed.
    """
    # Keep input ids and attention mask together and place them on the model's device.
    encoded = tokenizer(inference_request, return_tensors="pt").to(model.device)
    outputs = model.generate(**encoded, max_new_tokens=max_new_tokens)
    # Decoding takes no sampling options; generation is greedy by default.
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [25]:
# Inference example: short exclamation
inference_request = f"{prefix}what the fuck"
translate(model, inference_request, tokenizer)

what the hell is it?


In [26]:
# Inference example: profanity embedded in an otherwise neutral sentence
inference_request = f"{prefix}let's get the fuck out of here"
translate(model, inference_request, tokenizer)

let's get out of here.


In [37]:
# Inference example: question with several expletives
inference_request = f"{prefix}goddamn what the hell are you doing?"
translate(model, inference_request, tokenizer)

what are you doing?


In [44]:
# Inference example: direct insult
inference_request = f"{prefix}you are such a pussy"
translate(model, inference_request, tokenizer)

you're so a saxy.


In [46]:
# Inference example: sentence without an explicit subject
inference_request = f"{prefix}kicks our asses and steals all the coke."
translate(model, inference_request, tokenizer)

he's stealing all the coke.


In [51]:
# Inference example: two very short sentences
inference_request = f"{prefix}oh shit. okay."
translate(model, inference_request, tokenizer)

okay.


In [55]:
# Inference example: longer sentence
inference_request = f"{prefix}i don't dare take the life of your rusty sons for nothing."
translate(model, inference_request, tokenizer)

i don't want to take the life of your rusty sons for nothing
