In [5]:
import warnings

from datasets import load_dataset, load_metric
import transformers
import datasets
import random
import numpy as np
import pandas as pd
import torch
from IPython.display import display, HTML

# Single seed shared by every random number generator used in the notebook.
SEED = 420

# `random` is imported above, so seed it as well as torch and numpy;
# otherwise anything drawing from the stdlib generator is not reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')

In [6]:
# Load the three CSV splits from the internal data directory.
train_df = pd.read_csv(filepath_or_buffer='../data/internal/train.csv')
val_df = pd.read_csv(filepath_or_buffer='../data/internal/validation.csv')
test_df = pd.read_csv(filepath_or_buffer='../data/internal/test.csv')

In [7]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Tokenizer and model are loaded from the same pretrained checkpoint id.
paraphrase_checkpoint = "hetpandya/t5-small-tapaco"
tokenizer = T5Tokenizer.from_pretrained(paraphrase_checkpoint)
model = T5ForConditionalGeneration.from_pretrained(paraphrase_checkpoint)

def get_paraphrases(sentence, prefix="paraphrase: ", n_predictions=5, top_k=120, max_length=256, device="cpu"):
    """Sample paraphrases of ``sentence`` with the module-level ``model``.

    Args:
        sentence: Text to paraphrase.
        prefix: Task prefix prepended to ``sentence`` before tokenization.
        n_predictions: Number of sequences requested from ``model.generate``.
        top_k: ``top_k`` value forwarded to ``model.generate``.
        max_length: ``max_length`` value forwarded to ``model.generate``.
        device: Device the encoded inputs are moved to. The model itself is
            NOT moved by this function, so ``device`` has to match wherever
            ``model`` currently lives.

    Returns:
        A list of decoded candidates in generation order, without exact
        duplicates and without candidates that equal ``sentence`` when
        compared case-insensitively. It may hold fewer than
        ``n_predictions`` items.
    """
    text = prefix + sentence + " </s>"
    # `pad_to_max_length=True` is deprecated. With no `max_length` given it
    # padded to the longest sequence, which is what `padding=True` requests.
    encoding = tokenizer(text, padding=True, return_tensors="pt")
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    model_output = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        do_sample=True,
        max_length=max_length,
        top_k=top_k,
        top_p=0.98,
        early_stopping=True,
        num_return_sequences=n_predictions,
    )

    outputs = []
    for output in model_output:
        generated_sent = tokenizer.decode(
            output, skip_special_tokens=True, clean_up_tokenization_spaces=True
        )
        # Skip echoes of the input (case-insensitive) and exact repeats.
        if generated_sent.lower() == sentence.lower():
            continue
        if generated_sent in outputs:
            continue
        outputs.append(generated_sent)
    return outputs

# Quick smoke test of the paraphraser on a passive-voice sentence.
paraphrases = get_paraphrases(
    "The house will be cleaned by me every Saturday."
)

# Show each generated candidate on its own line.
for sent in paraphrases:
    print(sent)


Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

My house will be cleaned on Saturday.
I will clean a house Saturday.
I'll clean the house every Saturday.
I will clean the house every Saturday.


In [14]:
# Column names read from each example batch.
source = "reference"
target = "translation"

# Task prefix prepended verbatim to every source text.
prefix = "detoxify:"

# Truncation limits (in tokens) for inputs and labels.
max_input_length = max_target_length = 128

# Per-device batch size for training and evaluation.
batch_size = 32

def preprocess_function(examples):
    """Tokenize one batch of examples for seq2seq fine-tuning.

    Reads the ``source`` and ``target`` columns of ``examples``, prepends
    ``prefix`` to every source text, and tokenizes both sides with truncation
    at ``max_input_length`` / ``max_target_length``.

    Returns:
        The tokenized inputs, with the target token ids stored under
        ``"labels"``.
    """
    source_texts = []
    for text in examples[source]:
        source_texts.append(prefix + text)
    target_texts = list(examples[target])

    tokenized = tokenizer(source_texts, truncation=True, max_length=max_input_length)

    # Targets go through the same tokenizer; only their ids are kept.
    tokenized_targets = tokenizer(target_texts, truncation=True, max_length=max_target_length)
    tokenized["labels"] = tokenized_targets["input_ids"]
    return tokenized

In [37]:
from datasets import Dataset, DatasetDict

# Wrap the pandas frames as `datasets.Dataset` objects and group them by split.
tds = Dataset.from_pandas(train_df)
vds = Dataset.from_pandas(val_df)
ds = DatasetDict({'train': tds, 'validation': vds})

In [38]:
# Tokenize every split, passing examples to `preprocess_function` in batches.
ds = ds.map(
    function=preprocess_function,
    batched=True,
)

Map:   0%|          | 0/105262 [00:00<?, ? examples/s]

Map:   0%|          | 0/35087 [00:00<?, ? examples/s]

In [39]:
# Inspect the train split after tokenization (its columns and row count).
ds["train"]

Dataset({
    features: ['Unnamed: 0', 'id', 'Unnamed: 0.1', 'id.1', 'reference', 'translation', 'similarity', 'lenght_diff', 'ref_tox', 'trn_tox', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 105262
})

In [40]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# NOTE(review): `model_name` only names the output directory here; the
# checkpoint loaded earlier in the notebook is "hetpandya/t5-small-tapaco".
model_name = 't5-small'

# Hyper-parameters shared by every trainer built in this notebook.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source}-to-{target}",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation: run once per epoch and score generated text.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Checkpointing and logging
    save_total_limit=3,
    report_to='tensorboard',
    disable_tqdm=True,
)

In [41]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
# Collator that batches the tokenized seq2seq examples for `model`.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [42]:
import numpy as np
# SacreBLEU scorer used by `compute_metrics`.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the separate `evaluate` package - confirm the pinned version.
metric = load_metric(path="sacrebleu")

# simple postprocessing for text
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and references.

    Returns:
        A tuple ``(preds, labels)``: ``preds`` is a list of stripped strings,
        and ``labels`` is a list of one-element lists, each holding a single
        stripped reference string.
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # Each reference is wrapped in its own list.
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels

# compute metrics function to pass to trainer
def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for one evaluation pass.

    Args:
        eval_preds: A ``(preds, labels)`` pair of token-id arrays. When
            ``preds`` is a tuple, only its first element is used.

    Returns:
        A dict with ``"bleu"`` (the ``"score"`` reported by ``metric``) and
        ``"gen_len"`` (mean number of non-pad tokens per prediction), both
        rounded to 4 decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 cannot be decoded. Labels use it as filler, and generated
    # predictions can carry it too once batches of different lengths are
    # padded together, so replace it on both sides before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Measured on the cleaned ids, so -100 filler is not counted as a token.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [43]:
# Cut the first `n` rows of each split into consecutive windows of `dn` rows.
n = 500
dn = 5
ds_train_pieces, ds_val_pieces = [], []
for i in range(0, n, dn):
    piece_rows = range(i, i + dn)
    ds_train_pieces.append(ds['train'].select(piece_rows))
    ds_val_pieces.append(ds['validation'].select(piece_rows))

In [44]:
# Trainer over the full train/validation splits, wrapping the shared `model`.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
)

In [45]:
def training_small_pieces():
    """Fine-tune the shared ``model`` on each small dataset piece in turn.

    A new ``Seq2SeqTrainer`` is built for every
    (``ds_train_pieces[k]``, ``ds_val_pieces[k]``) pair and ``train()`` is
    called on it. All trainers wrap the same module-level ``model``, so its
    weights keep being updated from piece to piece. The ``trainer`` name
    inside this function is local and does not rebind the module-level one.

    NOTE(review): presumably each new trainer starts with a fresh optimizer
    and learning-rate schedule - confirm this is intended.
    """
    total_pieces = len(ds_train_pieces)
    pieces = zip(ds_train_pieces, ds_val_pieces)
    # Count from 1 so the message reports how many pieces are finished
    # (the previous 0-based index printed "0 out of N" after the first one).
    for finished, (train_piece, val_piece) in enumerate(pieces, start=1):
        trainer = Seq2SeqTrainer(
            model,
            args,
            train_dataset=train_piece,
            eval_dataset=val_piece,
            data_collator=data_collator,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )
        trainer.train()
        print(f'{finished} out of {total_pieces} trained')

In [49]:
# Number of full passes over all pieces.
epochs = 5
for epoch in range(epochs):
    banner = f"Epoch: {epoch+1} out of {epochs}"
    print(banner)
    training_small_pieces()


Epoch: 1 out of 5
{'eval_loss': 0.6515387892723083, 'eval_bleu': 38.3639, 'eval_gen_len': 12.6, 'eval_runtime': 0.7063, 'eval_samples_per_second': 7.08, 'eval_steps_per_second': 1.416, 'epoch': 1.0}
{'eval_loss': 0.6513894200325012, 'eval_bleu': 38.3639, 'eval_gen_len': 12.6, 'eval_runtime': 0.5728, 'eval_samples_per_second': 8.729, 'eval_steps_per_second': 1.746, 'epoch': 2.0}
{'eval_loss': 0.651248574256897, 'eval_bleu': 38.3639, 'eval_gen_len': 12.6, 'eval_runtime': 0.5679, 'eval_samples_per_second': 8.805, 'eval_steps_per_second': 1.761, 'epoch': 3.0}
{'eval_loss': 0.6511314511299133, 'eval_bleu': 38.3639, 'eval_gen_len': 12.6, 'eval_runtime': 0.5629, 'eval_samples_per_second': 8.882, 'eval_steps_per_second': 1.776, 'epoch': 4.0}
{'eval_loss': 0.6510230302810669, 'eval_bleu': 38.3639, 'eval_gen_len': 12.6, 'eval_runtime': 0.5652, 'eval_samples_per_second': 8.846, 'eval_steps_per_second': 1.769, 'epoch': 5.0}
{'eval_loss': 0.6509596705436707, 'eval_bleu': 38.3639, 'eval_gen_len': 12

In [50]:
import torch
# Write the fine-tuned model to the models directory via the module-level trainer.
trainer.save_model(output_dir='../models/t5small_tuned')

Git LFS does not upload my files, so I store them on Yandex Disk.
You can download the model [here](https://disk.yandex.com/d/aP_z72Ew8CQs2A).
