# Set up

In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from typing import List, Tuple

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

  from .autonotebook import tqdm as notebook_tqdm


# Model

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
model = AutoModelForSeq2SeqLM.from_pretrained("tuner007/pegasus_paraphrase").to(device)

# Dataset

In [4]:
def get_feature(batch):
    encoding = tokenizer(
        batch["text"],
        text_target=batch["paraphrase"],
        max_length=60,
        truncation=True,
        padding=True,
    )
    return encoding

In [5]:
train_ds = load_dataset(
    "json",
    data_files="datasets/violent_speech_dataset.json",
    field="data",
)
split = train_ds["train"].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = split.map(get_feature, batched=True)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11265
    })
    test: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2817
    })
})


In [6]:
columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=columns)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Fine-tuning

In [7]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred: Tuple[List[int], List[int]]):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return result

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="training_results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",  # Log at each step
    logging_steps=100,
    logging_first_step=True,   # Log starting from the first step
    num_train_epochs=4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.6187,0.641188,0.366053,0.190712,0.338719,0.339296,22.804402
2,0.6224,0.622975,0.372329,0.196898,0.344705,0.344776,22.773873
3,0.5093,0.623868,0.382734,0.202916,0.354698,0.355022,22.55875
4,0.4965,0.633438,0.386696,0.206172,0.358003,0.358575,22.531061


Non-default generation parameters: {'max_length': 60, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 60, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 60, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}
Non-default generation parameters: {'max_length': 60, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


TrainOutput(global_step=45060, training_loss=0.6100168643926865, metrics={'train_runtime': 15701.9204, 'train_samples_per_second': 2.87, 'train_steps_per_second': 2.87, 'total_flos': 7628863271731200.0, 'train_loss': 0.6100168643926865, 'epoch': 4.0})

In [10]:
trainer.save_model("pegasus_alacen")

Non-default generation parameters: {'max_length': 60, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


In [11]:
del model
del trainer
torch.cuda.empty_cache()

# Evaluate the models

In [3]:
import os

from transformers import PegasusTokenizer

if not os.path.exists("pegasus_alacen"):
    from src.alacen.paraphrase.pegasus import PegasusAlacen

    _ = PegasusAlacen()
    model_path = "src/alacen/paraphrase/pegasus_alacen"
else:
    model_path = "pegasus_alacen"

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
meteor = evaluate.load("meteor")
bertscore = evaluate.load("bertscore")
tokenizer: AutoTokenizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /mnt/home/20200884/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /mnt/home/20200884/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /mnt/home/20200884/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [5]:
import re

import cmudict
import syllables


cmu_dict = cmudict.dict()


def count_syllables_in_word(word: str) -> int:
    phones = cmu_dict.get(word.lower())
    if phones:
        return len([p for p in phones[0] if p[-1].isdigit()])
    return syllables.estimate(word)


def count_syllables(text: str) -> int:
    words = re.findall(r"\w+", text)
    return sum(count_syllables_in_word(word) for word in words)


def get_feature(batch):
    encoding = tokenizer(
        batch["text"],
        text_target=batch["paraphrase"],
        max_length=60,
        truncation=True,
        padding=True,
    )
    return encoding


def compute_metrics(eval_pred: Tuple[List[int], List[int]]):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    result |= rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    result |= meteor.compute(predictions=decoded_preds, references=decoded_labels)
    bert_result = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    del bert_result["hashcode"]
    result |= {"bertscore_" + k: np.mean(v) for k, v in bert_result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result


def syllable_count_difference(eval_pred: Tuple[List[int], List[int]]):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = {
        "syllable_difference": np.mean(
            [
                abs(count_syllables(pred) - count_syllables(label))
                for pred, label in zip(decoded_preds, decoded_labels)
            ]
        )
    }
    result |= {
        "normalized_syllable_difference": np.mean(
            [
                abs(count_syllables(pred) - count_syllables(label)) / count_syllables(label)
                for pred, label in zip(decoded_preds, decoded_labels)
            ]
        )
    }

    return result

## Fine-tuned model

In [6]:
tokenizer = PegasusTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

### BLEU, ROUGE, METEOR, and BERTScore

In [7]:
train_ds = load_dataset(
    "json",
    data_files="datasets/violent_speech_dataset.json",
    field="data",
)
split = train_ds["train"].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = split.map(get_feature, batched=True)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11265
    })
    test: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2817
    })
})


In [8]:
columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=columns)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="eval_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [10]:
trainer.evaluate(tokenized_dataset["test"])

### Syllable difference

In [6]:
train_ds = load_dataset(
    "json",
    data_files="datasets/violent_speech_dataset.json",
    field="data",
)
train_ds = train_ds.map(lambda x: {"text": x["text"], "paraphrase": x["text"]})
split = train_ds["train"].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = split.map(get_feature, batched=True)
print(tokenized_dataset)

Map: 100%|██████████| 2817/2817 [00:00<00:00, 16148.12 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11265
    })
    test: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2817
    })
})





In [7]:
columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=columns)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [8]:
training_args = Seq2SeqTrainingArguments(
    output_dir="eval_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=syllable_count_difference,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 0.33062276244163513,
 'eval_syllable_difference': 6.484558040468584,
 'eval_normalized_syllable_difference': 0.7398379896533884,
 'eval_runtime': 251.3116,
 'eval_samples_per_second': 11.209,
 'eval_steps_per_second': 0.704}

## Original model

In [None]:
tokenizer = AutoTokenizer.from_pretrained("tuner007/pegasus_paraphrase")
model = AutoModelForSeq2SeqLM.from_pretrained("tuner007/pegasus_paraphrase").to(device)

### BLEU, ROUGE, METEOR, and BERTScore

In [15]:
train_ds = load_dataset(
    "json",
    data_files="datasets/violent_speech_dataset.json",
    field="data",
)
split = train_ds["train"].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = split.map(get_feature, batched=True)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11265
    })
    test: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2817
    })
})


In [16]:
columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=columns)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [17]:
training_args = Seq2SeqTrainingArguments(
    output_dir="eval_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



In [18]:
trainer.evaluate(tokenized_dataset["test"])

Trainer is attempting to log a value of "[0.40100487875919316, 0.135989289626354, 0.07190949480144734, 0.04033146273666544]" of type <class 'list'> for key "eval/precisions" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 8.794661521911621,
 'eval_bleu': 0.03496953325639905,
 'eval_precisions': [0.40100487875919316,
  0.135989289626354,
  0.07190949480144734,
  0.04033146273666544],
 'eval_brevity_penalty': 0.3118305729279578,
 'eval_length_ratio': 0.46183077750874363,
 'eval_translation_length': 27466,
 'eval_reference_length': 59472,
 'eval_rouge1': 0.245675553050952,
 'eval_rouge2': 0.09259513716052695,
 'eval_rougeL': 0.21969443514731285,
 'eval_rougeLsum': 0.21962926971419977,
 'eval_meteor': 0.1850567738813004,
 'eval_bertscore_precision': 0.88753273821825,
 'eval_bertscore_recall': 0.8672569695626077,
 'eval_bertscore_f1': 0.8769930402014667,
 'eval_gen_len': 11.75612353567625,
 'eval_runtime': 171.5643,
 'eval_samples_per_second': 16.42,
 'eval_steps_per_second': 1.032}

### Syllable difference

In [11]:
train_ds = load_dataset(
    "json",
    data_files="datasets/violent_speech_dataset.json",
    field="data",
)
train_ds = train_ds.map(lambda x: {"text": x["text"], "paraphrase": x["text"]})
split = train_ds["train"].train_test_split(test_size=0.2, seed=42)
tokenized_dataset = split.map(get_feature, batched=True)
print(tokenized_dataset)

Map: 100%|██████████| 2817/2817 [00:00<00:00, 17528.77 examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 11265
    })
    test: Dataset({
        features: ['text', 'paraphrase', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2817
    })
})





In [12]:
columns = ["input_ids", "attention_mask", "labels"]
tokenized_dataset.set_format(type="torch", columns=columns)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="eval_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)

trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=syllable_count_difference,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
trainer.evaluate(tokenized_dataset["test"])

{'eval_loss': 8.628454208374023,
 'eval_syllable_difference': 9.133830315938942,
 'eval_normalized_syllable_difference': 0.4424274370256228,
 'eval_runtime': 161.2969,
 'eval_samples_per_second': 17.465,
 'eval_steps_per_second': 1.097}

# Try models

In [3]:
from transformers import pipeline

inputs = [
    "Oh holy fuck.",
    "You know damn**n well who it is.",
    "That cocksuckin' faggot!  He's got me by the balls.",
    "No seriously.  I mean, they're just breasts.  Every second person in the world has got them...",
]
gen_kwargs = {
    "num_beams": 8,
    "batch_size": len(inputs)
}

## Fine-tuned model

In [4]:
pipe = pipeline("summarization", model=model_path)

In [5]:
outputs = pipe(inputs, **gen_kwargs)
for inp, output in zip(inputs, outputs):
    print(f"Input text: {inp}")
    print(f"Paraphrase: {output['summary_text']}")
    print()

Your max_length is set to 60, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Input text: Oh holy fuck.
Paraphrase: Goodness gracious, this is unbelievable.

Input text: You know damn**n well who it is.
Paraphrase: It's quite clear who it is.

Input text: That cocksuckin' faggot!  He's got me by the balls.
Paraphrase: That person is really getting on my nerves! He's taking advantage of me.

Input text: No seriously.  I mean, they're just breasts.  Every second person in the world has got them...
Paraphrase: Absolutely not. I mean, they're just breasts. Every single person in the world has them...



## Original model

In [None]:
pipe = pipeline("summarization", model="tuner007/pegasus_paraphrase")

In [7]:
outputs = pipe(inputs, **gen_kwargs)
for inp, output in zip(inputs, outputs):
    print(f"Input text: {inp}")
    print(f"Paraphrase: {output['summary_text']}")
    print()

Your max_length is set to 60, but your input_length is only 23. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=11)


Input text: Oh holy fuck.
Paraphrase: Oh my gosh.

Input text: You know damn**n well who it is.
Paraphrase: You know who it is.

Input text: That cocksuckin' faggot!  He's got me by the balls.
Paraphrase: He's got me by the balls.

Input text: No seriously.  I mean, they're just breasts.  Every second person in the world has got them...
Paraphrase: They're just breasts, every second person in the world has them.

