In [1]:
# Install these packages if running from colab
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

[K     |████████████████████████████████| 5.5 MB 4.1 MB/s 
[K     |████████████████████████████████| 163 kB 56.7 MB/s 
[K     |████████████████████████████████| 7.6 MB 54.8 MB/s 
[K     |████████████████████████████████| 441 kB 4.0 MB/s 
[K     |████████████████████████████████| 115 kB 63.9 MB/s 
[K     |████████████████████████████████| 212 kB 67.2 MB/s 
[K     |████████████████████████████████| 95 kB 5.4 MB/s 
[K     |████████████████████████████████| 127 kB 76.0 MB/s 
[K     |████████████████████████████████| 115 kB 72.2 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

## The Detox Dataset

In [3]:
# get paradetox dataset
# the toxic parallel dataset
from datasets import load_dataset, load_metric

# Load the "train" split, then hold out 10% of it for evaluation.
# A fixed seed makes the shuffle deterministic, so the train/test partition
# (and the scores computed on it) can be reproduced on a fresh kernel.
dataset = load_dataset("SkolkovoInstitute/paradetox", split="train")
dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)

# ROUGE metric object used by the evaluation cells.
metric = load_metric("rouge")

Downloading readme:   0%|          | 0.00/5.15k [00:00<?, ?B/s]



Downloading and preparing dataset csv/SkolkovoInstitute--paradetox to /root/.cache/huggingface/datasets/SkolkovoInstitute___csv/SkolkovoInstitute--paradetox-2d7856e905be458c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.04M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/SkolkovoInstitute___csv/SkolkovoInstitute--paradetox-2d7856e905be458c/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  import sys


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

## Examine the Detox dataset

In [4]:
# Display the DatasetDict to check the train/test splits and their columns.
dataset


DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 17789
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 1977
    })
})

In [5]:
# Display the loaded metric object (its repr includes the usage documentation).
metric

Metric(name: "rouge", features: {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}, usage: """
Calculates average rouge scores for a list of hypotheses and references
Args:
    predictions: list of predictions to score. Each prediction
        should be a string with tokens separated by spaces.
    references: list of reference for each prediction. Each
        reference should be a string with tokens separated by spaces.
    rouge_types: A list of rouge types to calculate.
        Valid names:
        `"rouge{n}"` (e.g. `"rouge1"`, `"rouge2"`) where: {n} is the n-gram based scoring,
        `"rougeL"`: Longest common subsequence based scoring.
        `"rougeLSum"`: rougeLsum splits text using `"
"`.
        See details in https://github.com/huggingface/datasets/issues/617
    use_stemmer: Bool indicating whether Porter stemmer should be used to strip word suffixes.
    use_aggregator: Return aggregates if this is set to True
Retu

### test ROUGE metric

In [6]:
# test ROUGE metric
# Sanity check: the predictions are identical to the references.
fake_preds = ["hello there", "general kenobi"]
fake_labels = ["hello there", "general kenobi"]
metric.compute(predictions=fake_preds, references=fake_labels)

{'rouge1': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)),
 'rouge2': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)),
 'rougeL': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0)),
 'rougeLsum': AggregateScore(low=Score(precision=1.0, recall=1.0, fmeasure=1.0), mid=Score(precision=1.0, recall=1.0, fmeasure=1.0), high=Score(precision=1.0, recall=1.0, fmeasure=1.0))}

In [7]:
# Look at the first toxic/neutral pair of the training split.
dataset['train'][0]

{'en_toxic_comment': 'are you truly that moronic , peter ? .',
 'en_neutral_comment': "Don't you understand Peter?"}

In [8]:
# try on actual detox data
# Index the row first and then the column: `dataset['train'][column][0]`
# reads the entire column just to pick out a single element.
first_example = dataset['train'][0]
try_preds = [first_example['en_toxic_comment']]
try_labels = [first_example['en_neutral_comment']]
# Score the untouched toxic sentence against its neutral reference.
metric.compute(predictions=try_preds, references=try_labels)

{'rouge1': AggregateScore(low=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), mid=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), high=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636)),
 'rouge2': AggregateScore(low=Score(precision=0.0, recall=0.0, fmeasure=0.0), mid=Score(precision=0.0, recall=0.0, fmeasure=0.0), high=Score(precision=0.0, recall=0.0, fmeasure=0.0)),
 'rougeL': AggregateScore(low=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), mid=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), high=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636)),
 'rougeLsum': AggregateScore(low=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), mid=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.3636363636363636), high=Score(precision=0.3333333333333333, recall=0.4, fmeasure=0.36363636363

In [9]:
# (rows, columns) of each split.
dataset.shape

{'train': (17789, 2), 'test': (1977, 2)}

### Examine detox records

In [10]:
# View the training split as a pandas DataFrame.
pd.DataFrame(dataset['train'])

Unnamed: 0,en_toxic_comment,en_neutral_comment
0,"are you truly that moronic , peter ? .",Don't you understand Peter?
1,cut them off and they will go back in mountains .,cut them off and they will go back in mountains .
2,oh give it a fucking rest already .,Please give it a rest.
3,fuck gettin older i wanna be a kid again lmao,Not getting older I wanna be a kid again lmao
4,what a shit job that must be .,I don't think that's a suitable position
...,...,...
17784,next time add it quicker you sorry sack of shit .,Next time work fast
17785,"flying to houston , though right now we are ov...","Flying to Houston,though right now we are over..."
17786,might i suggest that tom rogan and the washing...,might i suggest that tom rogan and the washing...
17787,well hurry da fuck up then ! ! !,well hurry up then!!!


In [11]:
# Sketch (not executed): a three-way train / test / valid split.
# 90% train, 10% test + validation
#train_testvalid = dataset.train_test_split(test_size=0.1)
# Split the 10% test + valid in half test, half valid
#test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
#train_test_valid_dataset = DatasetDict({
#    'train': train_testvalid['train'],
#    'test': test_valid['test'],
#    'valid': test_valid['train']})

## Choose a Pre-trained Model

In [12]:
# Name of the pre-trained checkpoint that the model and tokenizer are loaded from.
model_checkpoint = "facebook/bart-large"

In [13]:
from transformers import BartTokenizer, TFBartForConditionalGeneration

# Load the TensorFlow BART conditional-generation model and the tokenizer
# from the same checkpoint.
model = TFBartForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

In [14]:
# Print the layer / parameter summary of the loaded model.
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  406291456 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 406,341,721
Trainable params: 406,291,456
Non-trainable params: 50,265
_________________________________________________________________


## Preprocess: Encode detox dataset with BART tokenizer

In [15]:
## Encode detox train_texts with BART tokenizer
max_length = 25
#max_label_length = 128

def preprocess_function(examples):
    """Tokenize a batch of toxic/neutral sentence pairs for seq2seq training.

    Args:
        examples: a batch of dataset rows, indexable by the column names
            'en_toxic_comment' (model input) and 'en_neutral_comment' (target).

    Returns:
        The tokenizer output for the toxic comments, with the token ids of the
        neutral comments stored under the "labels" key.
    """
    # Inputs and targets are each truncated to `max_length` tokens and padded
    # to the longest sequence of the batch they are tokenized in.
    model_inputs = tokenizer(
        examples['en_toxic_comment'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )

    # `text_target=` is the replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=examples['en_neutral_comment'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
# test the preprocess function
# Run it on the first two training rows and inspect input_ids / attention_mask / labels.
preprocess_function(dataset['train'][:2])

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


{'input_ids': [[0, 1322, 47, 3127, 14, 14628, 10003, 2156, 181, 5906, 17487, 479, 2], [0, 8267, 106, 160, 8, 51, 40, 213, 124, 11, 9787, 479, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[0, 6766, 75, 47, 1346, 2155, 116, 2, 1, 1, 1, 1, 1], [0, 8267, 106, 160, 8, 51, 40, 213, 124, 11, 9787, 479, 2]]}

In [17]:
# tokenize the detox dataset
# batched=True hands preprocess_function a batch of rows per call rather than a single row.
tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## Fine Tuning with the detox dataset

In [18]:
# hyperparameters
batch_size = 8  # batch size used when building the TF datasets
learning_rate = 2e-5  # passed to AdamWeightDecay
weight_decay = 0.01  # passed to AdamWeightDecay as weight_decay_rate
num_train_epochs = 1  # epochs per model.fit call

In [19]:
# Confirm that input_ids / attention_mask / labels were added to both splits.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17789
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1977
    })
})

In [20]:
# Hugging Face seq2seq data collators that return TensorFlow tensors.
from transformers import DataCollatorForSeq2Seq
# Collator passed as collate_fn when the TF datasets are built.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# Variant configured with pad_to_multiple_of=128; not referenced by the other cells visible here.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

In [21]:
# convert the train and the test tokenized dataset to TF dataset
# Training data: shuffled, batched with `batch_size`, collated by `data_collator`.
train_dataset = model.prepare_tf_dataset(
    tokenized_datasets['train'],
    batch_size = batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation data: built from the 'test' split, same batching, order kept fixed (no shuffle).
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets['test'],
    batch_size = batch_size,
    shuffle = False,
    collate_fn=data_collator,
)

In [22]:
# model compile
from transformers import AdamWeightDecay
import tensorflow as tf

# Adam with weight decay, configured from the hyperparameter cell.
optimizer = AdamWeightDecay(learning_rate = learning_rate, 
                            weight_decay_rate = weight_decay
                            )
# No loss is passed to compile(); per the message this cell prints, the
# model's internal loss computation is used.
model.compile(optimizer = optimizer)

# Alternative loss/metric that was considered but is not used:
#"cosine_similarity"
#loss=tf.keras.losses.CosineSimilarity(axis=1)
#metrics=[tf.keras.metrics.CosineSimilarity(axis=1)]

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [23]:
# model fit
# `train_dataset` and `validation_dataset` were already batched when they were
# built with prepare_tf_dataset(batch_size=...), so `batch_size` is not passed
# to fit(): Keras documents that it must not be specified for dataset inputs.
bart_model_history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=num_train_epochs,
)



In [24]:
# Print the model summary again after fine-tuning.
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  406291456 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50265     
 r)                                                              
                                                                 
Total params: 406,341,721
Trainable params: 406,291,456
Non-trainable params: 50,265
_________________________________________________________________


In [25]:
# Per-epoch training and validation loss recorded by fit().
bart_model_history.history

{'loss': [1.0808100700378418], 'val_loss': [1.0017846822738647]}

## Build Rouge eval score

In [49]:
# Generate a detoxified sentence for every toxic comment of the test split and
# collect prediction / reference strings for ROUGE.

# Read each column once up front: indexing `dataset['test'][column][i]` inside
# the loop would re-read the whole column on every iteration.
test_split = dataset['test']
toxic_comments = test_split['en_toxic_comment']
neutral_comments = test_split['en_neutral_comment']
val_len = len(toxic_comments)

val_predictions = []
val_references = []

for i, (toxic_comment, neutral_comment) in enumerate(zip(toxic_comments, neutral_comments)):
  input_tokenized = tokenizer([toxic_comment], return_tensors="tf").input_ids
  # NOTE(review): no generation length is passed, so the model's default limit
  # applies — confirm it is long enough for these sentences.
  summary_ids = model.generate(input_tokenized)

  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  # The metric documents predictions/references as flat lists of strings, so
  # store the decoded string itself (one sentence was passed in, so take the
  # first decoded sequence) rather than a one-element list.
  val_predictions.append(prediction[0])
  val_references.append(neutral_comment)

  if i % 100 == 0:
    print('complete', i, '/', val_len)
#print(len(val_references))





complete 1 of 1977
complete 11 of 1977
complete 21 of 1977
complete 31 of 1977
complete 41 of 1977
complete 51 of 1977
complete 61 of 1977
complete 71 of 1977
complete 81 of 1977
complete 91 of 1977
complete 101 of 1977
complete 111 of 1977
complete 121 of 1977
complete 131 of 1977
complete 141 of 1977
complete 151 of 1977
complete 161 of 1977
complete 171 of 1977
complete 181 of 1977
complete 191 of 1977
complete 201 of 1977
complete 211 of 1977
complete 221 of 1977
complete 231 of 1977
complete 241 of 1977
complete 251 of 1977
complete 261 of 1977
complete 271 of 1977
complete 281 of 1977
complete 291 of 1977
complete 301 of 1977
complete 311 of 1977
complete 321 of 1977
complete 331 of 1977
complete 341 of 1977
complete 351 of 1977
complete 361 of 1977
complete 371 of 1977
complete 381 of 1977
complete 391 of 1977
complete 401 of 1977
complete 411 of 1977
complete 421 of 1977
complete 431 of 1977
complete 441 of 1977
complete 451 of 1977
complete 461 of 1977
complete 471 of 1977
com

In [50]:
# Imported here so that this cell does not rely on a later cell having been run first.
from pprint import pprint

# Aggregate ROUGE over all generated predictions of the test split.
rouge_results = metric.compute(predictions=val_predictions,
                               references=val_references)
pprint(rouge_results, compact=True)

{'rouge1': AggregateScore(low=Score(precision=0.8055344049001706, recall=0.8073854200709465, fmeasure=0.7995563687388435), mid=Score(precision=0.8146981208041866, recall=0.8157565524276031, fmeasure=0.8083572178185294), high=Score(precision=0.8243875592324668, recall=0.8250235862306852, fmeasure=0.8176897603338856)),
 'rouge2': AggregateScore(low=Score(precision=0.6914884790994429, recall=0.6881260173695619, fmeasure=0.6836840338911147), mid=Score(precision=0.7045587628383526, recall=0.7005885983116877, fmeasure=0.6965589354115909), high=Score(precision=0.7175922977222298, recall=0.7127182708019781, fmeasure=0.7087374204952435)),
 'rougeL': AggregateScore(low=Score(precision=0.7996510950757569, recall=0.8011300787788361, fmeasure=0.7941039959300656), mid=Score(precision=0.8107351201360619, recall=0.8108503367899709, fmeasure=0.8041440338855216), high=Score(precision=0.8200119663706987, recall=0.8201657786930437, fmeasure=0.8129602677414612)),
 'rougeLsum': AggregateScore(low=Score(prec

In [55]:
# Tabulate the ROUGE results: one column per ROUGE type; rows 0/1/2 hold the
# low/mid/high aggregate scores.
pd.DataFrame.from_dict(rouge_results)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,"(0.8055344049001706, 0.8073854200709465, 0.799...","(0.6914884790994429, 0.6881260173695619, 0.683...","(0.7996510950757569, 0.8011300787788361, 0.794...","(0.8011083502624269, 0.8016022907469662, 0.794..."
1,"(0.8146981208041866, 0.8157565524276031, 0.808...","(0.7045587628383526, 0.7005885983116877, 0.696...","(0.8107351201360619, 0.8108503367899709, 0.804...","(0.8109420178581341, 0.8109286836286898, 0.804..."
2,"(0.8243875592324668, 0.8250235862306852, 0.817...","(0.7175922977222298, 0.7127182708019781, 0.708...","(0.8200119663706987, 0.8201657786930437, 0.812...","(0.8211012421798025, 0.8202795538340188, 0.813..."


In [53]:
# Same results as one long Series indexed by (ROUGE type, low/mid/high position).
pd.concat({k: pd.Series(v) for k, v in rouge_results.items()})

rouge1     0    (0.8055344049001706, 0.8073854200709465, 0.799...
           1    (0.8146981208041866, 0.8157565524276031, 0.808...
           2    (0.8243875592324668, 0.8250235862306852, 0.817...
rouge2     0    (0.6914884790994429, 0.6881260173695619, 0.683...
           1    (0.7045587628383526, 0.7005885983116877, 0.696...
           2    (0.7175922977222298, 0.7127182708019781, 0.708...
rougeL     0    (0.7996510950757569, 0.8011300787788361, 0.794...
           1    (0.8107351201360619, 0.8108503367899709, 0.804...
           2    (0.8200119663706987, 0.8201657786930437, 0.812...
rougeLsum  0    (0.8011083502624269, 0.8016022907469662, 0.794...
           1    (0.8109420178581341, 0.8109286836286898, 0.804...
           2    (0.8211012421798025, 0.8202795538340188, 0.813...
dtype: object

In [None]:
# Tabular view of the ROUGE results (repeats the earlier DataFrame cell).
pd.DataFrame.from_dict(rouge_results)

In [47]:
#pprint(predictions, compact=True)
#pprint(references, compact=True)
# pprint(results, compact=True)

[['as per company policy , she was asked to go through a drug test ?'],
 ['Once again how much would you raise the pax rate to drive out the'],
 ['Let him continue in this vein until he has enough rope to hang himself.'],
 ['The next president will be bad too.'],
 ['In my city paid sick leave is mandatory.'],
 ['- laughs , holding my stomach - i eat everything'],
 ["we don 't live in africa dude ."], ['if i good priiice , you put magnet ?'],
 ['west baltimore is a bad show .'],
 ["title is pretty bad , and the article 's isn 't much better ."],
 "title is pretty bad , and the article 's isn 't much better ."]
[['As per company policy, she was asked to take a drug test.'],
 ['once again how much would you raise the pax rate to drive out the foreign?'],
 ['Let him continue till he implicates himself.'],
 ['the next president will be unsuitable too .'],
 ['In my city paid sick leave is mandatory .'],
 ['Holding my stomach, i eat everything'], ["We don't live in Africa brother."],
 ['If I 

In [None]:
#try_preds = [dataset['train']['en_toxic_comment'][0]]
#try_labels = [dataset['train']['en_neutral_comment'][0]]
# Imported here so that this cell does not rely on a later cell having been run first.
from pprint import pprint

# `model` is a TensorFlow model, so request TensorFlow tensors ("tf"), not
# PyTorch tensors ("pt").
input_tokenized = tokenizer(try_preds, return_tensors="tf").input_ids
summary_ids = model.generate(input_tokenized)
print('input:', try_preds)
print()
print('label:', try_labels)
print()
print('style transfer output:')
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

In [26]:
#@tf.function(jit_compile=True)
#def generate(inputs):
#    return model.generate(**inputs, max_length=128)

#tokenized_data = tokenizer([validation_dataset], return_tensors="np", pad_to_multiple_of=128)
#out = generate(tokenized_data)

In [27]:
#with tokenizer.as_target_tokenizer():
#    print(tokenizer.decode(out[0]))

In [28]:
# using huggingface keras callbacks to gather ROUGE scores
# https://huggingface.co/docs/transformers/main_classes/keras_callbacks

from transformers.keras_callbacks import PushToHubCallback, KerasMetricCallback

def metric_fn(eval_predictions):
    """Compute ROUGE F-measures and the mean generated length for one evaluation pass.

    Args:
        eval_predictions: a (predictions, labels) pair of token-id arrays.

    Returns:
        dict mapping each ROUGE type to its mid F-measure scaled by 100, plus
        "gen_len", the mean number of non-pad tokens per prediction.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace masked (negative) label ids with the pad id so they can be decoded.
    # np.where builds new arrays, so the caller's label arrays are not modified.
    pad_id = tokenizer.pad_token_id
    unmasked_labels = [np.where(np.asarray(label) < 0, pad_id, label) for label in labels]
    decoded_labels = tokenizer.batch_decode(unmasked_labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    # NOTE(review): nltk.sent_tokenize needs NLTK's sentence-tokenizer data; no
    # nltk.download(...) call is visible in this notebook — confirm it is available.
    decoded_predictions = [
        "\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_predictions
    ]
    decoded_labels = [
        "\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels
    ]

    result = metric.compute(
        predictions=decoded_predictions, references=decoded_labels, use_stemmer=True
    )
    # Keep only the mid F-measure of each ROUGE type, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens are not counted).
    prediction_lens = [
        np.count_nonzero(pred != pad_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return result

In [29]:
# Evaluate ROUGE on the validation dataset via a Keras callback, generating
# predictions with model.generate (predict_with_generate=True).
# NOTE(review): the saved output of this cell is a TypeError whose message was
# not captured — re-run to see the full traceback before relying on this cell.
metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=validation_dataset, predict_with_generate=True, use_xla_generation=False
)

callbacks = [metric_callback]

# The datasets are already batched, so `batch_size` is not passed to fit():
# Keras documents that it must not be specified for dataset inputs.
bart_rouge_model_history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=num_train_epochs,
    callbacks=callbacks,
)





TypeError: ignored

In [None]:
# Print the model summary after the second fit call.
model.summary()

In [None]:
# History recorded by the fit call that used the ROUGE callback.
bart_rouge_model_history.history

In [None]:
# Inspect the training dataset object returned by prepare_tf_dataset.
train_dataset

## Evaluate the result

In [None]:
# ROUGE SCORE
#rouge = evaluate.load('rouge')
#predictions = candidate
#references = [REFERENCE]
#results = metric.compute(predictions=predictions,
#                        references=references)
#print(results)

In [None]:
# test
test_phrase = 'hell no, I am not gonna eat this shitty burger'
# `model` is a TensorFlow model, so request TensorFlow tensors ("tf"), not
# PyTorch tensors ("pt").
input_tokenized = tokenizer(test_phrase, return_tensors="tf").input_ids

In [None]:
# Token ids produced for the test phrase.
input_tokenized

In [None]:
# The same phrase split into tokens (strings rather than ids).
tokenizer.tokenize(test_phrase)

In [43]:
#let's make longer output readable without scrolling
from pprint import pprint

In [None]:
# Generate output for the ids currently held in `input_tokenized` and print the
# decoded text below the input phrase.
summary_ids = model.generate(input_tokenized)
print('input message:', test_phrase)
print()
print('style transfer output:')
# Special tokens are dropped when decoding.
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

In [None]:
# ROUGE SCORE
#rouge = evaluate.load('rouge')
#predictions = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
#references = try_labels
#results = metric.compute(predictions=predictions,
#                        references=references)
#print(results)

In [None]:
#try_preds = [dataset['train']['en_toxic_comment'][0]]
#try_labels = [dataset['train']['en_neutral_comment'][0]]
# `model` is a TensorFlow model, so request TensorFlow tensors ("tf"), not
# PyTorch tensors ("pt").
input_tokenized = tokenizer(try_preds, return_tensors="tf").input_ids
summary_ids = model.generate(input_tokenized)
print('input:', try_preds)
print()
print('label:', try_labels)
print()
print('style transfer output:')
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

In [None]:
# ROUGE SCORE
#rouge = evaluate.load('rouge')
# Decode the most recently generated `summary_ids` and score them against `try_labels`.
predictions = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
references = try_labels
results = metric.compute(predictions=predictions,
                        references=references)
print(results)

In [None]:
#tokenized_datasets['test']['en_toxic_comment']
#v_data = tokenizer(tokenized_datasets['test']['en_toxic_comment'], return_tensors="pt").input_ids
#v_data



In [None]:
# model.generate expects token-id tensors, not a tf.data.Dataset, so generate
# for the first batch of the validation dataset.
first_batch = next(iter(validation_dataset))
# NOTE(review): handles both a (features, labels) tuple and a plain feature
# dict, since the element structure of the dataset is not visible here — confirm.
batch_features = first_batch[0] if isinstance(first_batch, tuple) else first_batch
summary_ids = model.generate(
    batch_features["input_ids"], attention_mask=batch_features["attention_mask"]
)
# Print the decoded inputs of this batch (not the unrelated `test_phrase`).
print('input messages:')
pprint(tokenizer.batch_decode(batch_features["input_ids"], skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)
print()
print('style transfer output:')
pprint(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False), compact=True)

In [None]:
# Run model.predict over the whole validation dataset and print the raw result.
# NOTE(review): this prints the undecoded predict() output, which is presumably
# large — confirm this is intended rather than decoded text.
predictions = model.predict(validation_dataset)
print(predictions)