In [None]:
# Install these packages if running from colab
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

!pip install sentencepiece --quiet

[K     |████████████████████████████████| 5.5 MB 26.1 MB/s 
[K     |████████████████████████████████| 7.6 MB 83.0 MB/s 
[K     |████████████████████████████████| 182 kB 91.1 MB/s 
[K     |████████████████████████████████| 451 kB 28.7 MB/s 
[K     |████████████████████████████████| 115 kB 81.4 MB/s 
[K     |████████████████████████████████| 212 kB 90.2 MB/s 
[K     |████████████████████████████████| 127 kB 87.3 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.3 MB 32.7 MB/s 
[?25h

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# The dataset, model and prediction paths defined in this notebook all live
# under drive/MyDrive, so this cell must run before any of them is used.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# define paths
# These are relative paths -- assumes the working directory is /content,
# where Drive was mounted (TODO confirm when running outside Colab).
# dataset_path: directory handed to load_from_disk.
dataset_path = 'drive/MyDrive/Colab Notebooks/w266_project_data'
# csv_path / model_path are joined with a file name by plain string
# concatenation later on, so the trailing '/' is required.
csv_path = 'drive/MyDrive/Colab Notebooks/w266_project_predictions/'
model_path = 'drive/MyDrive/Colab Notebooks/w266_project_models/'

#### Change these variables as needed for a different model or a different output file name

In [None]:
# change these variables for different models
# output_file_name: CSV file (appended to csv_path) that stores the test predictions.
output_file_name = 't5_large_ft.csv'
# model_checkpoint: pretrained checkpoint name passed to from_pretrained for
# both the model and the tokenizer.
model_checkpoint = "t5-large"
# model_name: weights file (appended to model_path) written by the checkpoint callback.
model_name = 't5_weights.hdf5'

### Import and examine our dataset

In [None]:
# load the dataset that was previously saved to disk at dataset_path
# (a DatasetDict with 'train', 'test' and 'valid' splits, see the next cell)
dataset = load_from_disk(dataset_path)

In [None]:
# show the available splits with their column names and row counts
dataset

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 17789
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 989
    })
    valid: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 988
    })
})

In [None]:
# ROUGE scorer used later to compare generated text with the neutral references.
# NOTE(review): recent `datasets` releases deprecate `load_metric` in favour of
# the separate `evaluate` package -- confirm against the installed version.
metric = load_metric("rouge")

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# view the training split as a DataFrame: one toxic comment and its neutral
# rewrite per row
pd.DataFrame(dataset['train'])

Unnamed: 0,en_toxic_comment,en_neutral_comment
0,u 'd be surprised all the shit u think about w...,You would be supriesd all the things you think...
1,plenty of holocaust deniers actually believe t...,Plenty of Holocaust deniers actually believe t...
2,holy fuck i 'm so going,i'm so going
3,i 've driven by lots of cattle farms but holy ...,i 've driven by lots of cattle farms but this
4,so much better this shit to soft for me,So much better this is to soft for me.
...,...,...
17784,"but , thats exactly what a couple of you idiot...","But, that’s exactly what a couple of you have ..."
17785,he probably assessed the situation and decided...,He probably assessed the situation and decided...
17786,brah im fucked up over here .,Brah I am feeling bad over here
17787,shit with my friends and not being able to tru...,with my friends and not being able to trust th...


## Load the pretrained T5-large model

In [None]:
# Load the TensorFlow T5 model (with language-modelling head) and its tokenizer
# from the checkpoint named in `model_checkpoint`.
# NOTE(review): TFT5Model is imported but not used in the visible cells.
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration

model = TFT5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [None]:
# layer overview and parameter counts of the freshly loaded pretrained model
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  32899072  
                                                                 
 encoder (TFT5MainLayer)     multiple                  334939648 
                                                                 
 decoder (TFT5MainLayer)     multiple                  435627520 
                                                                 
Total params: 737,668,096
Trainable params: 737,668,096
Non-trainable params: 0
_________________________________________________________________


### Preprocess: Encode detox dataset with T5 tokenizer

In [None]:
## Encode the detox texts with the T5 tokenizer
# maximum number of tokens kept for both the inputs and the targets
max_length = 25


def preprocess_function(examples):
    """Tokenize a batch of toxic / neutral comment pairs for seq2seq training.

    Args:
        examples: a batch of dataset rows with the columns
            'en_toxic_comment' (model input) and 'en_neutral_comment'
            (training target), each a list of strings.

    Returns:
        The tokenizer output for the prefixed toxic comments ('input_ids',
        'attention_mask') with an extra 'labels' entry holding the token ids
        of the neutral comments.
    """
    inputs = ["summarize: " + doc for doc in examples['en_toxic_comment']]

    # Only truncate here; padding is left to DataCollatorForSeq2Seq, which
    # pads each batch dynamically.
    model_inputs = tokenizer(inputs, max_length=max_length, truncation=True)

    # Tokenize the targets through `text_target` (replacement for the
    # deprecated `as_target_tokenizer` context manager).  The labels must NOT
    # be padded with the tokenizer's pad id: those positions would be scored
    # as real targets.  The collator pads labels with -100, which the loss
    # ignores.
    labels = tokenizer(
        text_target=examples['en_neutral_comment'],
        max_length=max_length,
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# test the preprocess function on the first two training rows; the result
# should contain 'input_ids', 'attention_mask' and 'labels'
preprocess_function(dataset['train'][:2])

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


{'input_ids': [[21603, 10, 3, 76, 3, 31, 26, 36, 5597, 66, 8, 3, 7, 10536, 3, 76, 317, 81, 116, 3, 76, 3, 2047, 7, 1], [21603, 10, 2500, 13, 3534, 5133, 2064, 17, 177, 4518, 700, 857, 8, 6139, 3, 7, 10536, 79, 497, 3, 5, 1, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]], 'labels': [[148, 133, 36, 3, 7, 413, 2593, 26, 66, 8, 378, 25, 317, 81, 116, 25, 33, 131, 3823, 132, 5, 1], [28474, 13, 26178, 177, 4518, 700, 857, 8, 378, 79, 497, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

In [None]:
# tokenize the detox dataset: apply preprocess_function to every split,
# batch by batch (batched=True passes lists of examples to the function)
tokenized_datasets = dataset.map(preprocess_function, batched=True)

  0%|          | 0/18 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Fine Tuning with the detox dataset

In [None]:
# Place to save CHECKPOINTS

# Full path of the weights file: model_path already ends with '/'.
checkpoint_filepath = model_path + model_name
# Keras callback handed to model.fit; save_weights_only=True stores just the
# weights rather than the whole model.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True)

In [None]:
# hyperparameters
batch_size = 8          # examples per batch when building the TF datasets
learning_rate = 2e-5    # passed to the AdamWeightDecay optimizer
weight_decay = 0.01     # passed to the optimizer as weight_decay_rate
num_train_epochs = 1    # number of passes over the training split in model.fit

In [None]:
# confirm that every split gained the 'input_ids', 'attention_mask' and 'labels' columns
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17789
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 989
    })
    valid: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 988
    })
})

In [None]:
# Hugging Face data collators for seq2seq batches (returning TensorFlow tensors)
from transformers import DataCollatorForSeq2Seq
# collator used when building the train / validation / test TF datasets
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")
# same collator, but padding lengths to a multiple of 8
# NOTE(review): not referenced by any of the visible cells -- confirm it is still needed.
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=8)

In [None]:
# Turn each tokenized split into a batched TF dataset.
def _as_tf_dataset(split_name, shuffle):
    """Wrap one tokenized split as a TF dataset collated by `data_collator`."""
    return model.prepare_tf_dataset(
        tokenized_datasets[split_name],
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=data_collator,
    )


# Only the training data is shuffled; the evaluation splits keep their order.
train_dataset = _as_tf_dataset('train', shuffle=True)
validation_dataset = _as_tf_dataset('valid', shuffle=False)
test_dataset = _as_tf_dataset('test', shuffle=False)

In [None]:
# model compile
from transformers import AdamWeightDecay
import tensorflow as tf

# Adam with decoupled weight decay, configured from the hyperparameter cell.
optimizer = AdamWeightDecay(learning_rate = learning_rate, 
                            weight_decay_rate = weight_decay
                            )
# No loss is passed on purpose: the model then falls back to its internal
# loss computation (see the message printed below this cell).
model.compile(optimizer = optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# model fit
# Fine-tune on the training split and evaluate on the validation split after
# each epoch.  train_dataset is already batched by prepare_tf_dataset, so no
# batch_size is given here: Keras expects it to be left unset for tf.data inputs.
t5_model_history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=num_train_epochs,
    callbacks=[model_checkpoint_callback],  # writes the weights to checkpoint_filepath
)



In [None]:
# summary after fine-tuning; the parameter counts match the pretrained model
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  32899072  
                                                                 
 encoder (TFT5MainLayer)     multiple                  334939648 
                                                                 
 decoder (TFT5MainLayer)     multiple                  435627520 
                                                                 
Total params: 737,668,096
Trainable params: 737,668,096
Non-trainable params: 0
_________________________________________________________________


In [None]:
# per-epoch training and validation loss recorded by model.fit
t5_model_history.history

{'loss': [0.6645427346229553], 'val_loss': [0.49011051654815674]}

### Compute the ROUGE evaluation score

In [None]:
# Generate a detoxified prediction for every example of the test split and
# collect it next to its human-written neutral reference.

# Read both text columns once up front instead of re-indexing the dataset
# column inside the loop on every iteration.
test_toxic_comments = dataset['test']['en_toxic_comment']
test_neutral_comments = dataset['test']['en_neutral_comment']
test_len = len(test_toxic_comments)

test_predictions = []
test_references = []

for i, (toxic_comment, neutral_comment) in enumerate(zip(test_toxic_comments, test_neutral_comments)):
  # same "summarize: " prefix as used during fine-tuning
  input_tokenized = tokenizer(["summarize: " + toxic_comment], return_tensors="tf").input_ids
  # max_length=25 mirrors the target length used in preprocessing
  summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=25)

  decoded = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  # Store plain strings (the batch holds exactly one sequence).  Storing the
  # one-element lists would end up as "['...']" text in the CSV and ROUGE
  # would later be scored on that list repr.
  test_predictions.append(decoded[0])
  test_references.append(neutral_comment)

  if i % 50 == 0:
    print('complete', i, '/', test_len)

complete 0 / 989
complete 50 / 989
complete 100 / 989
complete 150 / 989
complete 200 / 989
complete 250 / 989
complete 300 / 989
complete 350 / 989
complete 400 / 989
complete 450 / 989
complete 500 / 989
complete 550 / 989
complete 600 / 989
complete 650 / 989
complete 700 / 989
complete 750 / 989
complete 800 / 989
complete 850 / 989
complete 900 / 989
complete 950 / 989


In [None]:
# Put predictions and references side by side, one test example per row.
# The mapping is passed inline rather than bound to a variable called `dict`,
# which would shadow the built-in `dict` type for the rest of the session.
df = pd.DataFrame({'test_predictions': test_predictions, 'test_references': test_references})

In [None]:
# saving the output dataframe to a csv file
# (csv_path ends with '/', so concatenating the file name gives the full path;
# index=False keeps the row index out of the file)
df.to_csv(csv_path + output_file_name, index = False) 

In [None]:
# read the saved predictions back from the csv file
# NOTE(review): the variable name mentions BART, but the file read here is
# whatever `output_file_name` points to (a T5 run in this notebook).
df_bart_predictions = pd.read_csv(csv_path + output_file_name)

# display the first ten rows
print(df_bart_predictions[:10])

# display shape
print(df_bart_predictions.shape)

# display columns
print(df_bart_predictions.columns)

                                    test_predictions  \
0              ['The 61 61 view must be hilarious.']   
1  ['All she has to do is smile and keep her cool...   
2  ['can we apply that litmus test to muslims tha...   
3                          ['thats it , im joining']   
4  ['The residents of davis have a collective pro...   
5  ['being blanked by my best mate lol jks please...   
6  ['this idea of just stacking things into legis...   
7                           ['yeah this is crazy .']   
8  ['i seem to remember a certain person promisin...   
9  ["she better not do anything like that ' i wou...   

                                     test_references  
0                    ['The 6161 view must be funny']  
1  ['All she has to do is smile and keep her cool...  
2  ['can we apply that litmus test to muslims tha...  
3                        ["That's it! I'm joining."]  
4    ['The residents of Davis are not so pleasant.']  
5  ['Being blanked by my best mate,. Lol his plea... 

In [None]:
# Score the predictions against the references with ROUGE, using the text
# exactly as it was read back from the CSV.
# NOTE(review): the cells hold whatever string was written to the file; if
# list objects were saved, their repr (brackets and quotes) is what gets
# scored -- confirm the CSV contains plain sentences.
rouge_results = metric.compute(predictions=df_bart_predictions['test_predictions'],
                               references=df_bart_predictions['test_references'])
# compact pretty-print of the low / mid / high aggregate scores per ROUGE variant
pprint(rouge_results, compact=True)  

{'rouge1': AggregateScore(low=Score(precision=0.7915230281205788, recall=0.8074290080450236, fmeasure=0.7929783310774333), mid=Score(precision=0.8058124438557461, recall=0.8207772999762745, fmeasure=0.8070209848233934), high=Score(precision=0.8189597958697374, recall=0.8327099367761803, fmeasure=0.8188645374320451)),
 'rouge2': AggregateScore(low=Score(precision=0.6719895161990385, recall=0.6833130311358295, fmeasure=0.672556120993108), mid=Score(precision=0.6909799427724546, recall=0.7017443036453621, fmeasure=0.6907903091547671), high=Score(precision=0.7100391570296182, recall=0.7204720843513932, fmeasure=0.7095166676551833)),
 'rougeL': AggregateScore(low=Score(precision=0.7868715820207851, recall=0.8028006780671466, fmeasure=0.7894585811658218), mid=Score(precision=0.8018898442761985, recall=0.8164262293424556, fmeasure=0.8026835999600022), high=Score(precision=0.815886839632903, recall=0.8292797893586319, fmeasure=0.8154034943260876)),
 'rougeLsum': AggregateScore(low=Score(precis

In [None]:
# one column per ROUGE variant; rows 0 / 1 / 2 are the low / mid / high
# aggregate scores, each still a (precision, recall, fmeasure) tuple
pd.DataFrame.from_dict(rouge_results)

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
0,"(0.7915230281205788, 0.8074290080450236, 0.792...","(0.6719895161990385, 0.6833130311358295, 0.672...","(0.7868715820207851, 0.8028006780671466, 0.789...","(0.7881797463764102, 0.8021515868619266, 0.789..."
1,"(0.8058124438557461, 0.8207772999762745, 0.807...","(0.6909799427724546, 0.7017443036453621, 0.690...","(0.8018898442761985, 0.8164262293424556, 0.802...","(0.8020282749673897, 0.8164829044439351, 0.802..."
2,"(0.8189597958697374, 0.8327099367761803, 0.818...","(0.7100391570296182, 0.7204720843513932, 0.709...","(0.815886839632903, 0.8292797893586319, 0.8154...","(0.8151668724581835, 0.8296833899970467, 0.815..."


In [None]:
# expand every aggregate score into its own rows (0 = low, 1 = mid, 2 = high)
# with separate precision / recall / fmeasure columns, keyed by ROUGE variant
pd.concat({k: pd.DataFrame(v) for k, v in rouge_results.items()})

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.791523,0.807429,0.792978
rouge1,1,0.805812,0.820777,0.807021
rouge1,2,0.81896,0.83271,0.818865
rouge2,0,0.67199,0.683313,0.672556
rouge2,1,0.69098,0.701744,0.69079
rouge2,2,0.710039,0.720472,0.709517
rougeL,0,0.786872,0.802801,0.789459
rougeL,1,0.80189,0.816426,0.802684
rougeL,2,0.815887,0.82928,0.815403
rougeLsum,0,0.78818,0.802152,0.789136
