In [1]:
# Install these packages if running from colab
# NOTE(review): none of the installs below pin a version, so a fresh run may pull
# newer releases than the ones this notebook was executed with.
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

# rouge-score / nltk: presumably needed by the "rouge" metric loaded below -- confirm
! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

[K     |████████████████████████████████| 5.5 MB 17.5 MB/s 
[K     |████████████████████████████████| 7.6 MB 65.8 MB/s 
[K     |████████████████████████████████| 163 kB 70.2 MB/s 
[K     |████████████████████████████████| 441 kB 23.5 MB/s 
[K     |████████████████████████████████| 95 kB 5.0 MB/s 
[K     |████████████████████████████████| 115 kB 71.4 MB/s 
[K     |████████████████████████████████| 212 kB 59.9 MB/s 
[K     |████████████████████████████████| 127 kB 73.4 MB/s 
[K     |████████████████████████████████| 115 kB 71.5 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

In [3]:
# Load the Colab Drive helper and mount Google Drive at /content/drive.
# The dataset and prediction folders configured below live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# define paths
# Absolute paths under the Drive mount point ('/content/drive'), so they resolve
# no matter what the notebook's current working directory is.
dataset_path = '/content/drive/MyDrive/Colab Notebooks/w266_project_data'
# The trailing slash is intentional: later cells build the output file path by
# concatenating the file name directly onto this prefix.
csv_path = '/content/drive/MyDrive/Colab Notebooks/w266_project_predictions/'

#### Change these variables as needed to use a different model and output file name

In [5]:
# Per-run settings: the checkpoint to evaluate, and the name of the CSV file
# that will hold its predictions.
model_checkpoint = "facebook/bart-large-cnn"
output_file_name = 'bart_cnn_zsl.csv'

### Import and examine our dataset

In [6]:
# load the dataset
# (read back from the folder named by `dataset_path`; the next cell shows its splits)
dataset = load_from_disk(dataset_path)

In [7]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 17789
    })
    test: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 989
    })
    valid: Dataset({
        features: ['en_toxic_comment', 'en_neutral_comment'],
        num_rows: 988
    })
})

In [8]:
# Load the ROUGE metric used to score predictions against references.
# NOTE(review): the recorded output shows a warning raised on this line --
# presumably the `load_metric` deprecation notice; confirm and consider migrating.
metric = load_metric("rouge")

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [9]:
# View the training split as a DataFrame: one toxic / neutral comment pair per row.
pd.DataFrame(dataset['train'])

Unnamed: 0,en_toxic_comment,en_neutral_comment
0,u 'd be surprised all the shit u think about w...,You would be supriesd all the things you think...
1,plenty of holocaust deniers actually believe t...,Plenty of Holocaust deniers actually believe t...
2,holy fuck i 'm so going,i'm so going
3,i 've driven by lots of cattle farms but holy ...,i 've driven by lots of cattle farms but this
4,so much better this shit to soft for me,So much better this is to soft for me.
...,...,...
17784,"but , thats exactly what a couple of you idiot...","But, that’s exactly what a couple of you have ..."
17785,he probably assessed the situation and decided...,He probably assessed the situation and decided...
17786,brah im fucked up over here .,Brah I am feeling bad over here
17787,shit with my friends and not being able to tru...,with my friends and not being able to trust th...


## Load the pretrained BART-large-CNN model

In [11]:
# Imported here rather than with the other imports at the top of the notebook.
from transformers import BartTokenizer, TFBartForConditionalGeneration

# Load the TensorFlow BART seq2seq model and its matching tokenizer, both from
# the checkpoint named by `model_checkpoint`.
model = TFBartForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [12]:
# Print the layer / parameter-count summary of the loaded model.
model.summary()

Model: "tf_bart_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 model (TFBartMainLayer)     multiple                  406290432 
                                                                 
 final_logits_bias (BiasLaye  multiple                 50264     
 r)                                                              
                                                                 
Total params: 406,340,696
Trainable params: 406,290,432
Non-trainable params: 50,264
_________________________________________________________________


### Preprocess: Encode detox dataset with BART tokenizer

In [13]:
## Encode the detox text pairs with the BART tokenizer
max_length = 25  # token cap applied to both the toxic inputs and the neutral targets

def preprocess_function(examples):
    """Tokenize a batch of toxic comments and their neutral rewrites.

    Args:
        examples: a batch (mapping of column name -> list of strings) holding
            the 'en_toxic_comment' and 'en_neutral_comment' columns.

    Returns:
        The tokenizer output for the toxic comments ('input_ids',
        'attention_mask') with the token ids of the neutral comments stored
        under 'labels'. Both sides are truncated to `max_length` tokens and
        padded.
    """
    # `text_target=` tokenizes the targets in the same call and stores their ids
    # under "labels"; it replaces the deprecated `as_target_tokenizer()` context
    # manager, which emitted a deprecation warning on every call.
    return tokenizer(
        examples['en_toxic_comment'],
        text_target=examples['en_neutral_comment'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )

In [14]:
# test the preprocess function on the first two training examples
preprocess_function(dataset['train'][:2])

  "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "


{'input_ids': [[0, 257, 128, 417, 28, 3911, 70, 5, 15328, 1717, 206, 59, 77, 1717, 1236, 687, 579, 2582, 179, 89, 2], [0, 2911, 24997, 9, 18701, 43328, 3069, 4733, 888, 679, 5, 5373, 15328, 51, 224, 479, 2, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0]], 'labels': [[0, 1185, 74, 28, 28836, 4458, 417, 70, 5, 383, 47, 206, 59, 77, 47, 32, 95, 2828, 89, 4, 2], [0, 16213, 24997, 9, 13903, 3069, 4733, 888, 679, 5, 383, 51, 224, 4, 2, 1, 1, 1, 1, 1, 1]]}

In [15]:
# tokenize the detox dataset
# `batched=True` hands preprocess_function a batch of examples per call, which
# is what it expects (it reads whole columns out of `examples`).
tokenized_datasets = dataset.map(preprocess_function, batched=True)



  0%|          | 0/1 [00:00<?, ?ba/s]

### No fine-tuning (zero-shot): try on one example

In [16]:
# Pick the second test comment, encode it as a batch of one, and let the
# un-finetuned model generate up to 25 tokens with a 2-beam search.
sample_toxic_comment = dataset['test']['en_toxic_comment'][1]
input_tokenized = tokenizer([sample_toxic_comment], return_tensors="tf").input_ids
summary_ids = model.generate(
    input_tokenized,
    num_beams=2,
    min_length=0,
    max_length=25,
)

In [17]:
# Human-written neutral rewrite of the same test example (kept as a one-item list).
reference = [dataset['test']['en_neutral_comment'][1]]
# Turn the generated ids back into text, dropping special tokens.
prediction = tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

In [18]:
# Show, in order: the toxic input, the model output, and the reference rewrite.
for shown in ([dataset['test']['en_toxic_comment'][1]], prediction, reference):
    print(shown)

['all she has to do is smile and keep her cool with slobbering old wrinkled hubby .']
['all she has to do is smile and keep her cool with slobbering old wrinkled hubby.']
['All she has to do is smile and keep her cool with old aged hubby']


In [19]:
# Score the single generated sentence against its reference and pretty-print
# the resulting ROUGE scores.
rouge_results = metric.compute(references=reference, predictions=prediction)
pprint(rouge_results, compact=True)

{'rouge1': AggregateScore(low=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129), mid=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129), high=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129)),
 'rouge2': AggregateScore(low=Score(precision=0.7333333333333333, recall=0.7857142857142857, fmeasure=0.7586206896551724), mid=Score(precision=0.7333333333333333, recall=0.7857142857142857, fmeasure=0.7586206896551724), high=Score(precision=0.7333333333333333, recall=0.7857142857142857, fmeasure=0.7586206896551724)),
 'rougeL': AggregateScore(low=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129), mid=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129), high=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129)),
 'rougeLsum': AggregateScore(low=Score(precision=0.875, recall=0.9333333333333333, fmeasure=0.9032258064516129), mid=Scor

### zero-shot learning (ZSL) on the test set

In [None]:
# Run the un-finetuned model over every test example and collect, per example,
# the generated text and the human-written neutral reference.

# Look the two test columns up once instead of on every loop iteration.
toxic_comments = dataset['test']['en_toxic_comment']
neutral_comments = dataset['test']['en_neutral_comment']
test_len = len(toxic_comments)

test_predictions = []
test_references = []

for i in range(test_len):
  # Encode one comment as a batch of one and generate with the same settings
  # as the single-example cell above.
  input_tokenized = tokenizer([toxic_comments[i]], return_tensors="tf").input_ids
  summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=25)

  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  reference = neutral_comments[i]

  # Store plain strings, not one-element lists: a list would be written to the
  # CSV as its Python repr ("['...']") and later be scored as if it were text.
  test_predictions.append(prediction[0])
  test_references.append(reference)

  # Lightweight progress report every 50 examples.
  if i % 50 == 0:
    print('complete', i, '/', test_len)

complete 0 / 989


In [None]:
# Build the results table from a dictionary of lists: one column of model
# predictions and one of references. The dictionary is passed inline rather than
# bound to the name `dict`, which would shadow the builtin for every later cell.
df = pd.DataFrame({'test_predictions': test_predictions, 'test_references': test_references})

In [None]:
# saving the output dataframe to a csv file
# Create the predictions folder first so the write does not fail on a missing
# directory after the long generation loop has already run.
os.makedirs(csv_path, exist_ok=True)
df.to_csv(csv_path + output_file_name, index=False)

In [None]:
# Reload the saved predictions from the CSV written above.
predictions_csv = csv_path + output_file_name
df_bart_predictions = pd.read_csv(predictions_csv)

# Quick sanity check: first rows, then (rows, columns), then the column names.
for view in (
    df_bart_predictions.head(),
    df_bart_predictions.shape,
    df_bart_predictions.columns,
):
    print(view)

In [None]:
# Score every reloaded prediction against its reference and pretty-print the
# aggregated ROUGE results.
scored_columns = {
    'predictions': df_bart_predictions['test_predictions'],
    'references': df_bart_predictions['test_references'],
}
rouge_results = metric.compute(**scored_columns)
pprint(rouge_results, compact=True)

In [None]:
# View the ROUGE results dictionary as a DataFrame.
pd.DataFrame.from_dict(rouge_results)

In [None]:
# Build one small DataFrame per ROUGE variant and stack them, keyed by the
# variant name (the dictionary keys become the outer index level).
pd.concat({k: pd.DataFrame(v) for k, v in rouge_results.items()})