In [None]:
# Install these packages if running from colab
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

!pip install rouge-score nltk --quiet
!pip install huggingface_hub --quiet
!pip install git+https://github.com/google-research/bleurt.git --quiet

[K     |████████████████████████████████| 5.5 MB 5.2 MB/s 
[K     |████████████████████████████████| 182 kB 47.6 MB/s 
[K     |████████████████████████████████| 7.6 MB 43.1 MB/s 
[K     |████████████████████████████████| 451 kB 4.7 MB/s 
[K     |████████████████████████████████| 212 kB 49.5 MB/s 
[K     |████████████████████████████████| 115 kB 57.8 MB/s 
[K     |████████████████████████████████| 127 kB 41.4 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 352 kB 4.7 MB/s 
[K     |████████████████████████████████| 1.3 MB 60.9 MB/s 
[?25h  Building wheel for BLEURT (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

<h2> Loading the data and the predictions </h2>



In [None]:
# Load the Drive helper and mount
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
dataset_path = 'drive/MyDrive/Colab Notebooks/w266_project_data'
csv_path = 'drive/MyDrive/Colab Notebooks/w266_project_predictions/'

# for local runs
#dataset_path = 'w266_project_data/'
#csv_path = 'w266_project_predictions/'

In [None]:
# a list of model's output CSV file
output_file_names = ['bart_large_zsl.csv', 'bart_xsum_zsl.csv', 'bart_cnn_zsl.csv', 
                     'bart_large_ft.csv', 'bart_xsum_ft.csv', 'bart_cnn_ft.csv',
                     't5_large_zsl.csv', 't5_large_ft.csv', 'bart_detox_zsl.csv']

In [None]:
# load the dataset
dataset = load_from_disk(dataset_path)

<h2> ROUGE </h2>

In [None]:
metric = load_metric("rouge")

  """Entry point for launching an IPython kernel.


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
for output_file_name in output_file_names:
  df_bart_predictions = pd.read_csv(csv_path + output_file_name)
  rouge_results = metric.compute(predictions=df_bart_predictions['test_predictions'],
                                 references=df_bart_predictions['test_references'])
  display(output_file_name)
  display(pd.concat({k: pd.DataFrame(v) for k, v in rouge_results.items()}))
  print()

'bart_large_zsl.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.688517,0.781481,0.724142
rouge1,1,0.702129,0.794881,0.738238
rouge1,2,0.714339,0.80859,0.750571
rouge2,0,0.537471,0.615092,0.567436
rouge2,1,0.553202,0.631382,0.583236
rouge2,2,0.569334,0.648449,0.599433
rougeL,0,0.684449,0.776822,0.720727
rougeL,1,0.69788,0.790671,0.734085
rougeL,2,0.711233,0.80445,0.747015
rougeLsum,0,0.684306,0.776621,0.72081





'bart_xsum_zsl.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.191019,0.272801,0.212768
rouge1,1,0.206122,0.289535,0.227288
rouge1,2,0.221542,0.307217,0.243133
rouge2,0,0.089874,0.110479,0.093997
rouge2,1,0.102735,0.124492,0.106526
rouge2,2,0.117075,0.141416,0.121309
rougeL,0,0.181514,0.256348,0.201172
rougeL,1,0.19503,0.272375,0.214562
rougeL,2,0.210274,0.288914,0.22932
rougeLsum,0,0.181125,0.256332,0.199828





'bart_cnn_zsl.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.685187,0.799805,0.731276
rouge1,1,0.699297,0.81407,0.744732
rouge1,2,0.711439,0.82737,0.757639
rouge2,0,0.537947,0.634818,0.577052
rouge2,1,0.553172,0.650995,0.591744
rouge2,2,0.569413,0.668629,0.608059
rougeL,0,0.68134,0.79619,0.727242
rougeL,1,0.695205,0.809636,0.740826
rougeL,2,0.709052,0.82317,0.753866
rougeLsum,0,0.681728,0.795541,0.7276





'bart_large_ft.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.792845,0.79866,0.789376
rouge1,1,0.807479,0.811814,0.803053
rouge1,2,0.821168,0.824399,0.815986
rouge2,0,0.674311,0.675968,0.669937
rouge2,1,0.6937,0.695418,0.688902
rouge2,2,0.711133,0.713571,0.706033
rougeL,0,0.788656,0.79388,0.784993
rougeL,1,0.803411,0.807842,0.799052
rougeL,2,0.817888,0.82011,0.812318
rougeLsum,0,0.788492,0.794578,0.785125





'bart_xsum_ft.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.802034,0.799533,0.793616
rouge1,1,0.816644,0.81226,0.806952
rouge1,2,0.83037,0.824961,0.820107
rouge2,0,0.683517,0.675343,0.672692
rouge2,1,0.701247,0.693399,0.690382
rouge2,2,0.719554,0.711657,0.708355
rougeL,0,0.798481,0.795099,0.789711
rougeL,1,0.812558,0.808519,0.803213
rougeL,2,0.826545,0.821777,0.816451
rougeLsum,0,0.797543,0.795524,0.789834





'bart_cnn_ft.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.79418,0.813883,0.798306
rouge1,1,0.80883,0.827553,0.812179
rouge1,2,0.822415,0.839855,0.825241
rouge2,0,0.678197,0.693577,0.68121
rouge2,1,0.696424,0.711526,0.69899
rouge2,2,0.714564,0.730022,0.716697
rougeL,0,0.790531,0.810863,0.795335
rougeL,1,0.805258,0.823825,0.808734
rougeL,2,0.819257,0.837091,0.822124
rougeLsum,0,0.79113,0.810463,0.795406





't5_large_zsl.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.637967,0.705416,0.656148
rouge1,1,0.653627,0.720261,0.670898
rouge1,2,0.667829,0.73661,0.684905
rouge2,0,0.486931,0.541429,0.500572
rouge2,1,0.503175,0.558681,0.516422
rouge2,2,0.520308,0.577213,0.533737
rougeL,0,0.630376,0.696037,0.649057
rougeL,1,0.644972,0.711278,0.661979
rougeL,2,0.6602,0.727636,0.676845
rougeLsum,0,0.62982,0.694531,0.647661





't5_large_ft.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.79179,0.807188,0.793651
rouge1,1,0.805683,0.820467,0.8069
rouge1,2,0.819684,0.833042,0.81978
rouge2,0,0.673043,0.682798,0.672599
rouge2,1,0.690712,0.701534,0.690672
rouge2,2,0.708606,0.72053,0.708727
rougeL,0,0.787621,0.802902,0.789588
rougeL,1,0.801861,0.81679,0.80298
rougeL,2,0.816356,0.83023,0.816767
rougeLsum,0,0.787347,0.803322,0.789701





'bart_detox_zsl.csv'

Unnamed: 0,Unnamed: 1,precision,recall,fmeasure
rouge1,0,0.844398,0.841615,0.838008
rouge1,1,0.857302,0.853718,0.850361
rouge1,2,0.869847,0.866073,0.86269
rouge2,0,0.745749,0.740237,0.738328
rouge2,1,0.763183,0.758268,0.756373
rouge2,2,0.780244,0.775732,0.773688
rougeL,0,0.839524,0.836499,0.832944
rougeL,1,0.85372,0.850309,0.846827
rougeL,2,0.866368,0.862831,0.85945
rougeLsum,0,0.840079,0.836745,0.833743





<h2> Meteor </h2>

In [None]:
meteor_metric = load_metric("meteor")

Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
for output_file_name in output_file_names:
  df_bart_predictions = pd.read_csv(csv_path + output_file_name)
  meteor_results = meteor_metric.compute(predictions=df_bart_predictions['test_predictions'],
                                 references=df_bart_predictions['test_references'])
  display(output_file_name)
  display(meteor_results['meteor'])
  print()
  # display(pd.concat({k: pd.DataFrame(v) for k, v in meteor_results.items()}))
  # print()

'bart_large_zsl.csv'

0.7831905637834498




'bart_xsum_zsl.csv'

0.3399882588794349




'bart_cnn_zsl.csv'

0.8012238046879839




'bart_large_ft.csv'

0.8048756009317317




'bart_xsum_ft.csv'

0.8025552798060527




'bart_cnn_ft.csv'

0.8217782469026045




't5_large_zsl.csv'

0.7154511639953924




't5_large_ft.csv'

0.8144609259256198




'bart_detox_zsl.csv'

0.8441763640930271




<h2> BLEU </h2>

In [None]:
bleu_metric = load_metric("bleu")

Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

In [None]:
for output_file_name in output_file_names:
  df_bart_predictions = pd.read_csv(csv_path + output_file_name)
  bleu_predictions = [i.split(' ') for i in df_bart_predictions['test_predictions']]
  bleu_references = [[i.split(' ')] for i in df_bart_predictions['test_references']]

  bleu_results = bleu_metric.compute(predictions=bleu_predictions,
                                 references=bleu_references)
  display(output_file_name)
  bleu_results = pd.DataFrame.from_dict(bleu_results)
  display(bleu_results)
  print()

'bart_large_zsl.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.375732,0.539229,1.0,1.219567,11331,9291
1,0.375732,0.418004,1.0,1.219567,11331,9291
2,0.375732,0.330161,1.0,1.219567,11331,9291
3,0.375732,0.267814,1.0,1.219567,11331,9291





'bart_xsum_zsl.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.039412,0.111641,1.0,1.648585,15317,9291
1,0.039412,0.044319,1.0,1.648585,15317,9291
2,0.039412,0.027136,1.0,1.648585,15317,9291
3,0.039412,0.01797,1.0,1.648585,15317,9291





'bart_cnn_zsl.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.375532,0.536275,1.0,1.261005,11716,9291
1,0.375532,0.416706,1.0,1.261005,11716,9291
2,0.375532,0.330766,1.0,1.261005,11716,9291
3,0.375532,0.269059,1.0,1.261005,11716,9291





'bart_large_ft.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.479023,0.628551,1.0,1.045743,9716,9291
1,0.479023,0.516902,1.0,1.045743,9716,9291
2,0.479023,0.433316,1.0,1.045743,9716,9291
3,0.479023,0.374001,1.0,1.045743,9716,9291





'bart_xsum_ft.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.494141,0.647538,1.0,1.029706,9567,9291
1,0.494141,0.53439,1.0,1.029706,9567,9291
2,0.494141,0.448149,1.0,1.029706,9567,9291
3,0.494141,0.384464,1.0,1.029706,9567,9291





'bart_cnn_ft.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.474616,0.620136,1.0,1.111829,10330,9291
1,0.474616,0.512151,1.0,1.111829,10330,9291
2,0.474616,0.429957,1.0,1.111829,10330,9291
3,0.474616,0.371588,1.0,1.111829,10330,9291





't5_large_zsl.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.318027,0.477364,1.0,1.210096,11243,9291
1,0.318027,0.359079,1.0,1.210096,11243,9291
2,0.318027,0.275523,1.0,1.210096,11243,9291
3,0.318027,0.216598,1.0,1.210096,11243,9291





't5_large_ft.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.476999,0.625823,1.0,1.079539,10030,9291
1,0.476999,0.51554,1.0,1.079539,10030,9291
2,0.476999,0.43157,1.0,1.079539,10030,9291
3,0.476999,0.371797,1.0,1.079539,10030,9291





'bart_detox_zsl.csv'

Unnamed: 0,bleu,precisions,brevity_penalty,length_ratio,translation_length,reference_length
0,0.597164,0.718202,1.0,1.024755,9521,9291
1,0.597164,0.62752,1.0,1.024755,9521,9291
2,0.597164,0.557529,1.0,1.024755,9521,9291
3,0.597164,0.506098,1.0,1.024755,9521,9291





<h2> BLEURT </h2>

In [None]:
bleurt_metric = load_metric('bleurt')

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

In [None]:
for output_file_name in output_file_names:
  df_bart_predictions = pd.read_csv(csv_path + output_file_name)
  bleurt_predictions = [i.split(' ') for i in df_bart_predictions['test_predictions']]
  bleurt_references = [[i.split(' ')] for i in df_bart_predictions['test_references']]

  bleurt_results = bleurt_metric.compute(predictions=bleurt_predictions,
                                 references=bleurt_references)
  display(output_file_name)
  bleurt_results = pd.DataFrame.from_dict(bleurt_results).describe()
  display(bleurt_results)
  print()

'bart_large_zsl.csv'

Unnamed: 0,scores
count,989.0
mean,-0.035845
std,0.402242
min,-1.265521
25%,-0.33101
50%,-0.017892
75%,0.263814
max,0.880804





'bart_xsum_zsl.csv'

Unnamed: 0,scores
count,989.0
mean,-0.764454
std,0.356919
min,-1.634521
25%,-0.97777
50%,-0.847332
75%,-0.65265
max,0.880804





'bart_cnn_zsl.csv'

Unnamed: 0,scores
count,989.0
mean,-0.043494
std,0.407319
min,-1.327717
25%,-0.345793
50%,-0.018563
75%,0.26873
max,0.880804





'bart_large_ft.csv'

Unnamed: 0,scores
count,989.0
mean,0.217994
std,0.461496
min,-1.265521
25%,-0.124003
50%,0.267895
75%,0.606672
max,0.905509





'bart_xsum_ft.csv'

Unnamed: 0,scores
count,989.0
mean,0.21263
std,0.466758
min,-1.272143
25%,-0.148375
50%,0.264303
75%,0.605917
max,0.912668





'bart_cnn_ft.csv'

Unnamed: 0,scores
count,989.0
mean,0.22198
std,0.460912
min,-1.333405
25%,-0.122121
50%,0.275372
75%,0.615036
max,0.909683





't5_large_zsl.csv'

Unnamed: 0,scores
count,989.0
mean,-0.164345
std,0.403049
min,-1.287836
25%,-0.451366
50%,-0.169637
75%,0.141818
max,0.880804





't5_large_ft.csv'

Unnamed: 0,scores
count,989.0
mean,0.203451
std,0.46967
min,-1.154151
25%,-0.143845
50%,0.266958
75%,0.605917
max,0.926797





'bart_detox_zsl.csv'

Unnamed: 0,scores
count,989.0
mean,0.365583
std,0.459727
min,-1.154151
25%,0.035488
50%,0.47829
75%,0.790109
max,0.923983





### examine a few predictions in each model

In [None]:
print('The first 5 example\'s input and label')
pd.DataFrame(dataset['test'][:5])

The first 5 example's input and label


Unnamed: 0,en_toxic_comment,en_neutral_comment
0,"holy shit , the 61 61 view must be hilarious .",The 6161 view must be funny
1,all she has to do is smile and keep her cool w...,All she has to do is smile and keep her cool w...
2,can we apply that litmus test to muslims that ...,can we apply that litmus test to muslims that ...
3,"thats fucking it , im joining",That's it! I'm joining.
4,the residents of davis have a collective stick...,The residents of Davis are not so pleasant.


In [None]:
# examine a few predictions in each model
for output_file_name in output_file_names:
  df_bart_predictions = pd.read_csv(csv_path + output_file_name)
  display(output_file_name)
  display(df_bart_predictions.head(5))
  print()

'bart_large_zsl.csv'

Unnamed: 0,test_predictions,test_references
0,"['holy shit , the 61 61 view must be hilarious...",['The 6161 view must be funny']
1,['all she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"['thats fucking it , im joining']","[""That's it! I'm joining.""]"
4,['the residents of davis have a collective sti...,['The residents of Davis are not so pleasant.']





'bart_xsum_zsl.csv'

Unnamed: 0,test_predictions,test_references
0,['The BBC has released a video of the moment t...,['The 6161 view must be funny']
1,['Theresa May has been praised for the way she...,['All she has to do is smile and keep her cool...
2,['A few years ago I wrote a post saying that i...,['can we apply that litmus test to muslims tha...
3,"[""Thats what I thought about it, so I'm going ...","[""That's it! I'm joining.""]"
4,"[""The BBC has been given a special report on t...",['The residents of Davis are not so pleasant.']





'bart_cnn_zsl.csv'

Unnamed: 0,test_predictions,test_references
0,"['holy shit , the 61 61 view must be hilarious...",['The 6161 view must be funny']
1,['all she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"['thats fucking it , im joining.']","[""That's it! I'm joining.""]"
4,['The residents of davis have a collective sti...,['The residents of Davis are not so pleasant.']





'bart_large_ft.csv'

Unnamed: 0,test_predictions,test_references
0,['The 61 61 view must be hilarious.'],['The 6161 view must be funny']
1,['all she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['Can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"[""That's it, im joining""]","[""That's it! I'm joining.""]"
4,['the residents of davis have a collective sti...,['The residents of Davis are not so pleasant.']





'bart_xsum_ft.csv'

Unnamed: 0,test_predictions,test_references
0,['The 61 61 view must be hilarious.'],['The 6161 view must be funny']
1,['All she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"[""I'm joining""]","[""That's it! I'm joining.""]"
4,['the residents of davis have a collective'],['The residents of Davis are not so pleasant.']





'bart_cnn_ft.csv'

Unnamed: 0,test_predictions,test_references
0,['The 61 61 view must be hilarious.'],['The 6161 view must be funny']
1,['all she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"['thats it , im joining']","[""That's it! I'm joining.""]"
4,['the residents of davis have a collective pro...,['The residents of Davis are not so pleasant.']





't5_large_zsl.csv'

Unnamed: 0,test_predictions,test_references
0,['the 61 61 view must be hilarious .'],['The 6161 view must be funny']
1,['all she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,['im joining youtube . im joining youtube . im...,"[""That's it! I'm joining.""]"
4,['residents of davis have a collective stick u...,['The residents of Davis are not so pleasant.']





't5_large_ft.csv'

Unnamed: 0,test_predictions,test_references
0,['The 61 61 view must be hilarious.'],['The 6161 view must be funny']
1,['All she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"['thats it , im joining']","[""That's it! I'm joining.""]"
4,['The residents of davis have a collective pro...,['The residents of Davis are not so pleasant.']





'bart_detox_zsl.csv'

Unnamed: 0,test_predictions,test_references
0,['The 61 61 view must be hilarious'],['The 6161 view must be funny']
1,['All she has to do is smile and keep her cool...,['All she has to do is smile and keep her cool...
2,['can we apply that litmus test to muslims tha...,['can we apply that litmus test to muslims tha...
3,"['thats it , im joining']","[""That's it! I'm joining.""]"
4,['The residents of Davis are not good.'],['The residents of Davis are not so pleasant.']



