In [18]:
# Install these packages if running from colab
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

!pip install sentencepiece --quiet

In [19]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

In [20]:
# # Load the Drive helper and mount
# from google.colab import drive
# drive.mount('/content/drive')

In [21]:
# define paths
csv_path = 'w266_project_predictions/'
model_path = 'w266_project_models/'

#### Change these variables as needed for different model and different file name

In [22]:
# change these variables for different models
model_checkpoint = "t5-large"
model_name = 't5_weights.hdf5'
output_file_name = 'davidson_t5_output.csv'

## load BART large pretrain model

In [23]:
from transformers import T5Tokenizer, TFT5Model, TFT5ForConditionalGeneration

model = TFT5ForConditionalGeneration.from_pretrained(model_checkpoint)
tokenizer = T5Tokenizer.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-large.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


ResourceExhaustedError: ignored

In [None]:
model.summary()

Model: "tft5_for_conditional_generation"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  32899072  
                                                                 
 encoder (TFT5MainLayer)     multiple                  334939648 
                                                                 
 decoder (TFT5MainLayer)     multiple                  435627520 
                                                                 
Total params: 737,668,096
Trainable params: 737,668,096
Non-trainable params: 0
_________________________________________________________________


### Load Fine-Tuned BART Model

In [None]:
# The model weights (that are considered the best) are loaded into the
# model.
checkpoint_file = model_path + model_name
model.load_weights(checkpoint_file)

In [None]:
try_phrase = 'brah im fucked up over here .'
input_tokenized = tokenizer([try_phrase], return_tensors="tf").input_ids
summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=25)

prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

print(prediction)

['im messed over here .']


## Load Davison Dataset

In [None]:
url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
dataset = pd.read_csv(url, index_col=0)
df = dataset

In [None]:
# Remove the initial exclamation points and the RT twitter handles
df['tweet'] = df['tweet'].apply(lambda x: ": ".join(x.split(": ")[1:]) if len(x.split(": ")) > 1 else x)
# remove the unicode symbols 

df['tweet'] = df['tweet'].apply(lambda x: re.sub("&#\d+","",x))

# remove other @handles 
df['tweet'] = df['tweet'].apply(lambda x: re.sub("@[^ ]+ ","",x))
df['tweet']

0        As a woman you shouldn't complain about cleani...
1        boy dats cold...tyga dwn bad for cuffin dat ho...
2        You ever fuck a bitch and she start to cry? Yo...
3                                   she look like a tranny
4        The shit you hear about me might be true or it...
                               ...                        
25291    right! His TL is trash ;. Now, mine? Bible scr...
25292    you've gone and broke the wrong heart baby, an...
25294    young buck wanna eat!!.. dat nigguh like I ain...
25295                youu got wild bitches tellin you lies
25296    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: tweet, Length: 24783, dtype: object

In [11]:
training_tweets, testing_tweets = train_test_split(df, test_size=0.2, random_state=25, stratify = df['class'])
valid_tweets, testing_tweets = train_test_split(testing_tweets, test_size = 0.5, random_state=25, stratify = testing_tweets['class'])

In [12]:
print(f"No. of training examples: {training_tweets.shape[0]}")
print(f"No. of validation examples: {valid_tweets.shape[0]}")
print(f"No. of testing examples: {testing_tweets.shape[0]}")

No. of training examples: 19826
No. of validation examples: 2478
No. of testing examples: 2479


## model.generate() to a CSV file

In [13]:
import time

In [14]:
training_tweets = training_tweets['tweet'].apply(lambda x: "summarize: " + x)
valid_tweets = valid_tweets['tweet'].apply(lambda x: "summarize: " + x)
testing_tweets = testing_tweets['tweet'].apply(lambda x: "summarize: " + x)

In [15]:
training_tweets

3083        summarize: lol tell dat white bitch I said hey
9072                     summarize: Don't trust these hoes
14709    summarize: I'd like to issue a formal apology ...
6042                       summarize: what I thought bitch
10215                        summarize: How you livin hoe.
                               ...                        
21070    summarize: Say I'm wrong but I can treat that ...
10352    summarize: I bet that nigga got charged for mu...
18148    summarize: Obama ain't playin with these hoes....
6524     summarize: niggah you dont know me! Lol you do...
10767    summarize: I hate when a girls all "I'm single...
Name: tweet, Length: 19826, dtype: object

In [17]:
inputs = []
predictions = []
curr_df = training_tweets
length = len(curr_df)
batch_size = 10

for i in range(int(length/batch_size)):
  start_time = time.time()
  list_start = int(i*batch_size)
  list_end = int((i+1)*batch_size)
  if (int(i+1)*batch_size > length):
    list_end = length-1

  input_tokenized = tokenizer(list(curr_df[list_start:list_end]), return_tensors="tf",padding=True, truncation=True).input_ids
  summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=80)
  
  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  input = curr_df[list_start:list_end]
  
  predictions.extend(prediction)
  inputs.extend(input)

  if i % 2 == 0:
    end_time = time.time()
    print('complete', i*batch_size, '/', length, ': ', end_time - start_time, 'per record.')
#print(len(val_references))

complete 0 / 19826 :  24.66452383995056 per record.
complete 20 / 19826 :  21.408741235733032 per record.
complete 40 / 19826 :  21.090230226516724 per record.
complete 60 / 19826 :  21.684492826461792 per record.
complete 80 / 19826 :  21.412635803222656 per record.
complete 100 / 19826 :  21.832579612731934 per record.
complete 120 / 19826 :  21.200722694396973 per record.
complete 140 / 19826 :  20.449758052825928 per record.
complete 160 / 19826 :  21.26646137237549 per record.
complete 180 / 19826 :  21.18228578567505 per record.
complete 200 / 19826 :  20.681045055389404 per record.
complete 220 / 19826 :  21.341191053390503 per record.
complete 240 / 19826 :  20.75008726119995 per record.
complete 260 / 19826 :  20.441108226776123 per record.
complete 280 / 19826 :  20.560998678207397 per record.
complete 300 / 19826 :  20.613872528076172 per record.
complete 320 / 19826 :  20.882510900497437 per record.
complete 340 / 19826 :  20.37903594970703 per record.
complete 360 / 19826 

In [23]:
dict = {'train_inputs': inputs, 'train_predictions': predictions}  
       
df = pd.DataFrame(dict) 

In [24]:
# # saving the output dataframe to a csv file
output_file_name = 'davidson_t5_train_output.csv'
df.to_csv(csv_path + output_file_name, index = False) 

In [16]:
inputs = []
predictions = []
curr_df = testing_tweets
length = len(curr_df)
batch_size = 10

for i in range(int(length/batch_size)):
  start_time = time.time()
  list_start = int(i*batch_size)
  list_end = int((i+1)*batch_size)
  if (int(i+1)*batch_size > length):
    list_end = length-1

  input_tokenized = tokenizer(list(curr_df[list_start:list_end]), return_tensors="tf",padding=True, truncation=True).input_ids
  summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=80)
  
  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  input = curr_df[list_start:list_end]
  
  predictions.extend(prediction)
  inputs.extend(input)

  if i % 2 == 0:
    end_time = time.time()
    print('complete', i*batch_size, '/', length, ': ', end_time - start_time, 'per record.')
#print(len(val_references))

complete 0 / 2479 :  24.407761573791504 per record.
complete 20 / 2479 :  20.691839456558228 per record.
complete 40 / 2479 :  20.785637617111206 per record.
complete 60 / 2479 :  20.677837133407593 per record.
complete 80 / 2479 :  20.99403953552246 per record.
complete 100 / 2479 :  20.467914819717407 per record.
complete 120 / 2479 :  20.65203285217285 per record.
complete 140 / 2479 :  20.2889986038208 per record.
complete 160 / 2479 :  20.540759325027466 per record.
complete 180 / 2479 :  20.470852613449097 per record.
complete 200 / 2479 :  20.140308141708374 per record.
complete 220 / 2479 :  20.972471475601196 per record.
complete 240 / 2479 :  20.265602111816406 per record.
complete 260 / 2479 :  20.519848585128784 per record.
complete 280 / 2479 :  20.4103741645813 per record.
complete 300 / 2479 :  20.211116075515747 per record.
complete 320 / 2479 :  20.82405376434326 per record.
complete 340 / 2479 :  20.587030172348022 per record.
complete 360 / 2479 :  20.358636617660522

In [17]:
dict = {'test_inputs': inputs, 'test_predictions': predictions}  
       
df = pd.DataFrame(dict) 
# # saving the output dataframe to a csv file
output_file_name = 'davidson_t5_test_output.csv'
df.to_csv(csv_path + output_file_name, index = False) 

In [None]:
inputs = []
predictions = []
curr_df = valid_tweets
length = len(curr_df)
batch_size = 10

for i in range(int(length/batch_size)):
  start_time = time.time()
  list_start = int(i*batch_size)
  list_end = int((i+1)*batch_size)
  if (int(i+1)*batch_size > length):
    list_end = length-1

  input_tokenized = tokenizer(list(curr_df[list_start:list_end]), return_tensors="tf",padding=True, truncation=True).input_ids
  summary_ids = model.generate(input_tokenized, num_beams=2, min_length=0, max_length=80)
  
  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  input = curr_df[list_start:list_end]
  
  predictions.extend(prediction)
  inputs.extend(input)

  if i % 2 == 0:
    end_time = time.time()
    print('complete', i*batch_size, '/', length, ': ', end_time - start_time, 'per record.')
#print(len(val_references))

In [None]:
dict = {'valid_inputs': inputs, 'valid_predictions': predictions}  
       
df = pd.DataFrame(dict) 
# # saving the output dataframe to a csv file
output_file_name = 'davidson_t5_valid_output.csv'
df.to_csv(csv_path + output_file_name, index = False) 

In [None]:
output_file_name = 'bart_cnn_test_ft.csv'
model_checkpoint = "facebook/bart-large-cnn"
model_name = 'bart_cnn_weights.hdf5'

In [None]:
from transformers import BartTokenizer, TFBartForConditionalGeneration

bart_cnn_model = TFBartForConditionalGeneration.from_pretrained(model_checkpoint)
bart_tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

In [None]:
checkpoint_file = model_path + model_name
model.load_weights(checkpoint_file)

In [None]:
inputs = []
predictions = []
curr_df = testing_tweets
length = len(curr_df)
batch_size = 10

for i in range(int(length/batch_size)):
  start_time = time.time()
  list_start = int(i*batch_size)
  list_end = int((i+1)*batch_size)
  if (int(i+1)*batch_size > length):
    list_end = length-1

  input_tokenized = bart_tokenizer(list(curr_df[list_start:list_end]), return_tensors="tf",padding=True, truncation=True).input_ids
  summary_ids = bart_cnn_model.generate(input_tokenized, num_beams=2, min_length=0, max_length=80)
  
  prediction = bart_tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  input = curr_df[list_start:list_end]
  
  predictions.extend(prediction)
  inputs.extend(input)

  if i % 2 == 0:
    end_time = time.time()
    print('complete', i*batch_size, '/', length, ': ', end_time - start_time, 'per record.')
#print(len(val_references))

In [None]:
dict = {'test_inputs': inputs, 'test_predictions': predictions}  
       
df = pd.DataFrame(dict) 
# # saving the output dataframe to a csv file
output_file_name = 'bart_cnn_test.csv'
df.to_csv(csv_path + output_file_name, index = False) 