In [1]:
# Install these packages if running from colab
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# install huggingface datasets
!pip install datasets --quiet

! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

!pip install sentencepiece --quiet

In [2]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

In [3]:
# # Load the Drive helper and mount
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# define paths
csv_path = 'w266_project_predictions/'
model_path = 'w266_project_models/'

## Load Davison Dataset

In [5]:
url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
dataset = pd.read_csv(url, index_col=0)
df = dataset

In [6]:
# Remove the initial exclamation points and the RT twitter handles
df['tweet'] = df['tweet'].apply(lambda x: ": ".join(x.split(": ")[1:]) if len(x.split(": ")) > 1 else x)
# remove the unicode symbols 

df['tweet'] = df['tweet'].apply(lambda x: re.sub("&#\d+","",x))

# remove other @handles 
df['tweet'] = df['tweet'].apply(lambda x: re.sub("@[^ ]+ ","",x))
df['tweet']

0        As a woman you shouldn't complain about cleani...
1        boy dats cold...tyga dwn bad for cuffin dat ho...
2        You ever fuck a bitch and she start to cry? Yo...
3                                   she look like a tranny
4        The shit you hear about me might be true or it...
                               ...                        
25291    right! His TL is trash ;. Now, mine? Bible scr...
25292    you've gone and broke the wrong heart baby, an...
25294    young buck wanna eat!!.. dat nigguh like I ain...
25295                youu got wild bitches tellin you lies
25296    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: tweet, Length: 24783, dtype: object

In [7]:
training_tweets, testing_tweets = train_test_split(df, test_size=0.2, random_state=25, stratify = df['class'])
valid_tweets, testing_tweets = train_test_split(testing_tweets, test_size = 0.5, random_state=25, stratify = testing_tweets['class'])

In [8]:
print(f"No. of training examples: {training_tweets.shape[0]}")
print(f"No. of validation examples: {valid_tweets.shape[0]}")
print(f"No. of testing examples: {testing_tweets.shape[0]}")

No. of training examples: 19826
No. of validation examples: 2478
No. of testing examples: 2479


## model.generate() to a CSV file

In [9]:
model_checkpoint = "facebook/bart-large-cnn"
model_name = 'bart_cnn_weights.hdf5'

In [10]:
from transformers import BartTokenizer, TFBartForConditionalGeneration

bart_cnn_model = TFBartForConditionalGeneration.from_pretrained(model_checkpoint)
bart_tokenizer = BartTokenizer.from_pretrained(model_checkpoint)

All model checkpoint layers were used when initializing TFBartForConditionalGeneration.

All the layers of TFBartForConditionalGeneration were initialized from the model checkpoint at facebook/bart-large-cnn.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [11]:
max_length = 65

In [12]:
checkpoint_file = model_path + model_name
bart_cnn_model.load_weights(checkpoint_file)

In [13]:
import time

In [14]:
testing_tweets['tweet']

601      "Why would you wanna be the Green Ranger? He's...
2353     #HolySpirit God still share HIS #Secrets Amos ...
24847                                       pancakes trash
21958    The KFAN mock draft continues, Cleveland is "o...
10327    I be telling Mcgirt music ain't enough.You got...
                               ...                        
21327     Slack jawed yokel husband http://t.co/VE1PWFrz9t
8898     Dating you would be like Darnell dating that f...
3744     Did you say spray tan? **Charlie Crist switche...
24157    bitches be like " I'm a squirter" but thinkin ...
24013           Zack still questions my love for Oreos lol
Name: tweet, Length: 2479, dtype: object

In [15]:
inputs = []
predictions = []
curr_df = testing_tweets['tweet']
length = len(curr_df)
batch_size = 10

for i in range(int(length/batch_size)):
  start_time = time.time()
  list_start = int(i*batch_size)
  list_end = int((i+1)*batch_size)
  if (int(i+1)*batch_size > length):
    list_end = length-1

  input_tokenized = bart_tokenizer(list(curr_df[list_start:list_end]), return_tensors="tf",padding=True, truncation=True).input_ids
  summary_ids = bart_cnn_model.generate(input_tokenized, num_beams=2, min_length=0, max_length=max_length)
  
  prediction = bart_tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  input = curr_df[list_start:list_end]
  
  predictions.extend(prediction)
  inputs.extend(input)

  if i % 2 == 0:
    end_time = time.time()
    print('complete', i*batch_size, '/', length, ': ', end_time - start_time, 'per record.')
#print(len(val_references))

complete 0 / 2479 :  17.4843008518219 per record.
complete 20 / 2479 :  20.20796489715576 per record.
complete 40 / 2479 :  16.9012348651886 per record.
complete 60 / 2479 :  18.953460454940796 per record.
complete 80 / 2479 :  20.6339693069458 per record.
complete 100 / 2479 :  20.221646785736084 per record.
complete 120 / 2479 :  18.494943618774414 per record.
complete 140 / 2479 :  18.132465839385986 per record.
complete 160 / 2479 :  17.706299304962158 per record.
complete 180 / 2479 :  19.654677629470825 per record.
complete 200 / 2479 :  21.192508935928345 per record.
complete 220 / 2479 :  24.498835563659668 per record.
complete 240 / 2479 :  13.387699127197266 per record.
complete 260 / 2479 :  17.44926428794861 per record.
complete 280 / 2479 :  15.749465465545654 per record.
complete 300 / 2479 :  18.91218376159668 per record.
complete 320 / 2479 :  19.147905826568604 per record.
complete 340 / 2479 :  19.047999620437622 per record.
complete 360 / 2479 :  17.871283531188965 p

In [16]:
dict = {'test_inputs': inputs, 'test_predictions': predictions}  
       
df = pd.DataFrame(dict) 
# # saving the output dataframe to a csv file
output_file_name = 'davidson_bart_cnn_test.csv'
df.to_csv(csv_path + output_file_name, index = False) 