In [None]:
# Install the third-party packages this notebook needs (run this cell when on Colab)
!pip install tensorflow-datasets --quiet
!pip install pydot --quiet
!pip install transformers --quiet

# Hugging Face `datasets` library
!pip install datasets --quiet

! pip install rouge-score nltk --quiet
! pip install huggingface_hub --quiet

!pip install sentencepiece --quiet

[K     |████████████████████████████████| 5.5 MB 3.8 MB/s 
[K     |████████████████████████████████| 182 kB 73.8 MB/s 
[K     |████████████████████████████████| 7.6 MB 62.5 MB/s 
[K     |████████████████████████████████| 451 kB 4.6 MB/s 
[K     |████████████████████████████████| 115 kB 90.1 MB/s 
[K     |████████████████████████████████| 212 kB 90.6 MB/s 
[K     |████████████████████████████████| 127 kB 89.5 MB/s 
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.3 MB 4.6 MB/s 
[?25h

In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split

import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

#let's make longer output readable without scrolling
from pprint import pprint

# the toxic parallel dataset, with rouge metric
from datasets import load_dataset, load_from_disk, load_metric, DatasetDict

In [None]:
# Load the Colab Drive helper and mount Google Drive at /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder that receives the prediction CSV written at the end of the notebook.
# The path is relative, so it only lands inside the Drive mount
# (/content/drive) while the working directory is /content.
csv_path = 'drive/MyDrive/Colab Notebooks/w266_project_predictions/'

#### Change these variables as needed for a different model or a different output file name

In [None]:
# Name of the CSV file written under `csv_path`; change it when running a
# different model so earlier results are not overwritten.
output_file_name = 'davidson_paradetox_output.csv'

## Load the BART-base pretrained + ParaDetox fine-tuned model

In [None]:
# Load the detox fine-tuned BART weights; the tokenizer is taken from the
# base checkpoint named below.
from transformers import BartForConditionalGeneration, AutoTokenizer
model_name = 'SkolkovoInstitute/bart-base-detox'
base_model_name = 'facebook/bart-base'
model = BartForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

In [None]:
# Smoke test: run one toxic sentence through the model and print the rewrite.
try_phrase = 'brah im fucked up over here .'
encoded_phrase = tokenizer([try_phrase], return_tensors="pt")
input_tokenized = encoded_phrase.input_ids
summary_ids = model.generate(
    input_tokenized,
    num_beams=2,
    min_length=0,
    max_length=65,
)

prediction = tokenizer.batch_decode(
    summary_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)

print(prediction)

['brah im messed up over here .']


## Load Davidson Dataset

In [None]:
# Labeled tweets from the t-davidson hate-speech-and-offensive-language repo,
# read straight from GitHub; the first CSV column becomes the index.
url = 'https://raw.githubusercontent.com/t-davidson/hate-speech-and-offensive-language/master/data/labeled_data.csv'
dataset = pd.read_csv(url, index_col=0)
# Work on a copy: the cleaning steps further down assign into df['tweet'],
# and a plain alias would silently rewrite the raw `dataset` as well.
df = dataset.copy()

In [None]:
# Patterns used by the tweet clean-up; raw strings keep `\d` a regex escape.
# The optional ";" removes the whole numeric entity (e.g. "&#128514;") instead
# of leaving a stray semicolon behind.
_HTML_NUMERIC_ENTITY = re.compile(r"&#\d+;?")
# A handle is "@" plus everything up to the next space; "(?: |$)" also catches
# a handle that ends the tweet, which has no trailing space.
_HANDLE = re.compile(r"@[^ ]+(?: |$)")


def _clean_tweet(text):
  """Return `text` without its retweet prefix, numeric HTML entities and @handles.

  Steps, applied in this order:
    1. If the tweet contains ": ", drop everything up to and including the
       first occurrence (this removes prefixes such as "!!! RT @user: ").
    2. Remove numeric character references such as "&#128514;".
    3. Remove "@handle" tokens, whether followed by a space or ending the tweet.
  """
  # NOTE(review): step 1 also truncates tweets that are not retweets but
  # happen to contain ": " -- behaviour kept from the original clean-up.
  _, separator, remainder = text.partition(": ")
  if separator:
    text = remainder
  text = _HTML_NUMERIC_ENTITY.sub("", text)
  return _HANDLE.sub("", text)


df['tweet'] = df['tweet'].apply(_clean_tweet)
df['tweet']

0        As a woman you shouldn't complain about cleani...
1        boy dats cold...tyga dwn bad for cuffin dat ho...
2        You ever fuck a bitch and she start to cry? Yo...
3                                   she look like a tranny
4        The shit you hear about me might be true or it...
                               ...                        
25291    right! His TL is trash ;. Now, mine? Bible scr...
25292    you've gone and broke the wrong heart baby, an...
25294    young buck wanna eat!!.. dat nigguh like I ain...
25295                youu got wild bitches tellin you lies
25296    ~~Ruffled | Ntac Eileen Dahlia - Beautiful col...
Name: tweet, Length: 24783, dtype: object

In [None]:
# Two-stage split, stratified on the 'class' column at both stages.
# Stage 1: keep 80% of the rows for training and hold out the remaining 20%.
training_tweets, holdout_tweets = train_test_split(
    df,
    test_size=0.2,
    random_state=25,
    stratify=df['class'],
)
# Stage 2: cut the hold-out in half to obtain the validation and test sets.
valid_tweets, testing_tweets = train_test_split(
    holdout_tweets,
    test_size=0.5,
    random_state=25,
    stratify=holdout_tweets['class'],
)

In [None]:
# Report the row count of each split, one line per split.
print(
    f"No. of training examples: {training_tweets.shape[0]}",
    f"No. of validation examples: {valid_tweets.shape[0]}",
    f"No. of testing examples: {testing_tweets.shape[0]}",
    sep="\n",
)

No. of training examples: 19826
No. of validation examples: 2478
No. of testing examples: 2479


## Run model.generate() and save the predictions to a CSV file

In [None]:
import time

In [None]:
# Reduce each split from a full frame to just its 'tweet' text column.
training_tweets, valid_tweets, testing_tweets = (
    split_frame['tweet']
    for split_frame in (training_tweets, valid_tweets, testing_tweets)
)

In [None]:
# Display the test-split tweets for a quick visual check
testing_tweets

601      "Why would you wanna be the Green Ranger? He's...
2353     #HolySpirit God still share HIS #Secrets Amos ...
24847                                       pancakes trash
21958    The KFAN mock draft continues, Cleveland is "o...
10327    I be telling Mcgirt music ain't enough.You got...
                               ...                        
21327     Slack jawed yokel husband http://t.co/VE1PWFrz9t
8898     Dating you would be like Darnell dating that f...
3744     Did you say spray tan? **Charlie Crist switche...
24157    bitches be like " I'm a squirter" but thinkin ...
24013           Zack still questions my love for Oreos lol
Name: tweet, Length: 2479, dtype: object

In [None]:
# Generate a rewrite for every tweet in the chosen split, working in
# mini-batches, and collect source tweets and rewrites in matching order.
inputs = []
predictions = []
curr_df = testing_tweets
length = len(curr_df)
batch_size = 10

# Iterate over the start offset of each batch. range(0, length, batch_size)
# also yields the final, shorter batch, so the trailing tweets are no longer
# skipped when `length` is not a multiple of `batch_size`.
for list_start in range(0, length, batch_size):
  start_time = time.time()
  list_end = min(list_start + batch_size, length)
  # .iloc makes the positional slice explicit; the Series still carries the
  # integer labels of the source frame, not 0..length-1.
  batch_texts = list(curr_df.iloc[list_start:list_end])

  encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)
  input_tokenized = encoded.input_ids
  # Hand the attention mask to generate() so padded positions are ignored
  # explicitly rather than inferred from the pad token id.
  summary_ids = model.generate(
      input_tokenized,
      attention_mask=encoded.attention_mask,
      num_beams=2,
      min_length=0,
      max_length=65,
  )

  prediction = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

  predictions.extend(prediction)
  inputs.extend(batch_texts)

  end_time = time.time()
  print('complete', list_start, '/', length, ': ', end_time - start_time, 'in this batch.')

complete 0 / 2479 :  230.86832809448242 in this batch.
complete 200 / 2479 :  279.84443736076355 in this batch.
complete 400 / 2479 :  278.94853472709656 in this batch.
complete 600 / 2479 :  257.8484380245209 in this batch.
complete 800 / 2479 :  272.0145535469055 in this batch.
complete 1000 / 2479 :  313.78338980674744 in this batch.
complete 1200 / 2479 :  293.7940146923065 in this batch.
complete 1400 / 2479 :  293.3756878376007 in this batch.
complete 1600 / 2479 :  279.6716446876526 in this batch.


In [None]:
# Show only the first few rewrites; dumping the whole list floods the output
predictions[:10]

In [None]:
# Show only the first few source tweets; dumping the whole list floods the output
inputs[:10]

In [None]:
# Pair every source tweet with its generated rewrite, one row per tweet.
# The mapping is not named `dict`: that would shadow the builtin for the rest
# of the kernel session.
# NOTE(review): the 'train_' column prefixes are kept unchanged so existing
# CSV consumers keep working, even though the rows come from whichever split
# the generation cell was pointed at.
output_columns = {'train_inputs': inputs, 'train_predictions': predictions}

df = pd.DataFrame(output_columns)

In [None]:
# Display the inputs/predictions frame before it is written to disk
df

In [None]:
# Save the output dataframe to a CSV file. The target folder is created first
# so the write cannot fail on a missing directory after the long generation
# run; os.path.join builds the path whether or not `csv_path` ends in "/".
os.makedirs(csv_path, exist_ok=True)
df.to_csv(os.path.join(csv_path, output_file_name), index=False)