# Thesis 2020-2021: roBERTa 

In this notebook, we will create a roBERTa model.

In [1]:
import pandas as pd
import numpy as np
import math

import matplotlib
import matplotlib.pyplot as plt

In [2]:
import re
from pattern.text.en import singularize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Create a function to clean the tweets
def cleanTxt(text):
    text = text.lower() # Convert everything to lower case
    text = re.sub(r'@[a-zA-Z0-9]+', '', text) # Remove @mentions
    text = re.sub(r'rt[\s]+', '', text) # Remove RT (retweet symbol)
    text = re.sub(r'&amp;', 'and', text) # Replace '&amp;' by 'and'
    text = re.sub(r'https?:\/\/\S+', '', text) # Remove hyper link  
    #text = re.sub(r'\d+', '0', text) # Replace all numbers by a zero
    text = " ".join([singularize(word) for word in tokenizer.tokenize(text) if word not in stop_words]) # Remove stopwords
    #text = " ".join([singularize(word) for word in text])
    text = re.sub(r'[^\w\s#]', ' ', text) # Remove all non-alphanumeric symbols (excluding whitespace and # characters)
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces by a single whitespace
    text = text.strip() # Remove whitespaces at the beginning and at the end
    
    return text

In [32]:
import csv
    
df_train = pd.read_csv('data/hateval2019_en_train.csv')
df_dev = pd.read_csv('data/hateval2019_en_dev.csv')

df_train_dev = df_train.append(df_dev, ignore_index=True)
df_train_dev = df_train_dev.drop(['TR', 'AG'], axis=1)

df_test = pd.read_csv('data/hateval2019_en_test.csv')
df_test = df_test.drop(['TR', 'AG'], axis=1)

# Clean the data

df_train_dev['text_cleaned'] = df_train_dev['text'].apply(cleanTxt)
df_test['text_cleaned'] = df_test['text'].apply(cleanTxt)
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,hur ray Ä saving Ä u Ä Ä in Ä so Ä many Ä way Ä Ä Ä #...
1,202,Why would young fighting age men be the vast m...,1,Ä would Ä young Ä fighting Ä age Ä man Ä be Ä the Ä va...
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,Ä illeg al Ä dump Ä their Ä kid Ä at Ä the Ä border Ä ...
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny Ä time Ä s n early Ä all Ä white s Ä state Ä pos...
4,205,Orban in Brussels: European leaders are ignori...,0,ban Ä in Ä br u sel Ä euro pe Ä leader Ä are Ä ignor...
...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,Ä you Ä unf ollow ed Ä me Ä fuck Ä you Ä pussy
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,Ä st fu Ä bitch Ä and Ä you Ä go Ä make Ä some Ä sat a...
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,Ä honey Ä a Ä a Ä fellow Ä white Ä chick Ä let Ä me Ä t...
9998,19199,I hate bitches who talk about niggaz with kids...,1,Ä hate Ä bit ches Ä who Ä talk Ä about Ä n igg az Ä w...


In [5]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

#text = "Good night ðŸ˜Š"
text = "Hurray, saving us $$$ in so many ways @potus @realDonaldTrump #LockThemUp #BuildTheWall #EndDACA #BoycottNFL #BoycottNike"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=588.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=898822.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=456318.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=150.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=498676425.0), HTML(value='')))


1) not-hate 0.9168
2) hate 0.0832


In [22]:
text = "This is outrageous! #StopIllegalImmigration  #MeritImmigration"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

1) hate 0.7246
2) not-hate 0.2754


In [26]:
def get_label(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    if scores[0] >= 0.5:
        return 0
    else:
        return 1
    

In [25]:
scores[0]

0.27537116

In [27]:
df_test

Unnamed: 0,id,text,HS,text_cleaned
0,34243,"@local1025 @njdotcom @GovMurphy Oh, I could ha...",0,oh could gone taxe since current news nj guv w...
1,30593,Several of the wild fires in #california and #...,0,several wild fire #californium #colorado inten...
2,31427,@JudicialWatch My question is how do you reset...,0,question resettle refugee refugee go home coun...
3,31694,"#Europe, you've got a problem! We must hurry...",1,#europe got problem must hurry #buildthewall b...
4,31865,This is outrageous! #StopIllegalImmigration #...,1,outrageou #stopillegalimmigration #meritimmigr...
...,...,...,...,...
2995,31368,you can never take a L off a real bitchðŸ˜© im ho...,1,never take l real bitch im hotter ho chill w
2996,30104,@Brian_202 likes to call me a cunt & a bitch b...,1,_202 like call cunt bitch tell truth can t handle
2997,31912,@kusha1a @Camio_the_wise @shoe0nhead 1. Never ...,1,_the_wise 1 never said taught 2 called bitch f...
2998,31000,If i see and know you a hoe why would i hit yo...,1,see know hoe would hit back lol bitch got new ...


In [29]:
df_test_roberta

Unnamed: 0,id,text,HS,text_cleaned
0,34243,"@local1025 @njdotcom @GovMurphy Oh, I could ha...",0,oh could gone taxe since current news nj guv w...
1,30593,Several of the wild fires in #california and #...,1,several wild fire #californium #colorado inten...
2,31427,@JudicialWatch My question is how do you reset...,1,question resettle refugee refugee go home coun...
3,31694,"#Europe, you've got a problem! We must hurry...",1,#europe got problem must hurry #buildthewall b...
4,31865,This is outrageous! #StopIllegalImmigration #...,1,outrageou #stopillegalimmigration #meritimmigr...
...,...,...,...,...
2995,31368,you can never take a L off a real bitchðŸ˜© im ho...,1,never take l real bitch im hotter ho chill w
2996,30104,@Brian_202 likes to call me a cunt & a bitch b...,0,_202 like call cunt bitch tell truth can t handle
2997,31912,@kusha1a @Camio_the_wise @shoe0nhead 1. Never ...,0,_the_wise 1 never said taught 2 called bitch f...
2998,31000,If i see and know you a hoe why would i hit yo...,1,see know hoe would hit back lol bitch got new ...


In [28]:
import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

df_test_roberta = df_test.copy()
#df_test_roberta['text'] = df_test_count_norm['text'].apply(cleanTxt)
#df_test_roberta['count_norm'] = df_test_count_norm['text'].apply(count_norm)
df_test_roberta['HS'] = df_test_roberta['text'].apply(get_label)

# Create prediction file for the pretrained_roberta
df_test_roberta[['id', 'HS']].to_csv('predictions/pretrained_roberta.tsv', sep='\t', index=False, header=False)
df_test_roberta[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the pretrained_roberta
evaluate.write_eval("scores_pretrained_roberta")

NameError: name 'evaluate' is not defined

In [30]:
import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

# Create prediction file for the pretrained_roberta
df_test_roberta[['id', 'HS']].to_csv('predictions/pretrained_roberta.tsv', sep='\t', index=False, header=False)
df_test_roberta[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the pretrained_roberta
evaluate.write_eval("scores_pretrained_roberta")

importing Jupyter notebook from evaluate.ipynb
taskA_fscore: 0.5565265328623641
taskA_precision: 0.6947238565192636
taskA_recall: 0.6286945812807881
taskA_accuracy: 0.5776666666666667


In [31]:
import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

df_test_roberta_cleaned = df_test.copy()
df_test_roberta_cleaned['HS'] = df_test_roberta_cleaned['text_cleaned'].apply(get_label)

# Create prediction file for the pretrained_roberta_cleaned
df_test_roberta_cleaned[['id', 'HS']].to_csv('predictions/pretrained_roberta_cleaned.tsv', sep='\t', index=False, header=False)
df_test_roberta_cleaned[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the pretrained_roberta_cleaned
evaluate.write_eval("scores_pretrained_roberta_cleaned")

taskA_fscore: 0.608939376163911
taskA_precision: 0.6660755293928309
taskA_recall: 0.6468527640941434
taskA_accuracy: 0.6133333333333333
