# Master Thesis Script
## Sentiment classification of tweets using RoBERTa

First step: use the Twitter-RoBERTa sentiment model to classify each tweet as neutral, positive, or negative.

In [None]:
# installations
#!pip install transformers
#!pip install tensorflow
#!pip install torch

In [None]:
import pandas as pd
from transformers import pipeline
import torch
import regex as re
import numpy as np

In [None]:
# Load the merged tweet export into a pandas DataFrame.
# The context manager closes the file handle even if parsing raises,
# which the previous manual open()/close() pair did not guarantee.
with open(r'/kaggle/input/twitter-data2/Merged_Data_20230501_2.json') as f:
    # pd.read_json returns a DataFrame here (not a dictionary)
    data = pd.read_json(f)

In [None]:
# Number of rows (tweets) in the full data set
print(data.shape[0])

In [None]:
# Keep only original tweets: rows whose retweet reference is the literal
# string "None" (presumably how the export encodes a missing reference;
# verify against the raw data).
data_OG = data.loc[data['referenced_tweets.retweeted.id'].eq("None")]

In [None]:
# Number of original (non-retweet) tweets left after filtering
print(data_OG.shape[0])

If we only look at original tweets, we have 542,104 observations. We only classify tweet text for these.
Next, we classify them with the Twitter-RoBERTa sentiment model.
The following model is used: https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with '@'
    and is longer than one character becomes '@user', and every token that
    starts with 'http' becomes 'http'. All other tokens, including empty
    ones produced by repeated spaces, are kept unchanged.

    Args:
        text: The tweet text as a string.

    Returns:
        The tweet with mentions and links replaced by placeholders, joined
        back together with single spaces.
    """
    def _mask(token):
        # A lone '@' is not a mention, hence the length check
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
# Hugging Face Hub id of the sentiment checkpoint used in this notebook
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Model configuration (provides the id2label mapping) and matching tokenizer
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
# PyTorch version of the classification model
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Select the compute device: use the GPU when CUDA is available, else the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

In [None]:
# Move the model onto the selected device; the tokenized inputs are sent to the same device before each forward pass
model.to(device)

In [None]:
# Plain Python list holding the text of every original tweet
tweet_text = data_OG["text"].tolist()

In [None]:
# Strip emojis (and every other non-ASCII character) from a string
def deEmojify(inputString):
    """Return ``inputString`` with all non-ASCII characters removed.

    Emojis are dropped, but so is any other character outside the ASCII
    range (for example accented letters), because the text is encoded to
    ASCII with unencodable characters ignored.
    """
    ascii_bytes = inputString.encode('ascii', errors='ignore')
    return ascii_bytes.decode('ascii')

In [None]:
# Clean every tweet before classification:
#   1. decode the HTML entity "&amp;" back to "&",
#   2. drop all non-ASCII characters (emojis etc.) via deEmojify,
#   3. replace the literal two-character sequence backslash + "n" with a space
#      (real newline characters are left untouched),
#   4. remove any remaining backslashes.
# Links and @mentions are deliberately kept here; preprocess() masks them.
tweet_text_lst_clean = [
    deEmojify(re.sub("&amp;", "&", item)).replace('\\n', ' ').replace('\\', '')
    for item in tweet_text
]

In [None]:
def preprocess_lst(text_lst, max_length=512):
    """Classify the sentiment of every text in ``text_lst``.

    Each text is passed through ``preprocess`` (mention/link masking),
    tokenized, and run through the module-level ``model`` on ``device``.
    The logits are turned into probabilities with a softmax.

    Args:
        text_lst: Iterable of tweet texts (strings).
        max_length: Maximum number of tokens per text. Longer inputs are
            truncated instead of raising inside the model. The default of
            512 is assumed to match the position limit of the RoBERTa-base
            checkpoint used here (confirm if the model is swapped).

    Returns:
        A list with one dict per input text. Each dict maps a label name
        from ``config.id2label`` to its softmax score, with keys inserted
        from the highest to the lowest score.
    """
    sentiment_results = []

    # Inference only: disabling autograd avoids building a computation
    # graph for every tweet, which saves memory and time.
    with torch.no_grad():
        for text in text_lst:
            text = preprocess(text)
            encoded_input = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
            )
            encoded_input = encoded_input.to(device)

            logits = model(**encoded_input).logits
            # One text per forward pass, so row 0 holds its logits
            scores = softmax(logits[0].cpu().numpy())

            # Label ids ordered from most to least probable
            ranking = np.argsort(scores)[::-1]
            result_dict = {
                config.id2label[int(label_id)]: scores[label_id]
                for label_id in ranking
            }
            sentiment_results.append(result_dict)

    return sentiment_results

In [None]:
# Run the classifier over all cleaned tweets; the model is called once per tweet, so this is the expensive step
tweet_sentiment_results = preprocess_lst(tweet_text_lst_clean)

In [None]:
# One row per classified tweet, one column per sentiment label.
# NOTE(review): rows appear to line up with data_OG by position only, not by index label - confirm before joining back.
sentiment_bert_df = pd.DataFrame(tweet_sentiment_results)

In [None]:
# Persist the scores for all three sentiments so the classification does not have to be re-run
sentiment_bert_df.to_csv("Sentiment_BERT_results_complete.csv")

For now, we add the score-ordered dict of all three sentiments, together with their scores, to the dataset.
This is because a quick look at the results shows that the highest-scoring sentiment is not always the most fitting one.
We may therefore need some combination of, for instance, the two highest scores, so we do not discard any lower-scoring sentiments for now.