In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import re
import pandas as pd

In [None]:
# Preprocess text (username and link placeholders)
def preprocess(text):
    """
    Replace user mentions and links in ``text`` with fixed placeholders.

    :param text: string to normalise
    :returns: new string in which every ``@`` followed by one or more
              non-whitespace characters is replaced by ``@user``, and every
              whitespace-delimited token starting with ``http`` is replaced
              by the single token ``http``
    """
    # Mentions: '@' plus the run of non-whitespace characters after it.
    new_text = re.sub(r'@\S+', '@user', text)
    # Links: any token that begins with 'http' (start of string or preceded
    # by whitespace) collapses to the bare placeholder 'http'.
    new_text = re.sub(r'(?<!\S)http\S*', 'http', new_text)
    return new_text
# Checkpoint identifier handed to both from_pretrained() calls below, so the
# tokenizer and the classifier always come from the same checkpoint.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# Module-level tokenizer; sentiment_score() reads this global.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Module-level sequence-classification model; sentiment_score() reads this global.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# Disabled: would write the loaded weights to a local directory named after MODEL.
#model.save_pretrained(MODEL)


def sentiment_score(text, max_length=512):
    """
    Score the sentiment of a piece of text with the module-level RoBERTa model.

    The input is converted with ``str``, passed through ``preprocess``,
    tokenized, and fed to ``model``; the first row of logits is converted to
    probabilities with softmax.

    :param text: tweet text (any object; it is converted with ``str``)
    :param max_length: maximum number of tokens sent to the model. Longer
                       inputs are truncated instead of raising inside the
                       model and being reported as 'NI'.
    :returns: float in [-1, 1] equal to probability[2] - probability[0]
              (positive minus negative), or the string 'NI' if scoring
              raised an exception. Callers must check for 'NI' before
              comparing the result with numbers.
    """
    score_sum = 'NI'  # sentinel returned when anything below fails
    try:
        # Sentiment detection
        text_processed = preprocess(str(text))
        encoded_input = tokenizer(
            text_processed,
            return_tensors='pt',
            truncation=True,
            max_length=max_length,
        )
        output = model(**encoded_input)
        # output[0] holds the logits; [0] selects the single input's row.
        scores = softmax(output[0][0].detach().numpy())
        # Assumes the checkpoint's label order is 0=negative, 1=neutral,
        # 2=positive -- TODO confirm against model.config.id2label.
        score_sum = scores[2] - scores[0]  # score = positive - negative
    except Exception as e:
        # Best-effort: report the problem and fall through to the sentinel.
        print(e)

    return score_sum

In [None]:
# Validation data, downloaded from:
# https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
# Column names are supplied here, so every line of the file is read as data.
valdata = pd.read_csv(
    r'validation_data.csv',
    header=None,
    names=['irrelevant', 'also irrelevant', 'sentiment', 'text'],
)

In [None]:
# Evaluation tallies, all starting at zero:
#   correct - rows whose thresholded score matches the labelled sentiment
#   total   - rows evaluated
#   count   - incremented once per row alongside total
correct = total = count = 0

In [None]:
# Scores below -SCORE_THRESHOLD are read as 'Negative', above +SCORE_THRESHOLD
# as 'Positive', and anything in between (boundaries included) as 'Neutral'.
SCORE_THRESHOLD = 0.3

# Reset the tallies in this cell so that re-running it does not accumulate on
# top of the numbers left behind by a previous run.
correct = 0
total = 0
count = 0

# NOTE(review): any row whose label is not 'Negative', 'Positive' or 'Neutral'
# can never be counted as correct but still increases `total` -- confirm that
# this is the intended accuracy definition.
for text, actual in zip(valdata['text'], valdata['sentiment']):
    total += 1
    count += 1

    score = sentiment_score(text)
    # sentiment_score returns the string 'NI' when scoring fails; comparing a
    # str with a float raises TypeError, so treat such rows as misses.
    if isinstance(score, str):
        continue

    if score < -SCORE_THRESHOLD:
        predicted = 'Negative'
    elif score > SCORE_THRESHOLD:
        predicted = 'Positive'
    elif -SCORE_THRESHOLD <= score <= SCORE_THRESHOLD:
        # Inclusive bounds: a score of exactly +/-SCORE_THRESHOLD is neutral
        # instead of falling through every branch.
        predicted = 'Neutral'
    else:
        # Only reachable for values that fail every comparison (e.g. NaN).
        predicted = None

    if predicted == actual:
        correct += 1

print(f'Correct:{correct}')
print(f'Total:{total}')
if total:
    print(f'Accuracy:{(correct/total)*100}%')
else:
    # Empty validation frame: avoid ZeroDivisionError.
    print('Accuracy:n/a (no rows evaluated)')