In [None]:
!pip install transformers sentencepiece

In [None]:
import os

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [None]:
# Load the tweet export into a DataFrame.
# NOTE(review): the path lives under /content/drive, so this presumably expects Google Drive
# to be mounted already — no mount call is visible in this notebook; confirm before Run All.
df = pd.read_csv('/content/drive/MyDrive/CNR/Santorini/tweet_santorini.csv')

In [None]:
# Show the first rows as a quick check that the CSV loaded as expected.
df.head()

In [None]:
# Lower-case every tweet so the later stop-word and "rt" prefix checks are case-insensitive.
df['text']=df['text'].str.lower()

In [None]:
# Drop rows with a missing 'text' value before the string-based filtering and cleaning below.
df = df.dropna(subset=['text'])

In [None]:
# Drop retweets. The text is already lower-cased, so a retweet presumably starts with the
# standalone token "rt" (e.g. "rt @user: ..."). Anchoring on a word boundary keeps ordinary
# tweets whose first word merely begins with "rt"; na=False treats missing text as
# "not a retweet" instead of producing a NaN mask.
df = df[~df['text'].str.match(r'rt\b', na=False)]

In [None]:
# English stop words removed from each tweet by cleaning_stopwords.
# Negations such as 'no' / 'not' are not in the list, so they are kept in the text.
# NOTE(review): contractions are listed without apostrophes ("shes", "youre", ...), but in this
# notebook stop-word removal runs before punctuation is stripped, so "she's" / "you're" will
# not match these entries — confirm the intended cell order.
stopwordlist = ['a', 'about', 'above', 'after', 'again', 'ain', 'all', 'am', 'an',
             'and','any','are', 'as', 'at', 'be', 'because', 'been', 'before',
             'being', 'below', 'between','both', 'by', 'can', 'd', 'did', 'do',
             'does', 'doing', 'down', 'during', 'each','few', 'for', 'from',
             'further', 'had', 'has', 'have', 'having', 'he', 'her', 'here',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'i', 'if', 'in',
             'into','is', 'it', 'its', 'itself', 'just', 'll', 'm', 'ma',
             'me', 'more', 'most','my', 'myself', 'now', 'o', 'of', 'on', 'once',
             'only', 'or', 'other', 'our', 'ours','ourselves', 'out', 'own', 're','s', 'same', 'she', "shes", 'should', "shouldve",'so', 'some', 'such',
             't', 'than', 'that', "thatll", 'the', 'their', 'theirs', 'them',
             'themselves', 'then', 'there', 'these', 'they', 'this', 'those',
             'through', 'to', 'too','under', 'until', 'up', 've', 'very', 'was',
             'we', 'were', 'what', 'when', 'where','which','while', 'who', 'whom',
             'why', 'will', 'with', 'won', 'y', 'you', "youd","youll", "youre",
             "youve", 'your', 'yours', 'yourself', 'yourselves']

In [None]:
# Set form of the stop-word list for constant-time membership tests.
STOPWORDS = set(stopwordlist)
def cleaning_stopwords(text):
    """Return `text` without its stop words.

    The input is coerced with str(), split on whitespace, and every token that
    is a member of STOPWORDS is discarded. The surviving tokens are re-joined
    with single spaces.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)
# Strip stop words from every tweet, then preview the first cleaned rows.
df['text'] = df['text'].apply(cleaning_stopwords)
df['text'].head()

In [None]:
import re

def cleaning_URLs(data):
    """Replace every URL in `data` with a single space.

    A URL is a run of non-whitespace characters that starts with "www." or with
    "http://" / "https://".

    Args:
        data: the text to clean (str).

    Returns:
        The text with each URL replaced by one space; surrounding whitespace is
        left untouched.
    """
    # \S+ stops the match at the first whitespace character. The previous pattern
    # used [^s]+ ("anything but the letter s"), which ran across spaces and deleted
    # ordinary words up to the next "s". The dot after "www" is escaped so that
    # words such as "awwww" are no longer treated as URLs.
    return re.sub(r'(www\.\S+)|(https?://\S+)', ' ', data)
# Blank out URLs in every tweet, then show the last rows as a spot check.
df['text'] = df['text'].apply(cleaning_URLs)
df['text'].tail()

In [None]:
import string
# Characters to delete: the ASCII punctuation set from the standard library.
english_punctuations = string.punctuation
punctuations_list = english_punctuations
def cleaning_punctuations(text):
    """Return `text` with every character listed in `punctuations_list` removed."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, punctuations_list))
    return text.translate(deletions)
# Remove punctuation from every tweet, then show the last rows as a spot check.
df['text'] = df['text'].apply(cleaning_punctuations)
df['text'].tail()

In [None]:
# Remove rows whose cleaned text is identical to an earlier row, so each distinct text is scored once.
df = df.drop_duplicates(subset=['text'])

In [None]:
# Plain Python list of the cleaned tweet texts; the inference loop iterates over it.
data = df.text.to_list()

In [None]:
# Checkpoint identifier handed to every from_pretrained() call below.
MODEL = "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Configuration and tokenizer that belong to the checkpoint.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PyTorch sequence-classification model (the TF variant is imported above but not used here).
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Number of texts left after cleaning and de-duplication, i.e. how many forward passes the loop will run.
len(data)

In [None]:
import time, sys
from IPython.display import clear_output

In [None]:
def update_progress(progress):
    """Redraw a 50-character text progress bar in the cell output.

    Args:
        progress: completed fraction. An int is converted to float, any other
            non-float value is treated as 0, and the result is clamped to the
            range 0..1.

    The previous cell output is cleared (deferred until the new output is
    ready) and a line of the form "Progress: [###---] 12.3%" is printed.
    """
    bar_length = 50

    # Normalise the argument to a number in [0, 1].
    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0
    if progress < 0:
        progress = 0
    elif progress >= 1:
        progress = 1

    filled = int(round(bar_length * progress))
    bar = "#" * filled + "-" * (bar_length - filled)
    percent = progress * 100

    clear_output(wait = True)
    print("Progress: [{0}] {1:.1f}%".format(bar, percent))

In [None]:
# Classify every cleaned text and keep only the id of the highest-scoring class.

# NOTE(review): 512 is assumed to be the input limit of this XLM-R checkpoint —
# confirm against config.max_position_embeddings.
MAX_TOKENS = 512

i = 0  # stays 0 when `data` is empty
final_scores = []

# Inference only: no_grad() stops autograd from recording a graph for each forward
# pass, which saves memory and time over the whole dataset.
with torch.no_grad():
    for i, text in enumerate(data, start=1):
        # Truncate so an unusually long text cannot exceed the model's input limit.
        encoded_input = tokenizer(text, return_tensors='pt',
                                  truncation=True, max_length=MAX_TOKENS)
        output = model(**encoded_input)

        # output[0] holds the logits; the second [0] selects the single sequence in the batch.
        logits = output[0][0].numpy()

        # softmax is monotonic, so the argmax of the raw logits is already the predicted class.
        scores = np.argmax(logits)
        final_scores.append(scores)

        update_progress(i / len(data))


In [None]:
import plotly as py
import plotly.graph_objs as go


In [None]:
def fig_pie(labels, values, title, topk=None):
    """Build a plotly-style figure dict for a donut/pie chart.

    Args:
        labels: slice names, parallel to `values`.
        values: numeric size of each slice.
        title: chart title placed in the layout.
        topk: if given, keep only the `topk` largest slices (sorted by value,
            descending) and merge all remaining slices into one "others"
            slice. If None, `labels` and `values` are used as passed.

    Returns:
        A dict with "data" (a single pie trace with a 0.2 hole) and "layout"
        keys, suitable for passing to plotly's `go.Figure`.
    """
    if topk is not None:
        # Rank slices from largest to smallest value.
        ranked = sorted(zip(labels, values), key=lambda pair: pair[1], reverse=True)
        top, rest = ranked[:topk], ranked[topk:]

        # Build the lists directly instead of unpacking zip(*top): unpacking fails
        # when `top` is empty (topk == 0 or no input slices).
        labels = [label for label, _ in top]
        values = [value for _, value in top]

        # Only add the aggregate slice when something was actually merged into it,
        # so a chart that already shows every slice gets no empty "others" entry.
        if rest:
            labels.append("others")
            values.append(sum(value for _, value in rest))

    return {
        "data": [
            {"values": values, "labels": labels, "hole": .2, "type": "pie"},
        ],
        "layout": {"title": title},
    }

In [None]:
# Count how many texts were assigned to each sentiment class.
# NOTE(review): the mapping 0 = negative, 1 = neutral, 2 = positive is taken from the
# comparisons below — confirm it against config.id2label for the loaded checkpoint.
len_positive = sum(1 for x in final_scores if x == 2)
len_neutral = sum(1 for x in final_scores if x == 1)
len_negative = sum(1 for x in final_scores if x == 0)

# values for the plot
labels = ['Positive', 'Neutral', 'Negative']
values = [len_positive, len_neutral, len_negative]

# plot
fig = fig_pie(labels, values, "Satisfaction/dissatisfaction level")
fig = go.Figure(fig)
fig.show()

# write_image does not create missing directories, so make sure the target exists.
# The file name is left unchanged so anything that already reads this path keeps working.
os.makedirs('output', exist_ok=True)
fig.write_image('output/satisfation_level.png')