# Using Transformers to compute Sentiment Scores

In [None]:
import pandas as pd
import numpy as np
import os
import pickle

import transformers
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax

In [3]:
# Load every pickled frame found in the scrape output folder and stack them
# into one frame with a fresh 0..n-1 index.
FINAL_DIR = '../data_scrap/Final/'

# os.listdir() returns names in arbitrary order. Later cells cut this frame
# into chunks by row position, so the files are read in sorted order to keep
# the row order (and therefore the chunk contents) reproducible between runs.
# NOTE(review): read_pickle can execute arbitrary code; only load files this
# project produced itself.
Output_final = pd.concat(
    [pd.read_pickle(os.path.join(FINAL_DIR, name))
     for name in sorted(os.listdir(FINAL_DIR))],
    ignore_index=True,
)

In [5]:
def preprocess(text):
    """Mask user mentions and links in *text* with fixed placeholder tokens.

    The text is split on single spaces. Any token that starts with '@' and
    has at least one more character becomes '@user'; any token that starts
    with 'http' becomes 'http'. All other tokens, including a lone '@' and
    the empty tokens produced by repeated spaces, are kept unchanged. The
    tokens are joined back with single spaces.
    """
    def _mask(token):
        # Mention check comes first; '@user' never starts with 'http', so
        # returning early is equivalent to testing both conditions in turn.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [9]:
# Hub identifier of the checkpoint; the tokenizer, config and model below are
# all loaded from this same name so they stay matched to each other.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# The three loads are independent of one another.
config = AutoConfig.from_pretrained(MODEL)
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Example (kept commented out): score a single tweet and print the labels ranked by probability.

# text = "Figuring out MSCI 454 tweeting..."
# text = preprocess(text)
# encoded_input = tokenizer(text, return_tensors='pt')
# output = model(**encoded_input)
# scores = output[0][0].detach().numpy()
# scores = softmax(scores)
# ranking = np.argsort(scores)
# ranking = ranking[::-1]
# for i in range(scores.shape[0]):
#     l = config.id2label[ranking[i]]
#     s = scores[ranking[i]]
#     print(f"{i+1}) {l} {np.round(float(s), 4)}")

In [None]:
# Split the full frame into consecutive chunks of 500,000 rows so that each
# chunk can be scored and saved in a separate run.
#
# Every chunk goes through reset_index(drop=True), so each one has its own
# 0-based index and is a new frame rather than a slice of Output_final; that
# makes it safe to add a column to a chunk later on.
Output_1 = Output_final[0:500000].reset_index(drop=True)
Output_2 = Output_final[500000:1000000].reset_index(drop=True)
Output_3 = Output_final[1000000:1500000].reset_index(drop=True)
Output_4 = Output_final[1500000:2000000].reset_index(drop=True)
# The last chunk is open-ended so that no trailing rows are dropped if the
# frame does not have exactly the row count this split was written for.
Output_5 = Output_final[2000000:].reset_index(drop=True)

In [None]:
# Score one chunk of the corpus and pickle it.
#
# The frame that is read, the frame that receives the scores and the frame
# that is written to disk must be the same object, so all three steps go
# through `chunk`, and the output file name is derived from CHUNK_ID.
# NOTE(review): chunk 5 is selected here; to score another chunk change
# CHUNK_ID and `chunk` together.
CHUNK_ID = 5
chunk = Output_5

SAVE_FOLDER = 'bert'

# Scores are collected in an array and attached to the frame once at the end
# instead of growing the column one .loc assignment at a time. Rows that were
# not reached (e.g. after an interruption) stay NaN.
sentiment = np.full(chunk.shape[0], np.nan)

for i, text in enumerate(chunk['Content']):
    encoded_input = tokenizer(preprocess(text), return_tensors='pt')
    output = model(**encoded_input)
    # First element of the model output, first (only) sequence in the batch;
    # softmax turns those raw class scores into probabilities.
    scores = softmax(output[0][0].detach().numpy())

    # Collapse the three class probabilities into a single value in [-1, 1]
    # using weights -1, 0 and +1 for class indices 0, 1 and 2.
    # presumably indices 0/1/2 are negative/neutral/positive - confirm
    # against config.id2label
    sentiment[i] = scores[2] - scores[0]

    # Progress report: a short notice every 500 rows, preceded by a running
    # count on every 1000th row.
    if (i+1) % 500 == 0:
        if (i+1) % 1000 == 0:
            print("It is working....Up to {} scores computed.".format((i+1)))
        print('Wait for a while....')

chunk['Score'] = sentiment

save_dir = os.path.join('..', 'data_scrap', SAVE_FOLDER)
# Create the output folder if it is missing, so a finished scoring run does
# not fail at the save step.
os.makedirs(save_dir, exist_ok=True)

with open(os.path.join(save_dir, f'bert_{CHUNK_ID}.pickle'), 'wb') as f:
    pickle.dump(chunk, f, pickle.HIGHEST_PROTOCOL)