In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# TweetEval task name; it is interpolated into the label-mapping URL in a later cell.
task='emotion'
# Hugging Face checkpoint identifier; the same id is used to load the tokenizer here
# and the classification model in a later cell.
MODEL = "cardiffnlp/twitter-roberta-base-emotion"

# Tokenizer for the checkpoint above (downloaded or read from the local cache).
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
def preprocess(text):
    """Mask user mentions and links in a tweet.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every token that starts
    with 'http' becomes 'http'. All other tokens (including empty ones that
    result from repeated spaces) are kept unchanged, and the tokens are
    joined back with single spaces.
    """
    def _mask(token):
        # A bare '@' is not treated as a mention.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
# Fetch the tab-separated label mapping for the configured task and keep
# the second column of every row that has at least two fields.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

labels = []
for mapping_row in csv.reader(mapping_lines, delimiter='\t'):
    # Skips blank lines (e.g. the empty string after a trailing newline).
    if len(mapping_row) > 1:
        labels.append(mapping_row[1])

In [None]:
import pandas as pd
# Load the two input tables from parquet. The paths are relative to the
# notebook's working directory.
# NOTE(review): judging by the file names these hold user comments and news
# tweets respectively - confirm against the data documentation.
df_tweet = pd.read_parquet('../../../covid_media_coverage/data/final/comments.parquet')
# The 'content' column of this frame is what the scoring loop further down iterates over.
df_news_tweets = pd.read_parquet('../../../covid_media_coverage/data/final/news_tweets.parquet')

In [None]:
# Display (rows, columns) of both tables as a quick check on the loaded data.
df_tweet.shape, df_news_tweets.shape

In [None]:
import torch
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load the sequence-classification checkpoint and move its weights to the chosen device.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

In [None]:
def get_emotion_scores(text, model, max_length=512):
    """Return softmax-normalised emotion scores for a single tweet.

    The text is normalised with `preprocess`, tokenised with the module-level
    `tokenizer`, moved to `device`, and passed through `model`.

    Parameters
    ----------
    text : str
        Raw tweet text.
    model : callable
        Sequence-classification model; expected to live on `device` already.
    max_length : int, optional
        Upper bound on the number of tokens fed to the model. Longer inputs
        are truncated instead of crashing the forward pass.
        NOTE(review): 512 is presumably the position limit of this
        RoBERTa-base checkpoint - confirm against the model config.

    Returns
    -------
    numpy.ndarray
        1-D array of scores that sums to 1, in the model's output order.
    """
    text = preprocess(text)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        # Explicit max_length: truncation=True alone is a no-op when the
        # tokenizer config does not declare a model maximum length.
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Inference only: skip building the autograd graph to save memory and time.
    with torch.no_grad():
        output = model(**encoded_input)

    # First output is the logits tensor; take the single row of this batch of one.
    scores = output[0][0].detach().cpu().numpy()
    return softmax(scores)

In [None]:
#computed_em_scores = pd.read_csv('results/tweet_emotion_scores.csv')
#l = computed_em_scores.shape[0]

In [None]:
def create_dataframe(df, em_scores):
    """Return a copy of `df` with one column per emotion score.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows the scores belong to. It is not modified; a new frame is
        returned, so passing a slice of a larger frame is safe and does not
        trigger SettingWithCopyWarning.
    em_scores : sequence
        One indexable item per row of `df`; positions 0-3 are stored in the
        'anger', 'joy', 'optimism' and 'sadness' columns respectively.

    Returns
    -------
    pandas.DataFrame
        `df` plus the four emotion columns (existing columns of the same
        name are overwritten).

    Raises
    ------
    ValueError
        If the number of score items differs from the number of rows.
    """
    emotion_columns = ('anger', 'joy', 'optimism', 'sadness')

    if len(em_scores) != len(df):
        raise ValueError(
            "Number of score rows (" + str(len(em_scores)) + ") does not match "
            "number of dataframe rows (" + str(len(df)) + ")"
        )

    # assign() works on a copy, leaving the caller's frame untouched.
    return df.assign(**{
        column: [row_scores[position] for row_scores in em_scores]
        for position, column in enumerate(emotion_columns)
    })

In [None]:
# Score every tweet in df_news_tweets['content'] and write the results to
# parquet in chunks of `x` rows, so a crash loses at most one chunk.
import os
import sys
import re
import time

x = 50000  # rows per output parquet file
k = 0      # positional offset to start from (set > 0 to resume an interrupted run)

# Make sure the output directory exists before hours of scoring are spent.
os.makedirs('covid_data/em_scores', exist_ok=True)


def _save_chunk(start, stop, scores):
    """Attach `scores` to rows [start, stop) of df_news_tweets and write them to parquet."""
    # Copy the slice so adding columns never touches (or warns about) df_news_tweets.
    chunk = create_dataframe(df_news_tweets.iloc[start:stop].copy(), scores)
    # Files are named after the absolute end position, so a resumed run
    # (k > 0) does not overwrite chunks written earlier.
    chunk.to_parquet('covid_data/em_scores/em_scores_' + str(stop) + '.parquet')
    return chunk


em_scores = []
chunk_start = k  # absolute position of the first row whose score is in em_scores

start_time = time.time()
last_report = start_time
diff = 0

i = 0  # number of tweets processed in this run
for tweet in df_news_tweets['content'].iloc[k:]:

    # A full chunk has accumulated: persist it and start a new one.
    if len(em_scores) == x:
        df = _save_chunk(chunk_start, k + i, em_scores)
        chunk_start = k + i
        em_scores = []

    if i % 1000 == 0:
        # Seconds spent since the previous progress report.
        now = time.time()
        diff = round(now - last_report, 3)
        last_report = now
        print(i, "\t", diff, end="\r")

    # Checks for too many emojis in the text as the model fails over > ~150 emojis in a tweet.
    # Missing / non-string content is treated the same way: no scores for that row.
    if not isinstance(tweet, str) or len(re.findall(r'[^\w\s,]', tweet)) > 150:
        em_scr = np.array([None, None, None, None])
    else:
        em_scr = get_emotion_scores(tweet, model).round(3)

    em_scores.append(em_scr)
    i += 1

# Flush the remaining rows. The slice starts at chunk_start (not i - x), so its
# length always matches len(em_scores), including a final partial chunk.
if em_scores:
    df = _save_chunk(chunk_start, k + i, em_scores)

In [None]:
# df_news_tweet = pd.read_parquet('covid_data/news_tweets_with_em_scores.parquet')
# tmp_df = pd.concat((df_news_tweet, df), axis = 0)
# tmp_df.shape, df_news_tweet.shape
# tmp_df.to_parquet('covid_data/news_tweets_with_em_scores.parquet')