In [1]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Task name; it is interpolated into the label-mapping URL used later in this notebook
task='emotion'
# Identifier of the pretrained checkpoint handed to the `from_pretrained` loaders
MODEL = "cardiffnlp/twitter-roberta-base-emotion"

# Tokenizer matching the checkpoint above (downloaded on first use)
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/779 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [2]:
def preprocess(text):
    """Mask user mentions and links in a tweet before tokenization.

    The text is split on single spaces. Every token longer than one
    character that starts with '@' becomes '@user', and every token that
    starts with 'http' becomes 'http'. All other tokens are kept verbatim,
    and the tokens are re-joined with single spaces, so the original spacing
    is preserved.

    Args:
        text: Raw tweet text.

    Returns:
        The text with mentions and URLs replaced by placeholders.
    """
    def _mask(token):
        # A lone '@' is not a mention, hence the length check
        if len(token) > 1 and token.startswith('@'):
            token = '@user'
        if token.startswith('http'):
            token = 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [3]:
# Download the label mapping for the selected task. Rows are tab-separated;
# the second field of each row is kept as the label name.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as response:
    mapping_lines = response.read().decode('utf-8').split("\n")

# Rows with fewer than two fields (e.g. a trailing empty line) are skipped
labels = [
    fields[1]
    for fields in csv.reader(mapping_lines, delimiter='\t')
    if len(fields) > 1
]

In [4]:
import pandas as pd
# Load the two preprocessed Sky News tables from parquet.
# NOTE(review): judging by the file names, df_tweet presumably holds user comments
# and df_news_tweets the outlet's own tweets — confirm against the preprocessing step.
df_tweet = pd.read_parquet('../data/processed/skynews/comments_skynews.parquet')
df_news_tweets = pd.read_parquet('../data/processed/skynews/news_tweets_skynews.parquet')

In [60]:
# Number of rows in df_tweet (first element of its shape tuple)
df_tweet_shape = df_tweet.shape[0]

In [5]:
# Show the (rows, columns) shape of both loaded tables
df_tweet.shape, df_news_tweets.shape

((1015519, 14), (29716, 14))

In [7]:
import torch
# Use the GPU when torch can see one, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Load the sequence-classification checkpoint, then move it onto the chosen device
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model = model.to(device)

cuda


Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [27]:
def get_emotion_scores(text, model, max_length=512):
    """Return the class probabilities the model assigns to one text.

    The text is normalised with `preprocess`, tokenized with the notebook's
    global `tokenizer`, moved to the global `device`, and run through `model`.
    The first row of logits is turned into probabilities with softmax.

    Args:
        text: Raw tweet text.
        model: Sequence-classification model already placed on `device`.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated instead of making the forward pass fail.
            NOTE(review): 512 is assumed to be the checkpoint's limit —
            confirm against the model configuration.

    Returns:
        A list with one probability per class, rounded to three decimals.
        NOTE(review): the CSV header used elsewhere in this notebook
        suggests the order is anger, joy, optimism, sadness — confirm
        against the downloaded label mapping.
    """
    text = preprocess(text)
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    ).to(device)

    # Inference only: no gradients are needed, so skip autograd bookkeeping
    with torch.no_grad():
        output = model(**encoded_input)

    # output[0] holds the logits; take the single sequence in the batch
    scores = output[0][0].detach().cpu().numpy()
    scores = softmax(scores)

    return list(scores.round(3))

In [None]:
#computed_em_scores = pd.read_csv('results/tweet_emotion_scores.csv')
#l = computed_em_scores.shape[0]

In [9]:
def create_dataframe(df, em_scores):
    """Attach the four emotion scores to `df` as new columns.

    Args:
        df: DataFrame with one row per entry of `em_scores`. It is modified
            in place: the columns 'anger', 'joy', 'optimism' and 'sadness'
            are added (or overwritten).
        em_scores: Sequence of per-row score sequences; positions 0..3 are
            written to 'anger', 'joy', 'optimism' and 'sadness' respectively.

    Returns:
        The same `df` object, with the four score columns set.
    """
    for position, column in enumerate(('anger', 'joy', 'optimism', 'sadness')):
        df[column] = [scores[position] for scores in em_scores]

    return df

In [73]:
import csv
import sys
import re
import time

def append_user_response_to_csv(tweet_list, fileName):
    """Append rows to a CSV file, creating the file if it does not exist.

    Args:
        tweet_list: Iterable of rows; each row is an iterable of cell values
            and is written as one CSV line.
        fileName: Path of the target CSV file (opened in append mode, UTF-8).
    """
    # The context manager closes the file even if writing a row raises
    with open(fileName, "a", newline="", encoding='utf-8') as csvFile:
        csv.writer(csvFile).writerows(tweet_list)

# Create the results file on first use and write the header exactly once.
# The file is opened in append mode, so the position reported right after
# opening equals the current file size. A non-zero position means the file
# already has content, and writing the header again would insert a stray
# header row in the middle of the data when this cell is re-run.
csvName = "./../data/processed/skynews/em_scores/user_replies_with_em_scores.csv"
with open(csvName, "a", newline="", encoding='utf-8') as csvFile:
    if csvFile.tell() == 0:
        csvWriter = csv.writer(csvFile)
        csvWriter.writerow(['tweetId', 'anger', 'joy', 'optimism', 'sadness'])

# Score every row of df_tweet and stream the results to the CSV in batches,
# so that an interrupted run keeps everything flushed so far.
start_time = time.time()
em_scores = []      # rows buffered since the last flush
errCount = 0        # rows for which scoring raised an exception
elapsed_time = 0    # seconds since start_time, as of the last flush
diff = 0            # seconds taken by the last batch; defined up front so the
                    # final print also works when no batch was ever flushed
x = 1000            # flush / progress-report interval, in rows
i = 0
total_rows = len(df_tweet)  # computed here so the ETA does not rely on another cell

# itertuples keeps column dtypes intact and avoids building a Series per row
for tweet in df_tweet.itertuples(index=False):

    try:
        # Checks for too many emojis in the text as the model fails over > ~150 emojis in a tweet
        if len(re.findall(r'[^\w\s,]', tweet.content)) > 150:
            em_scr = [tweet.tweetId, None, None, None, None]
        else:
            em_scr = [tweet.tweetId] + get_emotion_scores(tweet.content, model)
    except Exception:
        # Record a sentinel row instead of aborting the whole run.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        em_scr = [tweet.tweetId, -1., -1., -1., -1.]
        errCount += 1
    em_scores.append(em_scr)

    # Every x rows: flush the buffer and report progress
    if (i % x == 0) and (i > 0):
        append_user_response_to_csv(em_scores, csvName)
        now_elapsed = time.time() - start_time
        diff = now_elapsed - elapsed_time
        elapsed_time = now_elapsed
        # Columns: rows done, errors, seconds for this batch,
        # minutes elapsed, estimated minutes remaining
        print(i,
              "\t", errCount,
              "\t", round(diff, 1),
              "\t", round(elapsed_time / 60, 1),
              "\t", round(((total_rows - i) * (elapsed_time / i)) / 60, 1),
              end = "\r")
        em_scores = []
    i += 1

# Flush whatever is left after the last full batch
append_user_response_to_csv(em_scores, csvName)
print(i, "\t", diff, end = "\r")

1015519 	 55.11132836341858	 0.55

In [74]:
# Read the CSV at csvName back into a DataFrame for inspection
tmp = pd.read_csv(csvName)

In [77]:
# Display the frame loaded from the CSV.
# NOTE(review): tmp.head() would be enough for a sanity check and keeps the saved output small.
tmp

Unnamed: 0,tweetId,anger,joy,optimism,sadness
0,1383659647609884672,0.640,0.013,0.131,0.216
1,1383431071019143180,0.966,0.003,0.019,0.012
2,1383399358050181123,0.494,0.095,0.332,0.079
3,1383394492338696201,0.976,0.005,0.010,0.009
4,1383392021553893386,0.940,0.005,0.015,0.040
...,...,...,...,...,...
1015514,1223007718186856449,0.032,0.068,0.761,0.138
1015515,1223007683659419649,0.193,0.037,0.725,0.046
1015516,1223007637740183553,0.642,0.153,0.099,0.106
1015517,1223007627724148737,0.204,0.372,0.232,0.191


In [None]:
# df_news_tweet = pd.read_parquet('covid_data/news_tweets_with_em_scores.parquet')
# tmp_df = pd.concat((df_news_tweet, df), axis = 0)
# tmp_df.shape, df_news_tweet.shape
# tmp_df.to_parquet('covid_data/news_tweets_with_em_scores.parquet')