In [1]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, pipeline
from collections import Counter
from tqdm import tqdm
import pandas as pd
import torch
import re

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Use GPU if available

1. Sentiment Analysis (Positive, Negative, Neutral) :

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [None]:
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_length=512,
    truncation=True
)

In [3]:
data = pd.read_csv('../data/results/data.csv')

# drop nan in body
data = data.dropna(subset=['body'])

# cleaning
data['body2'] = data['body'].str.replace('\n', ' ')
data['body2'] = data['body2'].str.strip()
data['body2'] = data['body2'].apply(lambda x: re.sub(' +', ' ', x))
data['body2'] = data['body2'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
data['body2'] = data['body2'].apply(lambda x: re.sub(r'http\S+', '', x))

In [None]:
tqdm.pandas()
data['sentiment'] = data['body2'].progress_apply(sentiment_pipeline)

In [None]:
data['sentiment_score'] = data['sentiment'].apply(lambda x:x[0]['score'])
data['sentiment'] = data['sentiment'].apply(lambda x:x[0]['label'])

In [None]:
data.to_csv('../data/results/data_sentiment.csv')

In [17]:
Counter(list(data['sentiment']))

Counter({'neutral': 647612, 'negative': 429790, 'positive': 215097})

2. Emotion Analysis (Anger, Fear, Joy, Sadness, Surprise, Disgust, Neutral) :

In [4]:
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=False, device=device, truncation=True, verbose=True)



In [5]:
tqdm.pandas()
# _lis = data[data['Topic']==2].body2.progress_apply(classifier)
data['emotion'] = data.body2.progress_apply(classifier)

  0%|          | 2/1276424 [00:00<45:58:16,  7.71it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 1276424/1276424 [4:27:11<00:00, 79.62it/s]  


In [6]:
data['emotion_score'] = data['emotion'].apply(lambda x:x[0]['score'])
data['emotion'] = data['emotion'].apply(lambda x:x[0]['label'])

In [13]:
Counter(list(data['emotion']))

Counter({'neutral': 661310,
         'joy': 134459,
         'anger': 121800,
         'surprise': 107678,
         'sadness': 101110,
         'fear': 96582,
         'disgust': 53485})

In [14]:
data.to_csv('../data/results/data_emotions.csv')