In [None]:
from transformers import AutoTokenizer, RobertaForSequenceClassification, pipeline
from tqdm import tqdm
import pandas as pd
import torch
import re

In [None]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Single source of truth for the checkpoint so the tokenizer and the model
# are always loaded from the same repository.
MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

model = RobertaForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


In [None]:
# Wrap the already-loaded model and tokenizer in a text-classification pipeline
# that runs on `device`.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    # tokenisation limits: cut inputs off at 512 tokens instead of failing
    truncation=True,
    max_length=512,
    # components and placement
    tokenizer=tokenizer,
    model=model,
    device=device,
)

In [None]:
# Load the input CSV and keep only the rows that actually have a 'body'
# (rows where it is NaN are dropped).
data = pd.read_csv('../data/results/data.csv').dropna(subset=['body'])

# cleaning
# Build 'body2', a normalised copy of 'body' that is fed to the sentiment model.
#
# Order matters: links are removed first (while the URL is still intact), then
# punctuation, and whitespace is normalised LAST. Both removals leave gaps
# behind, so collapsing/stripping spaces any earlier lets double and trailing
# spaces back into the text (e.g. "a - b" -> "a  b", "nice http://x" -> "nice ").
#
# NOTE(review): r'[^\w\s]' also strips emoji and '!' / '?', which can carry
# sentiment -- presumably intentional, but confirm this is wanted for the model.
data['body2'] = (
    data['body']
    .astype(str)                              # non-string cells would otherwise break the regex steps
    .str.replace(r'http\S+', '', regex=True)  # links
    .str.replace(r'[^\w\s]', '', regex=True)  # punctuation / symbols
    .str.replace(r'\s+', ' ', regex=True)     # newlines, tabs and runs of spaces -> one space
    .str.strip()
)

In [None]:
# Register tqdm's progress_* methods on pandas objects, then classify every
# cleaned body one row at a time with a progress bar.
tqdm.pandas()
sentiment_raw = data['body2'].progress_apply(sentiment_pipeline)

# Each cell holds whatever the pipeline returned for that single text.
data['sentiment'] = sentiment_raw

In [None]:
# Each 'sentiment' cell is indexed as a sequence whose first item is a dict
# with 'score' and 'label'; pull that first item out once and read both
# fields from it.
# NOTE: 'sentiment' is overwritten with the plain label below, so this cell
# cannot be re-run without re-running the inference cell first.
top_hit = data['sentiment'].apply(lambda hits: hits[0])
data['sentiment_score'] = top_hit.apply(lambda hit: hit['score'])
data['sentiment'] = top_hit.apply(lambda hit: hit['label'])

In [25]:
# Persist the annotated frame next to the input file.
# pandas writes the row index as the first (unnamed) column by default.
data.to_csv(path_or_buf='../data/results/data_sentiment.csv')