# Sentiment/Emotion Analysis Inference for Interpretable Misinformation Detection

## Setup

In [None]:
# Install the Hugging Face libraries imported in the cells below; -q keeps pip's output quiet.
!pip -q install transformers datasets evaluate

In [None]:
import numpy as np
import pandas as pd
import torch

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from datasets import Dataset
import evaluate

In [None]:
import tqdm

In [None]:
from google.colab import drive
# Mount Google Drive at /gdrive; the project paths used later in the notebook live under this mount point.
drive.mount("/gdrive")

In [None]:
from pathlib import Path
# Root of the project folder on the mounted drive; the data and results paths below are built from it.
basepath = Path("/gdrive/My Drive/Interpretable Misinformation Detection/")

## Parameters

In [None]:
# Hugging Face checkpoint name used to load both the tokenizer and the classifier.
# The sentiment checkpoint is kept commented out as the alternative setting:
# MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
# NOTE(review): the results CSV written at the end of the notebook is always named
# "tweets-emotion.csv", so adjust that file name when switching checkpoints.
MODEL = "cardiffnlp/twitter-roberta-base-emotion"

## Data

In [None]:
# Read the three input tables from the project folder:
# community-notes labels, community-notes tweets, and news tweets.
labels_df = pd.read_csv(basepath / "data/community-notes/community_notes.csv")
tweets_df = pd.read_csv(basepath / "data/community-notes/tweets.csv")
news_tweets_df = pd.read_csv(basepath / "data/news_tweets.csv")

In [None]:
# Keep only rows that have tweet text; rows whose `content` is missing are dropped.
tweets_df = tweets_df.dropna(subset=["content"])
news_tweets_df = news_tweets_df.dropna(subset=["content"])

In [None]:
# Build one {tweetId: {column: value}} lookup table per frame.
# NOTE(review): orient="index" needs unique index values, so this presumably
# relies on each CSV holding a single row per tweetId — confirm.
labels_dict, tweets_dict, news_tweets_dict = (
    frame.set_index("tweetId").to_dict(orient="index")
    for frame in (labels_df, tweets_df, news_tweets_df)
)

In [None]:
# Ids that have both a community-notes label row and tweet text.
cn_tweets_ids = list(set(labels_df["tweetId"].tolist()) & set(tweets_df["tweetId"].tolist()))
# Every news tweet that still has text (a view over the lookup table's keys).
news_tweets_ids = news_tweets_dict.keys()
# All ids from either source, de-duplicated.
tweet_ids = list(set(cn_tweets_ids) | set(news_tweets_ids))

# Community-notes examples take their label from the `misleading` column.
cn_data = [
    {"id": tid, "text": tweets_dict[tid]["content"], "label": int(labels_dict[tid]["misleading"])}
    for tid in cn_tweets_ids
]
# News tweets are all assigned label 0.
news_data = [
    {"id": tid, "text": news_tweets_dict[tid]["content"], "label": 0}
    for tid in news_tweets_ids
]

# Community-notes examples first, then news examples.
# NOTE(review): an id present in both sources appears twice here, once per source.
alldata = [*cn_data, *news_data]

In [None]:
# Seeded 80/20 shuffle split over positions in `alldata`.
np.random.seed(0)  # fix the global NumPy RNG so the shuffle below repeats
inds = np.arange(len(alldata), dtype=int)
np.random.shuffle(inds)  # shuffles in place
# First 80% (rounded down) of the shuffled positions go to train, the rest to validation.
k = int(np.floor(0.8 * len(inds)))
train_inds, val_inds = inds[:k], inds[k:]

In [None]:
# Wrap the full example list and the two split subsets as `Dataset` objects.
dataset_all = Dataset.from_list(alldata)
dataset_train = Dataset.from_list([alldata[pos] for pos in train_inds])
dataset_val = Dataset.from_list([alldata[pos] for pos in val_inds])

## Model

In [None]:
# Load the tokenizer and the sequence-classification model for the checkpoint named in MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
# Text-classification pipeline around the loaded model and tokenizer.
# Run on the first GPU when one is available; otherwise fall back to the CPU
# instead of failing on runtimes that have no CUDA device.
inf_pipeline = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
)

## Inference

In [None]:
# Classify every example one at a time and key the raw pipeline output by tweet id.
# Inputs are truncated to 512 tokens (the RoBERTa-base input limit) so that
# over-length text is clipped instead of raising inside the model. max_length is
# passed explicitly because truncation alone has no effect when the checkpoint's
# tokenizer does not define a maximum length.
results = {
    ex["id"]: inf_pipeline(ex["text"], truncation=True, max_length=512)
    for ex in tqdm.tqdm(dataset_all)
}

In [None]:
# One row per tweet id, in `tweet_ids` order: the id plus the fields of the first
# entry of that tweet's pipeline output. Ids without a stored result are skipped.
results_list = [
    {"tweetId": tid, **results[tid][0]}
    for tid in tweet_ids
    if results.get(tid) is not None
]
results_df = pd.DataFrame(results_list)

# Create the results folder if needed; to_csv does not create missing directories.
out_path = basepath.joinpath("results/tweets-emotion.csv")
out_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(out_path, index=False)

In [None]:
# Display the predictions table as the cell output.
results_df