# Sentiment Analysis

In [None]:
import psycopg2
from psycopg2 import sql

from collections import OrderedDict
from typing import Optional, Callable, List, Tuple
import json

from tqdm import tqdm

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

In [None]:
# Load tweet text, article title, article summary, and article sentences.
# Each row of `texts` is (id, tweet text, article title, article summary, sentences);
# the LEFT JOINs mean the tweet/article columns can be NULL (None) for a given id.

conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    with conn.cursor() as cursor:
        cursor.execute('''SELECT wordbags.id, tweets.text, articles.title, articles.summary, wordbags.sentences
                        FROM wordbags
                        LEFT JOIN articles ON wordbags.id = articles.id
                        LEFT JOIN tweets ON wordbags.id = tweets.id;''')
        texts = cursor.fetchall()
finally:
    # Close the connection even when the query fails, so it is not leaked.
    conn.close()

In [None]:
def updateFieldsinDB(fields, table_name, cursor):
    """Insert a row into `table_name`, or update it if its `id` already exists.

    Builds a single ``INSERT ... ON CONFLICT (id) DO UPDATE`` statement with
    psycopg2's ``sql`` composition helpers and executes it on `cursor`.

    Args:
        fields: Non-empty mapping of column name to value for one row.
        table_name: Name of the target table (quoted as an identifier).
        cursor: Open psycopg2 cursor used to execute the statement.

    Raises:
        ValueError: If `fields` is empty (no statement can be built).
    """
    if not fields:
        raise ValueError('fields must contain at least one column')

    columns = list(fields)
    column_list = sql.SQL(', ').join(map(sql.Identifier, columns))
    value_list = sql.SQL(', ').join(sql.Literal(fields[column]) for column in columns)
    # EXCLUDED refers to the row proposed for insertion, so each value is
    # rendered only once instead of being repeated in the SET clause.
    assignments = sql.SQL(', ').join(
        sql.SQL('{} = EXCLUDED.{}').format(sql.Identifier(column), sql.Identifier(column))
        for column in columns
    )
    command = sql.SQL("""INSERT INTO {} ({})
        VALUES ({})
        ON CONFLICT (id) DO UPDATE
        SET {};""").format(sql.Identifier(table_name), column_list, value_list, assignments)
    cursor.execute(command)


def remove_http(text):
    """Return `text` with every whitespace-separated token starting with 'http' dropped.

    Remaining tokens are re-joined with single spaces.
    """
    kept_tokens = (token for token in text.split() if token[:4] != 'http')
    return ' '.join(kept_tokens)


def average_dictionary(dicts : List[dict]) -> dict:
    """Average the values of several dictionaries key by key.

    The keys are taken from the first dictionary; every dictionary in
    `dicts` must contain each of those keys.
    """
    count = len(dicts)
    return {key: sum(entry[key] for entry in dicts) / count for key in dicts[0]}


def sentiment_processing(predict : Callable, filter_condition : Optional[Callable] = None) -> Tuple[List, List, List, List, List]:
    """Score every row of the module-level `texts` with `predict`.

    For each row the tweet text (with 'http' tokens removed), the article
    title and the article summary are scored directly. The article body is
    scored sentence by sentence; sentence scores rejected by
    `filter_condition` are dropped and the rest are averaged key by key.
    If no sentence score survives the filter, the sentences are joined with
    spaces and scored as one text instead.

    Args:
        predict: Callable mapping a string to a dict of scores.
        filter_condition: Optional predicate on a sentence's score dict;
            only scores for which it returns a true value are averaged.
            Defaults to keeping every sentence.

    Returns:
        Five parallel lists: ids (as strings), tweet scores, title scores,
        summary scores and article-body scores. A score is None wherever
        the corresponding source value is None.
    """
    # Resolve the default once rather than on every loop iteration.
    if filter_condition is None:
        filter_condition = lambda entry: True

    ids = []
    tweet_text_scores = []
    article_title_scores = []
    article_summary_scores = []
    article_main_scores = []
    for record_id, tweet_text, article_title, article_summary, article_sentences in tqdm(texts):
        ids.append(str(record_id))
        tweet_text_scores.append(None if tweet_text is None else predict(remove_http(tweet_text)))
        article_title_scores.append(None if article_title is None else predict(article_title))
        article_summary_scores.append(None if article_summary is None else predict(article_summary))

        # Treat a missing sentence list like the other missing fields
        # instead of failing with a TypeError when iterating None.
        if article_sentences is None:
            article_main_scores.append(None)
            continue

        sentence_scores = [score for score in map(predict, article_sentences) if filter_condition(score)]
        if sentence_scores:
            article_main_scores.append(average_dictionary(sentence_scores))
        else:
            # Nothing passed the filter: fall back to scoring the whole body at once.
            article_main_scores.append(predict(' '.join(article_sentences)))
    return ids, tweet_text_scores, article_title_scores, article_summary_scores, article_main_scores

In [None]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types.

    Pass as ``cls=NumpyEncoder`` to ``json.dump``/``json.dumps``. Objects it
    does not recognise are delegated to ``json.JSONEncoder.default``, which
    raises ``TypeError``.
    """

    def default(self, obj):
        # Any NumPy float width (float16/float32/...), not only float32.
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

## VADER

Rule-based sentiment analysis

In [None]:
# Create the VADER analyzer; its `polarity_scores` method is used as the predict function below.
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score all texts with VADER. The filter keeps only sentence scores whose 'neu' value
# is not exactly 1.0, so those sentences are left out of the per-article average.
vader_ids, vader_tweet_texts, vader_article_titles, vader_article_summaries, vader_article_main = sentiment_processing(analyzer.polarity_scores, lambda entry: entry['neu'] != 1.0)

In [None]:
# Collect the VADER results into one mapping of parallel lists and save it as JSON.
vader_sentiment_data = {
    'vader_ids': vader_ids,
    'vader_tweet_texts': vader_tweet_texts,
    'vader_article_titles': vader_article_titles,
    'vader_article_summaries': vader_article_summaries,
    'vader_article_main': vader_article_main,
}

with open('SentimentData/vader_sentiment_data.json', 'w') as f:
    json.dump(vader_sentiment_data, f)

## BERT

Transformer-based sentiment analysis

In [None]:
class SentimentModel:
    """Wrapper around a Hugging Face sequence-classification model.

    Loads the tokenizer, config and classifier for `model` and places the
    classifier on the GPU ("cuda:0") when one is available, otherwise on the CPU.
    """

    def __init__(self, model, max_length = None):
        if torch.cuda.is_available():
            self.device = "cuda:0"
        else:
            self.device = "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.config = AutoConfig.from_pretrained(model)
        classifier = AutoModelForSequenceClassification.from_pretrained(model)
        self.classifier = classifier.to(self.device)
        # Forwarded to the tokenizer as the truncation limit in predict().
        self.max_length = max_length

    # Would probably be more efficient to batch data rather than send one at a time...
    def predict(self, text):
        """Return a dict mapping each label in `config.id2label` to its softmax score for `text`."""
        encoded = self.tokenizer(
            text,
            return_tensors = 'pt',
            truncation = True,
            max_length = self.max_length,
        ).to(self.device)
        # No gradients are needed for inference, so the output can be
        # converted to NumPy without detaching.
        with torch.no_grad():
            logits = self.classifier(**encoded)[0][0]
        probabilities = softmax(logits.cpu().numpy())
        return {label: probabilities[idx] for idx, label in self.config.id2label.items()}

### DistilBERT from Hugging Face

In [None]:
# Load the 'distilbert-base-uncased-finetuned-sst-2-english' checkpoint (no explicit max_length).
distilbert = SentimentModel('distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Score all texts with DistilBERT; no sentence filter is given, so every sentence score is averaged.
distilbert_ids, distilbert_tweet_texts, distilbert_article_titles, distilbert_article_summaries, distilbert_article_main = sentiment_processing(distilbert.predict)

In [None]:
# Collect the DistilBERT results into one mapping of parallel lists and save it as JSON.
distilbert_sentiment_data = {
    'distilbert_ids': distilbert_ids,
    'distilbert_tweet_texts': distilbert_tweet_texts,
    'distilbert_article_titles': distilbert_article_titles,
    'distilbert_article_summaries': distilbert_article_summaries,
    'distilbert_article_main': distilbert_article_main,
}

# NumpyEncoder is needed because the model scores are NumPy floats.
with open('SentimentData/distilbert_sentiment_data.json', 'w') as f:
    json.dump(distilbert_sentiment_data, f, cls = NumpyEncoder)

### RoBERTa trained on ~124M tweets

In [None]:
# Load the 'cardiffnlp/twitter-roberta-base-sentiment-latest' checkpoint; inputs are truncated to 512 tokens.
roberta = SentimentModel(r'cardiffnlp/twitter-roberta-base-sentiment-latest', max_length = 512)

In [None]:
# Score all texts with RoBERTa; no sentence filter is given, so every sentence score is averaged.
roberta_ids, roberta_tweet_texts, roberta_article_titles, roberta_article_summaries, roberta_article_main = sentiment_processing(roberta.predict)

In [None]:
# Collect the RoBERTa results into one mapping of parallel lists and save it as JSON.
roberta_sentiment_data = {
    'roberta_ids': roberta_ids,
    'roberta_tweet_texts': roberta_tweet_texts,
    'roberta_article_titles': roberta_article_titles,
    'roberta_article_summaries': roberta_article_summaries,
    'roberta_article_main': roberta_article_main,
}

# NumpyEncoder is needed because the model scores are NumPy floats.
with open('SentimentData/roberta_sentiment_data.json', 'w') as f:
    json.dump(roberta_sentiment_data, f, cls = NumpyEncoder)

### SiEBERT - Sentiment RoBERTa trained on diverse English-language corpus

In [None]:
# Load the 'siebert/sentiment-roberta-large-english' checkpoint (no explicit max_length).
siebert = SentimentModel(r'siebert/sentiment-roberta-large-english')

In [None]:
# Score all texts with SiEBERT; no sentence filter is given, so every sentence score is averaged.
siebert_ids, siebert_tweet_texts, siebert_article_titles, siebert_article_summaries, siebert_article_main = sentiment_processing(siebert.predict)

In [None]:
# Collect the SiEBERT results into one mapping of parallel lists and save it as JSON.
siebert_sentiment_data = {
    'siebert_ids': siebert_ids,
    'siebert_tweet_texts': siebert_tweet_texts,
    'siebert_article_titles': siebert_article_titles,
    'siebert_article_summaries': siebert_article_summaries,
    'siebert_article_main': siebert_article_main,
}

# NumpyEncoder is needed because the model scores are NumPy floats.
with open('SentimentData/siebert_sentiment_data.json', 'w') as f:
    json.dump(siebert_sentiment_data, f, cls = NumpyEncoder)

## Storing in an SQL table

In [None]:
# Reload the four saved result files so this section can run without re-scoring.
with open('SentimentData/vader_sentiment_data.json', 'r') as f:
    vader_sentiment_data = json.loads(f.read())

with open('SentimentData/distilbert_sentiment_data.json', 'r') as f:
    distilbert_sentiment_data = json.loads(f.read())

with open('SentimentData/roberta_sentiment_data.json', 'r') as f:
    roberta_sentiment_data = json.loads(f.read())

with open('SentimentData/siebert_sentiment_data.json', 'r') as f:
    siebert_sentiment_data = json.loads(f.read())

In [None]:
# Create the `sentiment` table: one row per id, one REAL column per
# (model, text source, label) combination produced by the cells above.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    conn.autocommit = True
    with conn.cursor() as cursor:
        cursor.execute('''CREATE TABLE sentiment (

	id BIGINT PRIMARY KEY,

    vader_tweet_texts_neg REAL,
    vader_tweet_texts_neu REAL,
    vader_tweet_texts_pos REAL,
    vader_tweet_texts_compound REAL,
    vader_article_titles_neg REAL,
    vader_article_titles_neu REAL,
    vader_article_titles_pos REAL,
    vader_article_titles_compound REAL,
    vader_article_summaries_neg REAL,
    vader_article_summaries_neu REAL,
    vader_article_summaries_pos REAL,
    vader_article_summaries_compound REAL,
    vader_article_main_neg REAL,
    vader_article_main_neu REAL,
    vader_article_main_pos REAL,
    vader_article_main_compound REAL,

    distilbert_tweet_texts_negative REAL,
    distilbert_tweet_texts_positive REAL,
    distilbert_article_titles_negative REAL,
    distilbert_article_titles_positive REAL,
    distilbert_article_summaries_negative REAL,
    distilbert_article_summaries_positive REAL,
    distilbert_article_main_negative REAL,
    distilbert_article_main_positive REAL,

    roberta_tweet_texts_negative REAL,
    roberta_tweet_texts_positive REAL,
    roberta_tweet_texts_neutral REAL,
    roberta_article_titles_negative REAL,
    roberta_article_titles_positive REAL,
    roberta_article_titles_neutral REAL,
    roberta_article_summaries_negative REAL,
    roberta_article_summaries_positive REAL,
    roberta_article_summaries_neutral REAL,
    roberta_article_main_negative REAL,
    roberta_article_main_positive REAL,
    roberta_article_main_neutral REAL,

    siebert_tweet_texts_negative REAL,
    siebert_tweet_texts_positive REAL,
    siebert_article_titles_negative REAL,
    siebert_article_titles_positive REAL,
    siebert_article_summaries_negative REAL,
    siebert_article_summaries_positive REAL,
    siebert_article_main_negative REAL,
    siebert_article_main_positive REAL
);''')
finally:
    # Close the connection even if CREATE TABLE fails (e.g. the table already exists).
    conn.close()

In [None]:
def process_sentiment_record(entry):
    """Flatten one record of (key, value) pairs into a single-level dict.

    A pair whose key contains 'ids' becomes ``'id'`` with the value cast to
    int. A dict value is expanded into ``'<key>_<label lowercased>'`` entries.
    None values are skipped.

    Raises:
        TypeError: If a non-id value is neither None nor a dict.
    """
    flattened = {}
    for key, item in entry:
        if 'ids' in key:
            flattened['id'] = int(item)
            continue
        if item is None:
            continue
        if not isinstance(item, dict):
            raise TypeError(f'Unknown entry: {key}, {item}')
        flattened.update({key + '_' + label.lower(): score for label, score in item.items()})
    return flattened

def sentiment_pivot(sentiment_data, return_DataFrame = False):
    """Turn a mapping of parallel lists into one flattened record per row.

    Args:
        sentiment_data: Mapping from key to a list of values; the lists are
            zipped together so the i-th values form the i-th record.
        return_DataFrame: If true, return the records as a pandas DataFrame
            instead of a list of dicts.
    """
    keys, columns = zip(*sentiment_data.items())
    records = [process_sentiment_record(zip(keys, row)) for row in zip(*columns)]
    if return_DataFrame:
        return pd.DataFrame(records)
    return records

In [None]:
# Upsert the VADER scores into the `sentiment` table, one row per id.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # `with conn` commits on success and rolls back on error, but does not close the connection.
    with conn:
        with conn.cursor() as cursor:
            for row in sentiment_pivot(vader_sentiment_data):
                updateFieldsinDB(row, 'sentiment', cursor)
finally:
    conn.close()

In [None]:
# Upsert the DistilBERT scores into the `sentiment` table, one row per id.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # `with conn` commits on success and rolls back on error, but does not close the connection.
    with conn:
        with conn.cursor() as cursor:
            for row in sentiment_pivot(distilbert_sentiment_data):
                updateFieldsinDB(row, 'sentiment', cursor)
finally:
    conn.close()

In [None]:
# Upsert the RoBERTa scores into the `sentiment` table, one row per id.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # `with conn` commits on success and rolls back on error, but does not close the connection.
    with conn:
        with conn.cursor() as cursor:
            for row in sentiment_pivot(roberta_sentiment_data):
                updateFieldsinDB(row, 'sentiment', cursor)
finally:
    conn.close()

In [None]:
# Upsert the SiEBERT scores into the `sentiment` table, one row per id.
conn = psycopg2.connect(host = 'localhost', database = 'nytpopular')
try:
    # `with conn` commits on success and rolls back on error, but does not close the connection.
    with conn:
        with conn.cursor() as cursor:
            for row in sentiment_pivot(siebert_sentiment_data):
                updateFieldsinDB(row, 'sentiment', cursor)
finally:
    conn.close()