In [None]:
# Location of the CSV file that is loaded into `df_en_raw` further down.
DATA_PATH = '../data_preparation/whole_dataset.csv'

# Imports

In [None]:
import torch
import sys

sys.path.append('../lib/sentiment_analysis_utils')
sys.path.append('../lib')
from sentiment_analysis_utils import combine_lede_and_text, remove_text_formatting, read_all_news_in_dir

import os
from tqdm import tqdm
import pandas as pd

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Device index handed to the transformers pipelines (0 = first GPU, -1 = CPU).
# NOTE(review): `device` is a torch.device but is compared to the str 'cuda:0';
# this comparison looks like it never matches, so -1 (CPU) would always be
# selected even when a GPU is present — confirm. `device.type == 'cuda'` would be
# the reliable check, but switching the models to the GPU also requires that the
# tensors fed to them are placed on the same device.
transformers_device = 0 if device == 'cuda:0' else -1
# print(torch.cuda.get_device_name(0))

# Read the data

In [None]:
# Load the dataset found at DATA_PATH into a DataFrame.
df_en_raw = pd.read_csv(DATA_PATH)
# Bare last expression: shows the frame as the cell output.
df_en_raw

Load the models below. For now we have decided on SieBERT (`siebert-roberta`).

In [None]:
from transformers import pipeline

# Candidate sentiment models, keyed by a short name. Each entry below is
# (name, pipeline task, Hugging Face checkpoint); every pipeline is created on
# `transformers_device`. The label notes are the class indices each checkpoint
# is expected to emit.
models = {
    model_name: pipeline(task, model=checkpoint, device=transformers_device)
    for model_name, task, checkpoint in (
        # labels: 0=neg, 1=pos
        ("siebert-roberta", "sentiment-analysis", "siebert/sentiment-roberta-large-english"),
        # labels: 0=neg, 1=neutral, 2=pos
        ("financial-bert", "text-classification", "ahmedrachid/FinancialBERT-Sentiment-Analysis"),
        # labels: 0=neg, 1=neutral, 2=pos
        ("auditor_sentiment", "text-classification", "FinanceInc/auditor_sentiment_finetuned"),
        # labels: 0=neg, 1=neutral, 2=pos
        ("twitter-roberta", "text-classification", "cardiffnlp/twitter-roberta-base-sentiment-latest"),
        # labels: 0=neg, 1=neutral, 2=pos
        ("financial-roberta", "text-classification", "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"),
    )
}

Because the models accept at most 512 tokens, longer articles are processed in consecutive chunks of up to 512 tokens. The logits of all chunks are then aggregated by taking their mean.

In [None]:
def predict_article_sentiment(article_text, sentiment_pipeline, neutral_threshold=0.5):
    """Classify a whole article as negative (0), neutral (1) or positive (2).

    The article is tokenized once and cut into consecutive chunks of at most
    ``min(tokenizer.model_max_length, 512)`` tokens. Every chunk is passed
    through the pipeline's model, the per-chunk logits are averaged, and the
    softmax of that average decides the label.

    Parameters
    ----------
    article_text : str
        Text of the article to classify.
    sentiment_pipeline
        Object exposing ``tokenizer`` (with ``encode`` and ``model_max_length``)
        and ``model`` (callable with ``input_ids``/``attention_mask`` that
        returns an object with ``.logits``, and has a ``.device`` attribute).
    neutral_threshold : float, default 0.5
        Only used for two-class (negative/positive) models: when the winning
        class probability is below this value the article is reported as
        neutral. With the default a two-class model never yields neutral,
        because the larger of two probabilities is always >= 0.5.

    Returns
    -------
    int
        0 = negative, 1 = neutral, 2 = positive. For models with more than two
        classes the arg-max class index is returned unchanged.

    Raises
    ------
    ValueError
        If the tokenizer produces no tokens for ``article_text``.
    """
    tokenizer = sentiment_pipeline.tokenizer
    model = sentiment_pipeline.model

    token_ids = torch.tensor(tokenizer.encode(article_text)).unsqueeze(0)
    n_tokens = token_ids.shape[1]
    if n_tokens == 0:
        raise ValueError("article_text produced no tokens")
    # The inputs must live on the same device as the model weights.
    token_ids = token_ids.to(model.device)

    max_tokens = min(tokenizer.model_max_length, 512)

    chunk_logits = []
    # Pure inference: no autograd graph is needed for any chunk.
    with torch.no_grad():
        # Stepping by max_tokens never produces an empty trailing chunk.
        for start in range(0, n_tokens, max_tokens):
            chunk = token_ids[:, start:start + max_tokens]
            output = model(input_ids=chunk, attention_mask=torch.ones_like(chunk))
            chunk_logits.append(output.logits)

    # Aggregate the chunks by averaging their logits, then normalise.
    probabilities = torch.cat(chunk_logits).mean(0).softmax(0)
    predicted = probabilities.argmax().item()

    if len(probabilities) == 2:
        # Two-class model: map {0, 1} onto {0 (negative), 2 (positive)}.
        # The confidence check uses the winning probability, so a positive
        # prediction is no longer reported as neutral.
        if probabilities.max().item() < neutral_threshold:
            return 1  # neutral
        return 2 * predicted
    return predicted

## Check for one article

In [None]:
# Pick the `text` value of the row labelled 21 as a sample article.
# NOTE(review): the full-data run further down reads the `whole_text` column,
# while this sample uses `text` — confirm the difference is intended.
text = df_en_raw.text[21]
# Bare last expression: shows the sample article as the cell output.
text

In [None]:
# Run the sample article through every loaded model and report each label.
for name, model in models.items():
    predicted = predict_article_sentiment(text, model)
    print(f"model: {name} predicted {predicted}")

In [None]:
# test_df = pd.read_csv(os.path.join('..', 'data_preparation', 'testset.csv'), index_col=0)
# for i, row in tqdm(test_df.iterrows()):
#     test_df.loc[i, 'overall_sentiment'] = predict_article_sentiment(row.whole_text, sentiment_analysis)

In [None]:
#test_df.to_csv("test_df_overall_sentiment.csv")

## Full data evaluation

Make predictions and save the file at the end.

In [None]:
# Column that receives the predicted label (0 = negative, 1 = neutral, 2 = positive).
# NOTE(review): the name is the literal 'overall_sentiment_name' for every model
# (it is not interpolated with the model name). It is kept unchanged so the CSV
# schema stays the same — confirm whether a per-model column was intended.
sentiment_column = 'overall_sentiment_name'

for name, model in models.items():
    # All models write into the same column, so clear it before each model.
    # Otherwise this model's intermediate checkpoints would still carry the
    # previous model's labels for the rows that have not been processed yet.
    df_en_raw[sentiment_column] = float('nan')

    # `total` lets tqdm show real progress; `desc` tells which model is running.
    for i, row in tqdm(df_en_raw.iterrows(), total=len(df_en_raw), desc=name):
        df_en_raw.loc[i, sentiment_column] = predict_article_sentiment(row.whole_text, model)

        # Write an intermediate checkpoint every 100 rows so a crash does not
        # lose all the work done so far.
        if i % 100 == 0:
            df_en_raw.to_csv(f"full_dataseet_overall_sentiment_{name}_{i}.csv")

    # Final result for this model.
    df_en_raw.to_csv(f"full_dataseet_overall_sentiment_{name}.csv")
