In [45]:
### Core packages
import numpy as np
import pandas as pd
import torch
import sqlite3 as sql
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast

# Load the ECs2 table from the SQLite database into a DataFrame
conn = sql.connect('data.db')
try:
    ECs = pd.read_sql_query("SELECT * from ECs2", conn)
finally:
    # Close the connection even when the query fails (e.g. missing table),
    # so a failed cell does not leave the database file open.
    conn.close()

# Custom Dataset for batching windows
class TextWindowDataset(Dataset):
    """Dataset whose items are the overlapping word windows of each text.

    Every text is split on whitespace and cut into windows of at most
    ``window_size`` words; consecutive windows start
    ``window_size - overlap`` words apart. Item ``idx`` is the list of
    window strings built from the ``idx``-th text.

    Args:
        texts: Iterable of strings. ``None`` and float NaN entries (missing
            values) produce an empty window list instead of raising.
        tokenizer: Stored on the instance; not used by this class itself.
        window_size: Maximum number of words per window.
        overlap: Number of words shared by two consecutive windows.
        max_length: Stored on the instance; not used by this class itself.

    Raises:
        ValueError: If ``overlap >= window_size``. The window start would
            then never advance (or move backwards), which previously either
            failed inside ``range`` or silently produced no windows at all.
    """

    def __init__(self, texts, tokenizer, window_size=300, overlap=5, max_length=512):
        if window_size - overlap <= 0:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than window_size ({window_size})"
            )
        self.texts = texts
        self.tokenizer = tokenizer
        self.window_size = window_size
        self.overlap = overlap
        self.max_length = max_length
        self.windows = self._create_windows()

    @staticmethod
    def _is_missing(text):
        """Return True for ``None`` and float NaN (how NULLs arrive from pandas)."""
        return text is None or (isinstance(text, float) and text != text)

    def _create_windows(self):
        """Build one list of window strings per text, in input order."""
        step = self.window_size - self.overlap
        all_windows = []
        for text in self.texts:
            # A missing text gets an empty list so that positions still line
            # up one-to-one with the input rows.
            words = [] if self._is_missing(text) else text.split()
            windows = []
            for start in range(0, len(words), step):
                window = words[start:start + self.window_size]
                if len(window) > 0:
                    windows.append(" ".join(window))
            all_windows.append(windows)
        return all_windows

    def __len__(self):
        # Items are served from self.windows, so report its length.
        return len(self.windows)

    def __getitem__(self, idx):
        return self.windows[idx]

def get_sentiment_batched(dataset, model, tokenizer, device, batch_size=32, max_length=512):
    """Score every text in ``dataset`` with a sequence-classification model.

    Args:
        dataset: Iterable yielding, per text, a list of window strings
            (e.g. a ``TextWindowDataset``).
        model: Called as ``model(input_ids, attention_mask=...)``; the result
            must expose ``.logits``. The model is switched to eval mode.
        tokenizer: Called on the list of windows with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors="pt"``; the
            result must provide ``'input_ids'`` and ``'attention_mask'``.
        device: ``torch.device`` (or device string) the model lives on.
        batch_size: Number of windows per forward pass.
        max_length: Token limit passed to the tokenizer.

    Returns:
        A list with one entry per text: the mean of the predicted class
        indices over that text's windows, or ``np.nan`` when the text has no
        windows.
    """
    model.eval()
    # torch.cuda.amp.autocast only applies to CUDA; enabling it for a CPU
    # device has no effect beyond a warning, so switch it on for CUDA only.
    use_amp = torch.device(device).type == "cuda"
    all_scores = []

    with torch.no_grad():
        for windows in dataset:
            if not windows:  # Skip empty window sets
                all_scores.append(np.nan)
                continue

            # Tokenize all windows for this text
            inputs = tokenizer(
                windows,
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt"
            )

            # Batch this text's windows. A separate local name is used so the
            # `dataset` argument being iterated is not rebound mid-loop.
            window_tensors = torch.utils.data.TensorDataset(
                inputs['input_ids'], inputs['attention_mask']
            )
            dataloader = DataLoader(window_tensors, batch_size=batch_size)

            scores = []
            for input_ids, attention_mask in dataloader:
                # Move one batch at a time so device memory is bounded by
                # batch_size rather than by the number of windows in a text.
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                with autocast(enabled=use_amp):  # Mixed precision
                    outputs = model(input_ids, attention_mask=attention_mask)
                probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
                predicted_indices = torch.argmax(probs, dim=-1).cpu().numpy()
                scores.extend(predicted_indices)

            # NOTE(review): this averages class *indices*, so the value's
            # meaning depends on the model's id2label ordering - confirm the
            # ordering is comparable before comparing models.
            all_scores.append(np.mean(scores) if scores else np.nan)

    return all_scores

The main loop below took several hours to run, and its results are already saved in the SQLite database (table `ECs_with_bart`). It is therefore commented out; the following cell loads the saved results instead of recomputing them.

In [46]:
### Models we will work with
# Each pair is (short label, model identifier). The label is the key used to
# look the model up; the identifier is the string handed to from_pretrained.
model_names = dict(
    [
        ("financialbert", "ahmedrachid/FinancialBERT-Sentiment-Analysis"),
        ("distilroberta", "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"),
        ("financial-roberta", "soleimanian/financial-roberta-large-sentiment"),
    ]
)

# ### Main loop
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# for i in model_names:
#     print(f"\n{'='*40}\n")
#     print(f"Running model - {i} ({model_names[i]})")
#     print(f"\n{'='*40}\n")

#     # Load tokenizer and model
#     tokenizer = AutoTokenizer.from_pretrained(model_names[i])
#     config = AutoConfig.from_pretrained(model_names[i], output_hidden_states=True)
#     model = AutoModelForSequenceClassification.from_pretrained(model_names[i], config=config)
#     model = model.to(device)

#     # Create dataset
#     dataset = TextWindowDataset(ECs['PA'].tolist(), tokenizer, window_size=300, overlap=5)

#     # Run batched sentiment analysis
#     ECs[i] = get_sentiment_batched(dataset, model, tokenizer, device, batch_size=32)

#     # Print model summary
#     print(f"Model successfully loaded and run.")
#     print(f"\n{'='*40}\n")
#     print(f"Model Summary:")
#     print(f"  - Model Name: {i} ({model_names[i]})")
#     print(f"  - Labels: {model.config.id2label}")
#     print(f"  - Device: {device}")
#     print(f"{'='*40}\n")

# ### Save the results
# conn = sql.connect('data.db')
# ECs.to_sql('ECs_with_bart', conn, if_exists='replace', index=False)
# conn.close()

In [48]:
### Getting the stored sentiment scores
# NOTE: despite the "bart" in the table names, the stored columns are the
# scores of the three BERT/RoBERTa models listed in model_names.
conn = sql.connect('data.db')
try:
    ECs_with_bart = pd.read_sql_query("SELECT * from ECs_with_bart", conn)
finally:
    # Close the connection even if the query fails.
    conn.close()

### Getting only the values we need
sentiment_columns = ['financialbert', 'distilroberta', 'financial-roberta']
ECs_with_bart = ECs_with_bart[sentiment_columns]

# Drop any score columns ECs already carries (from a previous run of this
# cell or of the main loop) so the cell can be re-run; without this, join
# raises on the overlapping column names.
# NOTE(review): join aligns rows by index, so this assumes ECs and
# ECs_with_bart are in the same row order - confirm against how the table
# was written.
ECs = ECs.drop(columns=sentiment_columns, errors='ignore').join(ECs_with_bart, how='inner')

### Save the results
conn = sql.connect('data.db')
try:
    ECs_with_bart.to_sql('bart', conn, if_exists='replace', index=False)
    ECs.to_sql('ECs3', conn, if_exists='replace', index=False)
finally:
    # Close the connection even if one of the writes fails.
    conn.close()