In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

df = pd.read_csv('updated_final_annotated_dataset_with_impacts (1).csv')

impact_columns = ['default_impact', 'mergers_acquisitions_impact', 'revenue_impact', 'margin_profitability_impact', 'industry_competition_impact']
df['combined_impact'] = df[impact_columns].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

label_to_id = {'good': 1, 'neutral': 0, 'bad': -1} 
df['impact_numerical'] = df['combined_impact'].map(label_to_id).fillna(0).astype(int)

text_column = 'content'  
texts = df[text_column].tolist()
impacts = df['impact_numerical'].tolist()

model_name = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

max_length = 512  
inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}  # No need for additional tensor creation
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

dataset = NewsDataset(inputs, impacts)
dataloader = DataLoader(dataset, batch_size=16)

def get_predictions(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = get_predictions(dataloader, model)



  return self.fget.__get__(instance, owner)()


In [3]:
from sklearn.metrics import accuracy_score

df['true_impact_numerical'] = df['combined_impact'].map(label_to_id).fillna(0).astype(int)
true_labels = df['true_impact_numerical'].tolist()


accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy of the model is: {accuracy:.4f}')


Accuracy of the model is: 0.7237


In [9]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

text_column = 'content'
true_label_column = 'impact_numerical'

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column], df[true_label_column], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.values)
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

def get_predictions(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

test_predictions = get_predictions(test_dataloader, model)

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy on the test set: {test_accuracy:.4f}')




Accuracy on the test set: 0.7337
