In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch

df = pd.read_csv('updated_final_annotated_dataset_with_impacts (1).csv')

label_to_id = {'good': 1, 'neutral': 0, 'bad': -1} 
df['impact_numerical'] = df['combined_impact'].map(label_to_id).fillna(0).astype(int)

text_column = 'content'  
texts = df[text_column].tolist()
impacts = df['impact_numerical'].tolist()

model_name = 'yiyanghkust/finbert-tone'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

max_length = 512  
inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")

class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}  # No need for additional tensor creation
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

dataset = NewsDataset(inputs, impacts)
dataloader = DataLoader(dataset, batch_size=16)

def get_predictions(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(model.device)
            attention_mask = batch['attention_mask'].to(model.device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

predictions = get_predictions(dataloader, model)



In [3]:
from sklearn.metrics import accuracy_score

df['true_impact_numerical'] = df['combined_impact'].map(label_to_id).fillna(0).astype(int)
true_labels = df['true_impact_numerical'].tolist()


accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy of the model is: {accuracy:.4f}')


Accuracy of the model is: 0.7237


In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

text_column = 'content'
true_label_column = 'impact_numerical'

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column], df[true_label_column], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.values)
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

def get_predictions(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

test_predictions = get_predictions(test_dataloader, model)

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy on the test set: {test_accuracy:.4f}')


### With Impact

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

event_keywords = {
    'default': ['bankruptcy', 'default', 'insolvency'],
    'mergers_acquisitions': ['merger', 'acquisition', 'takeover'],
    'revenue': ['revenue', 'sales', 'earnings'],
    'margin_profitability': ['margin', 'profitability', 'operating income'],
    'industry_competition': ['competition', 'market share', 'competitor']
}

def is_event_mentioned(text, keywords):
    for word in keywords:
        if word in text.lower():
            return True
    return False

df = pd.read_csv('updated_final_annotated_dataset_with_impacts (1).csv')

for event_type, keywords in event_keywords.items():
    column_name = f'{event_type}_mentioned'
    df[column_name] = df['content'].apply(lambda text: is_event_mentioned(text, keywords))

for event_type in event_keywords:
    impact_column = f'{event_type}_impact'
    mentioned_column = f'{event_type}_mentioned'
    df[impact_column] = df.apply(lambda row: row[impact_column] if row[mentioned_column] else 'no_event', axis=1)

df['filtered_impacts'] = df[[f'{event_type}_impact' for event_type in event_keywords]].apply(lambda row: ' '.join(str(val) for val in row.values), axis=1)
df['mentioned_events'] = df.apply(lambda row: ' '.join([event_type for event_type in event_keywords if row[f'{event_type}_mentioned']]), axis=1)


label_to_id = {'good': 1, 'neutral': 0, 'bad': -1, 'no_event': 0}
df['impact_numerical'] = df['filtered_impacts'].apply(lambda impacts: max([label_to_id.get(impact, 0) for impact in impacts.split()]))


text_column = 'content'
true_label_column = 'impact_numerical'

train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column], df[true_label_column], test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.values)
)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

def get_predictions(dataloader, model):
    model.eval()
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            predictions.extend(preds.cpu().numpy())
    return predictions

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

test_predictions = get_predictions(test_dataloader, model)

test_accuracy = accuracy_score(test_labels, test_predictions)
print(f'Accuracy on the test set: {test_accuracy:.4f}')


In [8]:
df.head()

Unnamed: 0,index,summary,description,Ticker,Sector,Industry,Company,pubDate_brief,pubDate,categories,...,margin_profitability_impact,industry_competition_impact,default_mentioned,mergers_acquisitions_mentioned,revenue_mentioned,margin_profitability_mentioned,industry_competition_mentioned,filtered_impacts,mentioned_events,impact_numerical
0,12024,Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äö√Ñ√¥s firs...,MGM,Services,Casinos & Gaming,MGM Resorts International,2023-05-18,2023-05-18T21:25:29+00:00,[{'name': 'Health'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,,0
1,20675,MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,Financials,Insurance,Metlife Inc,2022-10-31,2022-10-31T20:36:25+00:00,[],...,no_event,good,False,False,True,False,True,no_event no_event good no_event good,revenue industry_competition,1
2,33685,"This week, top-five producer AngloGold Ashanti...",(Bloomberg) -- The momentum has been building ...,NEM,Extractives & Minerals Processing,Metals & Mining,Newmont Corp,2023-02-08,2023-02-08T22:16:21+00:00,[{'name': 'Politics'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,,0
3,12072,The case is In re Tesla Inc Securities Litigat...,Some of the biggest securities cases of 2023 a...,NDAQ,Financials,Security & Commodity Exchanges,Nasdaq Inc,2023-05-18,2023-05-18T14:28:52+00:00,[{'name': 'Tech'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,,0
4,28164,"CFOs Boost Currency Protections, Extend Hedge ...","Coca-Cola, Kimberly-Clark and Prologis are amo...",KO,Food & Beverage,Non-Alcoholic Beverages,Coca-Cola Co,2023-05-04,2023-05-04T23:39:33+00:00,[{'name': 'Tech'}],...,good,good,False,True,True,True,True,no_event good good good good,mergers_acquisitions revenue margin_profitabil...,1
