### With Impact

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.nn.functional import softmax

# Define keywords for events
event_keywords = {
    'default': ['bankruptcy', 'default', 'insolvency'],
    'mergers_acquisitions': ['merger', 'acquisition', 'takeover'],
    'revenue': ['revenue', 'sales', 'earnings'],
    'margin_profitability': ['margin', 'profitability', 'operating income'],
    'industry_competition': ['competition', 'market share', 'competitor']
}

# Check if event is mentioned in the text
def is_event_mentioned(text, keywords):
    for word in keywords:
        if word in text.lower():
            return True
    return False

# Function to compute sentiment score
def get_sentiment_score(text, tokenizer, model, device):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        probabilities = softmax(logits, dim=1).cpu().numpy()
    
    sentiment_score = probabilities[0, 2] - probabilities[0, 0]
    return sentiment_score

# Load the dataset
df = pd.read_csv('updated_final_annotated_dataset_with_impacts.csv')

# Apply the event checking function to each event type and create a new column for it
for event_type, keywords in event_keywords.items():
    column_name = f'{event_type}_mentioned'
    df[column_name] = df['content'].apply(lambda text: is_event_mentioned(text, keywords))

# Filter the impact scores based on whether the corresponding event is mentioned
for event_type in event_keywords:
    impact_column = f'{event_type}_impact'
    mentioned_column = f'{event_type}_mentioned'
    df[impact_column] = df.apply(lambda row: row[impact_column] if row[mentioned_column] else 'no_event', axis=1)

# Combine the filtered impact scores into a single column
df['filtered_impacts'] = df[[f'{event_type}_impact' for event_type in event_keywords]].apply(lambda row: ' '.join(str(val) for val in row.values), axis=1)

# Map the categorical labels to integers
label_to_id = {'good': 1, 'neutral': 0, 'bad': -1, 'no_event': 0}
df['impact_numerical'] = df['filtered_impacts'].apply(lambda impacts: max([label_to_id.get(impact, 0) for impact in impacts.split()]))

# Load tokenizer and model for sentiment analysis
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
model = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone')

# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Compute the sentiment score for each article and add it as a column
df['sentiment_score'] = df['content'].apply(lambda text: get_sentiment_score(text, tokenizer, model, device))







  return self.fget.__get__(instance, owner)()


In [2]:
# Specify the column names
text_column = 'content'
true_label_column = 'impact_numerical'

# Split the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df[text_column], df[true_label_column], test_size=0.2, random_state=42)

# Tokenize the training and test data
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=512)

# Convert labels to numpy and then to tensors
train_dataset = TensorDataset(
    torch.tensor(train_encodings['input_ids']),
    torch.tensor(train_encodings['attention_mask']),
    torch.tensor(train_labels.values)
)
test_dataset = TensorDataset(
    torch.tensor(test_encodings['input_ids']),
    torch.tensor(test_encodings['attention_mask']),
    torch.tensor(test_labels.values)
)

# DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16)

# Training loop
optimizer = AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):  # Number of epochs
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed")

# Evaluate the model
def evaluate_model(dataloader, model):
    model.eval()
    total_accuracy = 0
    for batch in dataloader:
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=-1)
            accuracy = (preds == labels).float().mean()
            total_accuracy += accuracy.item()

    return total_accuracy / len(dataloader)

test_accuracy = evaluate_model(test_dataloader, model)
print(f"Test accuracy: {test_accuracy:.4f}")




Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
Test accuracy: 0.8741


In [4]:
df.head()

Unnamed: 0,index,summary,description,Ticker,Sector,Industry,Company,pubDate_brief,pubDate,categories,...,margin_profitability_impact,industry_competition_impact,default_mentioned,mergers_acquisitions_mentioned,revenue_mentioned,margin_profitability_mentioned,industry_competition_mentioned,filtered_impacts,impact_numerical,sentiment_score
0,12024,Osaka Governor Hirofumi Yoshimura said that th...,Years of delay to plans for Japan‚Äö√Ñ√¥s firs...,MGM,Services,Casinos & Gaming,MGM Resorts International,2023-05-18,2023-05-18T21:25:29+00:00,[{'name': 'Health'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,-0.99978
1,20675,MetLife (MET) is a Finance stock that has seen...,Dividends are one of the best benefits to bein...,MET,Financials,Insurance,Metlife Inc,2022-10-31,2022-10-31T20:36:25+00:00,[],...,no_event,good,False,False,True,False,True,no_event no_event good no_event good,1,-0.999844
2,33685,"This week, top-five producer AngloGold Ashanti...",(Bloomberg) -- The momentum has been building ...,NEM,Extractives & Minerals Processing,Metals & Mining,Newmont Corp,2023-02-08,2023-02-08T22:16:21+00:00,[{'name': 'Politics'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,0.985379
3,12072,The case is In re Tesla Inc Securities Litigat...,Some of the biggest securities cases of 2023 a...,NDAQ,Financials,Security & Commodity Exchanges,Nasdaq Inc,2023-05-18,2023-05-18T14:28:52+00:00,[{'name': 'Tech'}],...,no_event,no_event,False,False,False,False,False,no_event no_event no_event no_event no_event,0,-0.882581
4,28164,"CFOs Boost Currency Protections, Extend Hedge ...","Coca-Cola, Kimberly-Clark and Prologis are amo...",KO,Food & Beverage,Non-Alcoholic Beverages,Coca-Cola Co,2023-05-04,2023-05-04T23:39:33+00:00,[{'name': 'Tech'}],...,good,good,False,True,True,True,True,no_event good good good good,1,6.1e-05


In [5]:
df.to_csv('modified_dataset.csv', index=False)

print("Dataset has been saved to 'modified_dataset.csv'.")


Dataset has been saved to 'modified_dataset.csv'.
