# Financial News Sentiment Analysis

In [None]:
!pip install datasets transformers evaluate
!pip install accelerate -U



In [None]:
import os
import numpy as np
import pandas as pd

from datasets import load_dataset, load_metric, Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

## Data

Dataset source: https://huggingface.co/datasets/financial_phrasebank

* 0 (negative)
* 1 (neutral)
* 2 (positive)

In [None]:
# Load the 'sentences_50agree' configuration of the Financial PhraseBank dataset;
# the bare expression on the last line displays the resulting DatasetDict.
fi_data = load_dataset('financial_phrasebank', 'sentences_50agree')
fi_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

In [None]:
# Copy the 'train' split into a pandas DataFrame for quick inspection and preview the first rows.
df = pd.DataFrame(fi_data['train'])
df.head()

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


In [None]:
# Print the share of each sentiment class as a small fixed-width table.
label_names = ('Negative', 'Neutral', 'Positive')  # position == label id (0, 1, 2)
col_width = 15

print(''.join(name.ljust(col_width) for name in label_names))
print('-' * (col_width * len(label_names)))

label_shares = [
    len(df[df['label'] == label_id]) / len(df)
    for label_id in range(len(label_names))
]
print(''.join(f"{share:.3g}".ljust(col_width) for share in label_shares))

Negative       Neutral        Positive       
---------------------------------------------
0.125          0.594          0.281          


### Split Dataset into Train and Test

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the shuffle reproducible.
# NOTE: `fi_data` is rebound from the raw dataset to the split DatasetDict, so re-running
# this cell without reloading the data would split the training subset a second time.
fi_data = fi_data['train'].train_test_split(test_size=0.2, shuffle=True, seed=123)
train_data = fi_data['train']
# Evaluation must use the held-out 'test' split, never the rows the model is trained on.
test_data = fi_data['test']
fi_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3876
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 970
    })
})

### Preprocessing

In [None]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "sentence" column of a batch with truncation enabled."""
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)


# Tokenize both splits in batches (train first, then test); no padding is requested
# here, it is left to `data_collator` below.
tok_train, tok_test = (
    split.map(preprocess_function, batched=True) for split in (train_data, test_data)
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [None]:
# Display the tokenized training set to check which columns the tokenization step added.
tok_train

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3876
})

## Finetuning DistilBERT

In [None]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification head
# configured for 3 labels (negative / neutral / positive).
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from sklearn import metrics

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` holds one score per
            class along its last axis; ``labels`` holds the true class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the label, as a Python float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Computed directly with NumPy instead of `datasets.load_metric`, which is
    # deprecated and was re-loading the metric on every evaluation call.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

In [None]:
# Fine-tuning hyperparameters. `output_dir='./'` points the Trainer's output at the
# current working directory.
# NOTE(review): with save_strategy="epoch" and 10 epochs this presumably writes one
# checkpoint per epoch into the working directory — confirm that is intended.
training_args = TrainingArguments(
   output_dir='./',
   learning_rate=2e-5,
   per_device_train_batch_size=16,
   per_device_eval_batch_size=16,
   num_train_epochs=10,
   weight_decay=0.01,
   save_strategy="epoch",
)

# Wire the model, the tokenized train/eval datasets, the padding collator and the
# accuracy metric function into a single Trainer.
trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=tok_train,
   eval_dataset=tok_test,
   tokenizer=tokenizer,
   data_collator=data_collator,
   compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments and datasets configured on `trainer`.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4372
1000,0.1451
1500,0.0456
2000,0.0234


TrainOutput(global_step=2430, training_loss=0.13564279128494577, metrics={'train_runtime': 279.4885, 'train_samples_per_second': 138.682, 'train_steps_per_second': 8.694, 'total_flos': 590844111927264.0, 'train_loss': 0.13564279128494577, 'epoch': 10.0})

In [None]:
# Evaluate the fine-tuned model on the Trainer's `eval_dataset` (`tok_test`);
# the returned dict includes the loss and the accuracy from `compute_metrics`.
trainer.evaluate()

{'eval_loss': 0.005310766864567995,
 'eval_accuracy': 0.9989680082559339,
 'eval_runtime': 8.2525,
 'eval_samples_per_second': 469.677,
 'eval_steps_per_second': 29.446,
 'epoch': 10.0}

In [None]:
# Save the fine-tuned model under the relative path 'FinNews_DistilBert'.
trainer.save_model('FinNews_DistilBert')

## Test with Unseen News

Let's test the model with some articles.

Negative: https://finance.yahoo.com/news/australias-optus-hit-national-network-222203676.html

Positive: https://au.finance.yahoo.com/news/commonwealth-bank-australia-posts-flat-211016933.html


In [None]:
# Two hand-labelled headlines: label 0 for the negative story, label 2 for the positive one.
unseen_headlines = [
    {"sentence": "Optus outage causes chaos in Australia before services restored", "label": 0},
    {"sentence": "Australia's CBA says home loan margins have stabilised, shares up", "label": 2},
]
# Build a Dataset from the records and tokenize it with the same function used for training.
eval_data = Dataset.from_list(unseen_headlines).map(preprocess_function, batched=True)

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuned model on the unseen headlines and pick the highest-scoring class per row.
# NOTE(review): despite its name, `pred_proba.predictions` is used here as raw per-class
# scores (presumably logits — no softmax is applied in this cell); argmax is unaffected.
pred_proba = trainer.predict(eval_data)
preds = np.argmax(pred_proba.predictions, axis=-1)

In [None]:
# Show each headline next to its predicted and true label id.
for row_idx, predicted_label in enumerate(preds):
    example = eval_data[row_idx]
    print(f"Sentence: {example['sentence']}\nPrediction: {predicted_label}\nTrue: {example['label']}\n")

Sentence: Optus outage causes chaos in Australia before services restored
Prediction: 0
True: 0

Sentence: Australia's CBA says home loan margins have stabilised, shares up
Prediction: 2
True: 2



## Backtest - Trading The News

There are many ways to trade on news. Let's keep it simple: buy and hold on positive news, and sell on negative news.