# Financial News Sentiment Analysis

In [1]:
import os
import numpy as np
import pandas as pd

from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Data

Dataset source: https://huggingface.co/datasets/financial_phrasebank

Each sentence carries one of three integer sentiment labels:

* 0 (negative)
* 1 (neutral)
* 2 (positive)

In [2]:
# Load the Financial PhraseBank dataset using the 'sentences_50agree'
# configuration, then display the resulting DatasetDict.
fi_data = load_dataset(
    path='financial_phrasebank',
    name='sentences_50agree',
)
fi_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4846
    })
})

In [3]:
# Materialize the dataset's only split ('train') as a DataFrame for
# quick inspection, and preview the first rows.
df = pd.DataFrame(data=fi_data['train'])
df.head(n=5)

Unnamed: 0,sentence,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


In [4]:
# Print the fraction of rows carrying each label (0, 1, 2) as a small
# fixed-width table: a header row, a rule, then one row of shares.
headers = ('Negative', 'Neutral', 'Positive')
print(''.join(title.ljust(15) for title in headers))
print('-' * 45)

shares = [len(df[df['label'] == label_id]) / len(df) for label_id in (0, 1, 2)]
print(''.join(f"{share:.3g}".ljust(15) for share in shares))

Negative       Neutral        Positive       
---------------------------------------------
0.125          0.594          0.281          


### Split Dataset into Train and Test

In [5]:
# Hold out 20% of the sentences for evaluation. The fixed seed keeps the
# shuffle, and therefore the split, reproducible across runs.
# NOTE: this rebinds `fi_data` from the loaded dataset to the split result,
# so re-run the loading cell before re-running this one; otherwise the
# training split itself would be split again.
fi_data = fi_data['train'].train_test_split(test_size=0.2, shuffle=True, seed=123)
train_data = fi_data['train']
# The held-out split must come from 'test'; taking 'train' here would make
# every evaluation run on the same rows the model was trained on.
test_data = fi_data['test']
fi_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3876
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 970
    })
})

### Preprocessing

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def preprocess_function(examples):
    """Tokenize the "sentence" field of a batch of examples.

    Called through ``Dataset.map(..., batched=True)`` below, so ``examples``
    is a batch rather than a single row. Truncation is enabled; padding is
    left to the data collator.
    """
    sentences = examples["sentence"]
    return tokenizer(sentences, truncation=True)


# Tokenize the training split first, then the test split.
tok_train, tok_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

# Pads each batch when it is collated and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

In [7]:
# Display the tokenized training split to check the columns added by tokenization.
tok_train

Dataset({
    features: ['sentence', 'label', 'input_ids', 'attention_mask'],
    num_rows: 3876
})

## Finetuning DistilBERT

In [8]:
# Load the pretrained DistilBERT checkpoint with a sequence-classification
# head sized for the three sentiment labels (0, 1, 2).
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=3,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class for each
            example is the argmax over the last axis of ``logits``.

    Returns:
        dict: ``{"accuracy": ..., "f1": ...}`` where ``"f1"`` is the
        macro-averaged F1 score across the classes.
    """
    # Use the `evaluate` package (already imported by the notebook) rather
    # than the deprecated `datasets.load_metric`.
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy = accuracy_metric.compute(
        predictions=predictions, references=labels
    )["accuracy"]
    # The F1 metric defaults to average="binary", which raises an error for
    # multiclass targets such as the three sentiment labels used here.
    # Macro averaging weights every class equally, so the small negative
    # class is not drowned out by the dominant neutral class.
    f1 = f1_metric.compute(
        predictions=predictions, references=labels, average="macro"
    )["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [10]:
# Hyperparameters for a short fine-tuning run on CPU. A checkpoint is
# written to the current directory at the end of each epoch.
training_args = TrainingArguments(
    output_dir="./",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    use_cpu=True,
)

# Bundle the model, the tokenized splits, the padding collator and the
# metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tok_train,
    eval_dataset=tok_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [11]:
# Run fine-tuning. Training is configured with use_cpu=True, and the recorded
# run below progressed at roughly 3.3 s/step before being interrupted manually.
trainer.train()

  0%|          | 0/486 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 20%|█▉        | 96/486 [05:28<21:26,  3.30s/it]

KeyboardInterrupt: 