## Predict sentiment using FinBERT (Araci, 2019)

A pre-trained BERT model with a classification layer, fine-tuned for sentiment classification on the Financial PhraseBank data.

## Intro

Import and load packages

In [None]:
from psutil import virtual_memory

# Total memory reported for this runtime, converted from bytes to decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

# Runtimes reporting 20 GB or more are treated as "high-RAM".
print('You are using a high-RAM runtime!' if ram_gb >= 20 else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [None]:
pip install transformers datasets ray

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 4.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-1.18.3-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 72.0 MB/s 
[?25hCollecting ray
  Downloading ray-1.10.0-cp37-cp37m-manylinux2014_x86_64.whl (59.6 MB)
[K     |████████████████████████████████| 59.6 MB 1.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 16.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 36.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 10.2 M

In [None]:
import tensorflow as tf
from sklearn.metrics import accuracy_score
import pandas as pd
from sklearn.metrics import confusion_matrix
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
import torch
from transformers import (BertForSequenceClassification, AdamW, BertConfig, AutoModelForSequenceClassification, 
                          AutoTokenizer, Trainer, TrainingArguments)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
import random
from transformers import get_linear_schedule_with_warmup
import time
import datetime
from datasets import list_datasets, load_dataset, Dataset, load_metric
from pprint import pprint
from transformers import Trainer, TrainingArguments
from google.colab import files, drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch import nn

## Data

Connect to Drive

In [None]:
# Mount Google Drive at /drive; the input workbook is read from there and the
# labelled output is written back to it.
drive.mount('/drive')

Mounted at /drive


Load text data

In [None]:
# Load the text data from the Excel workbook on Drive; the first spreadsheet column becomes the index.
# Author's row-count note: 149654 samples previously vs 149609 samples now.
data = pd.read_excel("/drive/My Drive/Colab Notebooks/remainder_data_daily_100.xlsx", index_col=0)

In [None]:
# Keep rows whose text is an actual string and whose date lies in
# [2019-08-01, 2021-01-01), then drop duplicate rows and rows with missing
# values and renumber the index from zero.
data = (
    data
    .loc[lambda frame: frame['text'].apply(lambda value: isinstance(value, str))]
    .loc[lambda frame: (frame['date'] >= "2019-08-01") & (frame['date'] < "2021-01-01")]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Keep only rows whose text is a real string. `.copy()` makes the result an
# independent frame, so adding a column below no longer raises pandas'
# SettingWithCopyWarning.
data = data.loc[data['text'].apply(lambda x: isinstance(x, str))].copy()

# Placeholder label (all zeros): the dataset wrapper used for prediction expects
# one label per sample, but no ground truth exists for this data.
data['label'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
# Preview the filtered frame.
# NOTE(review): the recorded output below lists rows dated 2022-02-21, which the
# `date < "2021-01-01"` filter above would exclude — the output looks stale; re-run to confirm.
data

Unnamed: 0,source,cryptocurrency,date,text,label
0,news,BTC,2019-08-01,Fundstrat Co-Founder Tom Lee: US Fed Rate Cuts...,0
1,news,BTC,2019-08-01,Andorra Telecom Inks New Partnership to Run Bl...,0
2,news,BTC,2019-08-01,Application of Transaction Mining in Online Ga...,0
3,news,BTC,2019-08-01,"Blockchain Adoption Takes Off in Airlines, Avi...",0
4,news,BTC,2019-08-01,Kraken Exchange Acquires Service Provider for ...,0
...,...,...,...,...,...
182927,news,ETH,2022-02-21,Sipher’s Alain Dinh on What’s Next for NFT Gam...,0
182928,news,ETH,2022-02-21,Pocket Network becomes primary provider for Fu...,0
182929,news,ETH,2022-02-21,"$250K bounty 'not too low to be insulting,' sa...",0
182930,news,ETH,2022-02-21,NFTs and social capital: How projects are coll...,0


In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import recall_score, precision_score, f1_score

## Setup

Define the labels, load the FinBERT model from the HuggingFace transformers library

In [None]:
def num_labels(data):
    """Convert sentiment label names to integer class ids.

    Mapping: "negative" -> 0, "neutral" -> 1, "positive" -> 2. Values that are
    not one of these three names are passed through unchanged and must be
    convertible with ``int`` (e.g. labels that are already numeric).

    Args:
        data: Array-like of labels (pandas Series, NumPy array, list or tuple).

    Returns:
        numpy.ndarray of ints with the same shape as ``data``.

    Raises:
        ValueError: If a string value is neither a known label name nor
            int-convertible.
    """
    label_to_id = {"negative": 0, "neutral": 1, "positive": 2}

    # Object dtype keeps strings and numbers side by side, so the result does
    # not depend on NumPy's string/integer promotion rules.
    values = np.asarray(data, dtype=object)
    encoded = [
        label_to_id.get(value, value) if isinstance(value, str) else value
        for value in values.ravel()
    ]

    return np.array(encoded, dtype=object).astype(int).reshape(values.shape)

# Hugging Face Hub id of the FinBERT checkpoint used in this notebook.
model_name = "ProsusAI/finbert"
# Load the sequence-classification model and the matching tokenizer for that checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/418M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Sanity check: encode a short crypto sentence and map the ids back to tokens to
# see how the tokenizer splits it.
tokenizer.convert_ids_to_tokens(tokenizer.encode("bitcoin is not a shitcoin"))

['[CLS]',
 'bit',
 '##co',
 '##in',
 'is',
 'not',
 'a',
 'shit',
 '##co',
 '##in',
 '[SEP]']

Create text dataset object for faster fitting (using GPU)

In [None]:

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            per-sample sequence, such as the result of calling the tokenizer
            on a list of texts.
        labels: Optional sequence holding one label per sample. May be left
            as ``None`` for pure inference; items then carry no ``labels`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, one entry per feature."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of samples."""
        if self.labels is not None:
            return len(self.labels)
        # Without labels the size is read from the first encoded feature.
        first_feature = next(iter(self.encodings.values()), None)
        return 0 if first_feature is None else len(first_feature)

# accuracy as metric
# NOTE(review): `metric` is not referenced anywhere else in the visible notebook —
# compute_metrics() below relies on scikit-learn scorers instead; confirm it is still needed.
metric = load_metric("accuracy")
 
# Factory passed to the Trainer (`model_init=`) so it can build its own FinBERT model.
def model_init():
    """Return a new sequence-classification model loaded from `model_name`."""
    return AutoModelForSequenceClassification.from_pretrained(model_name)

# Metrics callback for the Trainer.
def compute_metrics(p):
    """Score predictions against labels.

    Args:
        p: Pair ``(predictions, labels)``; the predicted class is the argmax of
            ``predictions`` along axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1";
        precision, recall and F1 are micro-averaged.
    """
    scores_per_class, labels = p
    predicted = np.argmax(scores_per_class, axis=1)

    results = {"accuracy": accuracy_score(y_true=labels, y_pred=predicted)}
    # NOTE(review): for single-label multiclass data, micro-averaged scores are
    # expected to coincide with accuracy — confirm 'micro' is the intended average.
    for name, scorer in (("precision", precision_score),
                         ("recall", recall_score),
                         ("f1", f1_score)):
        results[name] = scorer(y_true=labels, y_pred=predicted, average='micro')
    return results

## Predict

Define the model using a trainer

In [None]:
# Hyper-parameters for the HuggingFace Trainer; './results' is the output directory.
training_args = TrainingArguments(
    output_dir='./results',
    seed=20,
    # --- optimisation ---
    learning_rate=3.607459472971186e-05,   # an earlier trial used 1e-2
    weight_decay=0.03,                     # strength of weight decay
    warmup_steps=0,                        # no warm-up for the learning-rate scheduler
    num_train_epochs=4,                    # total number of training epochs
    per_device_train_batch_size=32,        # batch size per device during training
    # --- evaluation / checkpointing ---
    evaluation_strategy="epoch",           # evaluate at the end of each epoch
    save_total_limit=1,                    # keep one checkpoint; older ones are deleted
)

# NOTE(review): `data` is the pandas DataFrame loaded above, not a tokenized
# dataset. The visible notebook only calls trainer.predict() with its own
# dataset, so these two arguments are never iterated here — confirm before
# calling trainer.train() or trainer.evaluate().
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data,
    eval_dataset=data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/ProsusAI/finbert/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2120f4f96b5830e5a91fe94d242471b0133b0976c8d6e081594ab837ac5f17bc.ef97278c578016c8bb785f15296476b12eae86423097fed78719d1c8197a3430
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermedi

Encode the test data, predict using FinBERT

In [None]:
# Tokenize every text, truncating or padding each one to exactly 32 tokens.
test_encodings = tokenizer(data['text'].tolist(), truncation=True, padding='max_length', max_length=32)
# The 'label' column is the all-zero placeholder set earlier; it is passed only so
# each dataset item carries a 'labels' entry. Metrics computed against it are meaningless.
test_labels = data['label'].tolist()  # num_labels(test.prediction).tolist()
test_dataset = TextDataset(test_encodings, test_labels)

# trainer.predict returns a (predictions, label_ids, metrics) named tuple. Only the
# predictions are needed; reading the field by name avoids unpacking into `__`,
# which would overwrite IPython's output-history shortcut of the same name.
prediction_output = trainer.predict(test_dataset)
raw_pred = prediction_output.predictions
y_pred = np.argmax(raw_pred, axis=1)

TypeError: ignored

In [None]:
# Peek at the raw model outputs for class index 0 (first column of the predictions).
raw_pred[:,0]

array([ 1.8990431 ,  1.7718022 , -0.37985763,  0.09844496,  0.7437015 ,
        1.0239779 , -1.4079612 ,  1.9340616 ,  0.26231498,  1.6981953 ],
      dtype=float32)

Save the results as FinBERT output

In [None]:
# The column order of `raw_pred` follows the model's own id2label mapping. For
# ProsusAI/finbert the config reports 0 = positive, 1 = negative, 2 = neutral,
# so each class index is looked up by name instead of being hard-coded.
label_to_index = {
    str(label).lower(): int(index)
    for index, label in trainer.model.config.id2label.items()
}
missing_labels = {'positive', 'negative', 'neutral'} - set(label_to_index)
if missing_labels:
    # Fail loudly rather than writing scores under the wrong column names.
    raise ValueError(
        f"Model config does not name the classes {sorted(missing_labels)}; "
        f"id2label is {trainer.model.config.id2label}"
    )

data['pred_finbert'] = y_pred  # predicted class index; interpret via the model's id2label
data['positive_finbert'] = raw_pred[:, label_to_index['positive']]
data['negative_finbert'] = raw_pred[:, label_to_index['negative']]
data['neutral_finbert'] = raw_pred[:, label_to_index['neutral']]

In [None]:
# Store the predicted class index under a second column name and discard the
# placeholder 'label' column.
data = data.assign(y_pred_finbert=y_pred).drop(columns=['label'])

In [None]:
data

Unnamed: 0,source,cryptocurrency,date,text,pred_finbert,positive_finbert,negative_finbert,neutral_finbert,y_pred_finbert
0,news,BTC,2019-08-01,Fundstrat Co-Founder Tom Lee: US Fed Rate Cuts...,0,-1.382105,1.899043,-1.758253,0
1,news,BTC,2019-08-01,Andorra Telecom Inks New Partnership to Run Bl...,0,0.251928,1.771802,-2.853214,0
2,news,BTC,2019-08-01,Application of Transaction Mining in Online Ga...,2,2.445175,-0.379858,-1.746230,2
3,news,BTC,2019-08-01,"Blockchain Adoption Takes Off in Airlines, Avi...",2,2.099126,0.098445,-2.048443,2
4,news,BTC,2019-08-01,Kraken Exchange Acquires Service Provider for ...,2,1.972164,0.743702,-2.631175,2
...,...,...,...,...,...,...,...,...,...
60770,twitter,ETH,2020-12-31,It's new year's eve.\n\nCrypto has come a long...,2,2.201941,-0.496605,-1.262185,2
60771,twitter,ETH,2020-12-31,It took me ~700 hours to paint this modern ver...,2,2.156669,-0.103092,-1.819631,2
60772,twitter,ETH,2020-12-31,Everyone bearish on ETH in 2020 was wrong. ETH...,0,-1.468105,1.630334,-1.332306,0
60773,twitter,ETH,2020-12-31,\n\n1. Lawsuit will be good for all of crypto....,0,0.389441,1.342949,-2.632306,0


In [None]:
# NOTE(review): `test` is not defined anywhere in the visible notebook (it only appears in a
# comment in the prediction cell), and the 'label' column of `data` has already been dropped,
# so this cell presumably depends on state from another session — confirm before relying on it.
print("Accuracy - ZSC & FinBERT Labels:", accuracy_score(num_labels(test.label).tolist(), y_pred))

Accuracy - ZSC Labels: 0.7144329896907217
Accuracy - Actual Labels: 0.6876288659793814


In [None]:
# Repeats the Drive mount from the Data section, presumably so the export below
# also works when this part of the notebook is run on its own.
drive.mount('/drive')

Drive already mounted at /drive; to attempt to forcibly remount, call drive.mount("/drive", force_remount=True).


In [None]:
# Export the texts together with the FinBERT prediction columns to an Excel file on Drive.
data.to_excel('/drive/My Drive/Colab Notebooks/data_finbert_labeled.xlsx', sheet_name = 'test_output')

In [None]:
# Disabled on purpose: un-comment to save the trainer's current model to Drive.
#trainer.save_model('/drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens')

Saving model checkpoint to /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens
Configuration saved in /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens/config.json
Model weights saved in /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens/pytorch_model.bin
tokenizer config file saved in /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens/tokenizer_config.json
Special tokens file saved in /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens/special_tokens_map.json
added tokens file saved in /drive/My Drive/Colab Notebooks/unfrozen_crypto_newtokens/added_tokens.json


In [None]:
# Load a locally saved classifier from Drive (no Hub download) and wrap it in a
# default Trainer for inference.
# NOTE(review): this rebinds the global `model` and `trainer` that were built from
# ProsusAI/finbert. The recorded config output for this checkpoint shows generic
# LABEL_0..LABEL_2 class names, so its class order may differ from FinBERT's —
# confirm before reusing the prediction cells with it.
model = BertForSequenceClassification.from_pretrained("/drive/My Drive/Colab Notebooks/unfrozen_fin", local_files_only=True)
trainer = Trainer(model=model)
# Follow the device the Trainer selected instead of calling .cuda() unconditionally,
# which raises on a CPU-only runtime.
trainer.model = model.to(trainer.args.device)

loading configuration file /drive/My Drive/Colab Notebooks/unfrozen_fin/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading we