# Fine tune BERT language model for sentiment analysis
## BERT
BERT (Bidirectional Encoder Representations from Transformers) is pre-trained language model, trained on 800M words and has 110M parameters  
Using a larger model to improve the accuracy of sentiment analysis task, which was under 40% from LSTM model trained from scratch

Fine tune the input and output layers to take in headlines news and classify them into negative, natural or positive

Imports

In [1]:
import tensorflow as tf
import tensorflow_hub as hub

from keras.utils import np_utils

import official.nlp.bert.tokenization as tokenization

from official import nlp
import official.nlp.optimization as opt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import numpy as np
import pandas as pd


Load data, data from 'data_processor_catergoical.py'

In [2]:
df = pd.read_csv('cleaned_FB_catergorical.csv')

Split data int training and testing

In [3]:
#Convert into list for model inpuot
x = df.Headlines.values
y = df.Labels.values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20)

Adapted github user's adam0ling - twiter_sentiment "https://github.com/adam0ling/twitter_sentiment/blob/main/3_BERT.ipynb" and updated to version 3 of BERT multi cased

Label Encoding

In [4]:
encoder = LabelEncoder()
encoder.fit(y)
encoded_y_test = encoder.transform(y_test)
encoder_y_train = encoder.transform(y_train)

#Encode (0,1,2) labels into (001, 010, 100)
dummy_y_test = np_utils.to_categorical(encoded_y_test)
dummy_y_train = np_utils.to_categorical(encoder_y_train)

Download Bert layers

In [5]:
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3", #Using cased version, cased words in headlines news carries meaningful insights
                            trainable=True)

Tokenization

In [6]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [7]:
tokenizer.convert_tokens_to_ids(['[CLS]', '[SEP]'])

[101, 102]

Function to tokenize input list

In [8]:
def encode_names(n):
    tokens = list(tokenizer.tokenize(n))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

headlines = tf.ragged.constant([encode_names(n) for n in x_train])

cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*headlines.shape[0]
input_word_ids = tf.concat([cls, headlines], axis=-1)

input_mask = tf.ones_like(input_word_ids).to_tensor()

type_cls = tf.zeros_like(cls)
type_headline = tf.ones_like(headlines)
input_type_ids = tf.concat([type_cls, type_headline], axis=-1).to_tensor()

lens = [len(i) for i in input_word_ids]
max_seq_length = max(lens)
max_seq_length = int(1.5*max_seq_length)

In [9]:
def encode_names(n, tokenizer):
    tokens = list(tokenizer.tokenize(n))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

def bert_encode(string_list, tokenizer, max_seq_length):
    num_examples = len(string_list)

    string_tokens = tf.ragged.constant([
        encode_names(n, tokenizer) for n in np. array(string_list)
    ])

    cls = [tokenizer.convert_tokens_to_ids('[CLS]')]*string_tokens.shape[0]
    input_word_ids = tf.concat([cls, string_tokens], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor(shape=(None, max_seq_length))

    type_cls = tf.zeros_like(cls)
    type_tokens = tf.ones_like(string_tokens)
    input_type_ids = tf.concat(
        [type_cls, type_tokens], axis=-1).to_tensor(shape=(None, max_seq_length))

    inputs = {
        'input_words_ids' : input_word_ids.to_tensor(shape=(None,  max_seq_length)),
        'input_mask' : input_mask,
        'input_type_ids' : input_type_ids
    }

    return inputs

Tokenize training and testing data

In [10]:
X_train = bert_encode(x_train, tokenizer, max_seq_length)
X_test = bert_encode(x_test, tokenizer, max_seq_length)

Set sequence length and number of classes 

In [11]:
num_class = len(encoder.classes_) #3 classes 
max_seq_length = max_seq_length

Fine tune model

In [12]:
#Input layer
encoder_inputs = dict(
    input_word_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
    input_mask=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
    input_type_ids=tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32),
)

outputs = bert_layer(encoder_inputs)

pooled_output = outputs["pooled_output"]
sequence_output = outputs["sequence_output"]

output = tf.keras.layers.Dropout(rate=0.1)(pooled_output)
output = tf.keras.layers.Dense(num_class, activation='softmax', name='output')(output) #Output layer

model = tf.keras.Model(inputs=encoder_inputs, outputs=output) #Fine tuned model

Model training parameters

In [13]:
epochs = 2
batch_size = 10 #Depend on your GPU ram, increase for faster training
eval_batch_size = batch_size

train_data_size = len(dummy_y_test)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

optimizer = nlp.optimization.create_optimizer(2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

Compile model

In [14]:
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

Train model on 2 epochs

In [15]:
history = model.fit(
    X_train,
    dummy_y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, dummy_y_test),
    verbose=1
)

Epoch 1/2
Epoch 2/2


In [None]:
model.save('bert_headline.h5')

# Conclusion
BERT model outperformed LSTM model that was trained from scratch  
BERT's validation accuracy is 53% compared to LSTM validation accruacy of 40%

## Lack of training data
Althought there were only 2,000 training samples, a larger data set could increase the accuracy at the cost of higher training time  
Model was able to learn to classify better than random guessing, having a slight edge could be using in developing a strategy

## To-do:
* Collect more data, instead of collecting headline news of a specific stock  
Collect data of the entire market and train it on an index eg. S&P 500  
* Impelement a mid-low frequency trading strategy  
Softmax activation funcation gives the probability of the output, probability could be used for risk management
