In [4]:
#%pip install transformers torch

In [11]:
import pandas as pd

# load financial data -- Note we have the following columns: Date, Ticker Pair, BERT Spread, BERT Position
data = pd.read_csv('outputs/bert_dummy.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.rename(columns={"Ticker Pair": "Ticker_Pair"}, inplace=True)
data[["tick1", "tick2"]] = data["Ticker_Pair"].str.split("-", expand=True)
# making binary labels for the position
data['Label'] = data['BERT Position'].apply(lambda x: 1 if x == 'Buy' else 0)
# create text description for BERT to process
data['texts'] = [f"The spread for {row['Ticker_Pair']} on {row['Date'].strftime('%Y-%m-%d')} is {row['BERT Spread']}" for index, row in data.iterrows()]


data.head()

Unnamed: 0,Date,Ticker_Pair,BERT Spread,BERT Position,tick1,tick2,Label,texts
0,2019-02-28,AMD-NVDA,-1.24582,Buy,AMD,NVDA,1,The spread for AMD-NVDA on 2019-02-28 is -1.24...
1,2019-03-31,AMD-NVDA,-1.004199,Buy,AMD,NVDA,1,The spread for AMD-NVDA on 2019-03-31 is -1.00...
2,2019-04-30,AMD-NVDA,-1.52541,Buy,AMD,NVDA,1,The spread for AMD-NVDA on 2019-04-30 is -1.52...
3,2019-05-31,AMD-NVDA,2.611275,Sell,AMD,NVDA,0,The spread for AMD-NVDA on 2019-05-31 is 2.611...
4,2019-06-30,AMD-NVDA,0.882452,Sell,AMD,NVDA,0,The spread for AMD-NVDA on 2019-06-30 is 0.882...


In [None]:
# import the bert tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score
import numpy as np


texts = data['texts'].values
labels = data['Label'].values
print(f'Data Loaded: {len(data)} rows')
print(f'Label distribution {sum(labels)} Buy, {len(labels) - sum(labels)} Sell')


# splitting the data into train and test splits
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels)
print(f'Train Size: {len(train_texts)}, Test Size: {len(test_texts)}')


# tokenize and encode the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenize training data
train_encodings = tokenizer(
    list(train_texts), padding=True, truncation=True, return_tensors='pt', max_length=128
)

# tokenize test data
test_encodings = tokenizer(
    list(test_texts), padding=True, truncation=True, return_tensors='pt', max_length=128
)

# create a custom dataset
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_labels))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_labels))

# create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



# loading the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# fine tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) # learning rate

# epochs means one complete pass through the entire training dataset. 
# epoch 1: the model sees all training data once and makes initial adjustments
# epoch 2: the model sees the data again, learns from its previous mistakes, and makes further adjustments
# epoch 3: the model sees the data a third time, refines its understanding, and makes final adjustments
# In this case, we will use 3 epochs for training.
num_epochs = 3 

model.train()
for epoch in range(num_epochs): # number of epochs
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch] # move to GPU if available
        optimizer.zero_grad() # clear previous gradients
        # forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        # backward pass
        loss.backward()
        optimizer.step()

    # print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(train_loader):.4f}")


model.eval()
all_preds = [] 
all_labels = [] 


# Generate Predictions
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, batch_labels = [b.to(device) for b in batch] # move to GPU if available
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1) # get the predicted labels
        
        # found that converting to numpy array works better for metrics
        all_preds.extend(predictions.cpu().numpy()) 
        all_labels.extend(batch_labels.cpu().numpy())

# Convert lists numpy arrays for metrics
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Calculate metrics 
mse = mean_squared_error(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='weighted')
r2 = r2_score(all_labels, all_preds)

# evaluate performance
mse = mean_squared_error(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds, average='weighted')
r2 = r2_score(all_labels, all_preds)
print(f"Mean Squared Error: {mse}")
print(f"F1 Score: {f1}")
print(f"R^2 Score: {r2}")


Data Loaded: 45 rows
Label distribution 21 Buy, 24 Sell
Train Size: 36, Test Size: 9


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3, Average Loss: 0.0000
Epoch 2/3, Average Loss: 0.0000
Epoch 3/3, Average Loss: 0.0000
Mean Squared Error: 0.4444444444444444
F1 Score: 0.39682539682539686
R^2 Score: -0.7999999999999998
