In [4]:
#%pip install transformers torch

In [22]:
# import the bert tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


# load financial data -- Note we have the following columns: Date, Ticker Pair, BERT Spread, BERT Position
data = pd.read_csv('outputs/bert_dummy.csv')
data['Date'] = pd.to_datetime(data['Date'])
# making binary labels for the position
data['Label'] = data['BERT Position'].apply(lambda x: 1 if x == 'Buy' else 0)

# create text description for BERT to process
texts = [f"The spread for {row['Ticker Pair']} on {row['Date'].strftime('%Y-%m-%d')} is {row['BERT Spread']}" for index, row in data.iterrows()]
labels = data['Label'].values

print(f'Data Loaded: {len(data)} rows')
print(f'Example Text: {texts[0]}')
print(f'Label distribution {sum(labels)} Buy, {len(labels) - sum(labels)} Sell')


# splitting the data into train and test splits
train_texts, test_texts, tain_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42, satisfy=labels)
print(f'Train Size: {len(train_texts)}, Test Size: {len(test_texts)}')


# tokenize and encode the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)
labels = torch.tensor(labels)

# loading the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# fine tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) # learning rate
model.train()

for epoch in range(3): # number of epochs
    optimizer.zero_grad() # clear previous gradients
    outputs = model(**inputs, labels=labels) # forward pass
    loss = outputs.loss # compute the loss
    loss.backward() # backpropagation
    optimizer.step() # update the weights
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}") # print the loss

# Generate Predictions
model.eval() # set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs) # forward pass
    predictions = torch.argmax(outputs.logits, dim=1) # get the predicted labels

# evaluate performance
mse = mean_squared_error(labels, predictions)
r2 = r2_score(labels, predictions)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Data Loaded: 45 rows
Example Text: The spread for AMD-NVDA on 2019-02-28 is -1.2458198968364615
Label distribution 21 Buy, 24 Sell


TypeError: got an unexpected keyword argument 'satisfy'