## In this file, we try to predict spread
We are using BERT model to predict the spread of the assets with the following steps:
- Using spreads.csv -- has columns: 'Date', 'Ticker Pair', 'Spread', and 'Return'
- Making a texts column takin  

Desired output format: 'Date', 'Ticker Pair', 'Spread', 'Return'


In [None]:
#%pip install transformers torch

In [34]:
import pandas as pd

# load financial data -- Note we have the following columns: Date, Ticker Pair, BERT Spread, BERT Position
data = pd.read_csv('outputs/spreads.csv')
data['Date'] = pd.to_datetime(data['Date'])
data.rename(columns={"Ticker Pair": "Ticker_Pair"}, inplace=True)
data[["tick1", "tick2"]] = data["Ticker_Pair"].str.split("-", expand=True)
# making binary labels for the position, 'Buy' if spread is negatvie, 'Sell' if spread is positive
# data['BERT Position'] = data['Spread'].apply(lambda x: 'Buy' if x < 0 else 'Sell')
# create text description for BERT to process
data['texts'] = [f"The spread for {row['Ticker_Pair']} on {row['Date'].strftime('%Y-%m-%d')} is {row['Spread']}" for index, row in data.iterrows()]


data.head()

Unnamed: 0,Date,Ticker_Pair,Spread,Return,tick1,tick2,texts
0,2019-02-28,AMD-NVDA,-1.319545,,AMD,NVDA,The spread for AMD-NVDA on 2019-02-28 is -1.31...
1,2019-03-31,AMD-NVDA,-2.016619,-0.079436,AMD,NVDA,The spread for AMD-NVDA on 2019-03-31 is -2.01...
2,2019-04-30,AMD-NVDA,-1.31197,0.07466,AMD,NVDA,The spread for AMD-NVDA on 2019-04-30 is -1.31...
3,2019-05-31,AMD-NVDA,1.456129,0.242786,AMD,NVDA,The spread for AMD-NVDA on 2019-05-31 is 1.456...
4,2019-06-30,AMD-NVDA,0.568122,-0.104398,AMD,NVDA,The spread for AMD-NVDA on 2019-06-30 is 0.568...


In [None]:
# import the bert tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, f1_score
import numpy as np


texts = data['texts'].values
spread = data['Spread'].values
print(f'Data Loaded: {len(data)} rows')
print(f'Spread distribution {sum(spread)}, {len(spread) - sum(spread)}')


# splitting the data into train and test splits
train_texts, test_texts, train_spreads, test_spreads = train_test_split(
    texts, spread, test_size=0.2, random_state=42, stratify=spread)
print(f'Train Size: {len(train_texts)}, Test Size: {len(test_texts)}')


# tokenize and encode the data using a pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# tokenize training data
train_encodings = tokenizer(
    list(train_texts), padding=True, truncation=True, return_tensors='pt', max_length=128
)

# tokenize test data
test_encodings = tokenizer(
    list(test_texts), padding=True, truncation=True, return_tensors='pt', max_length=128
)

# create a custom dataset
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], torch.tensor(train_spreads))
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], torch.tensor(test_spreads))

# create data loaders
batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



# loading the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_spreads=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# fine tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) # learning rate

# epochs means one complete pass through the entire training dataset. 
# epoch 1: the model sees all training data once and makes initial adjustments
# epoch 2: the model sees the data again, learns from its previous mistakes, and makes further adjustments
# epoch 3: the model sees the data a third time, refines its understanding, and makes final adjustments
# In this case, we will use 3 epochs for training.
num_epochs = 3 

model.train()
for epoch in range(num_epochs): # number of epochs
    total_loss = 0
    for batch in train_loader:
        input_ids, attention_mask, batch_spreads = [b.to(device) for b in batch] # move to GPU if available
        optimizer.zero_grad() # clear previous gradients
        # forward pass
        outputs = model(input_ids, attention_mask=attention_mask, spread=batch_spreads)
        loss = outputs.loss
        # backward pass
        loss.backward()
        optimizer.step()

    # print epoch loss
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(train_loader):.4f}")


model.eval()
all_preds = [] 
all_spreads = [] 


# Generate Predictions
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, batch_spreads = [b.to(device) for b in batch] # move to GPU if available
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1) # get the predicted spreads
        
        # found that converting to numpy array works better for metrics
        all_preds.extend(predictions.cpu().numpy()) 
        all_spreads.extend(batch_spreads.cpu().numpy())

# Convert lists numpy arrays for metrics
all_preds = np.array(all_preds)
all_spreads = np.array(all_spreads)

# Calculate metrics 
mse = mean_squared_error(all_spreads, all_preds)
f1 = f1_score(all_spreads, all_preds, average='weighted')
r2 = r2_score(all_spreads, all_preds)

# evaluate performance
mse = mean_squared_error(all_spreads, all_preds)
f1 = f1_score(all_spreads, all_preds, average='weighted')
r2 = r2_score(all_spreads, all_preds)
print(f"Mean Squared Error: {mse}")
print(f"F1 Score: {f1}")
print(f"R^2 Score: {r2}")



Data Loaded: 45 rows
Spread distribution -1.1546319456101628e-14, 45.000000000000014


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [None]:
# I want to compare the predictions with the actual data
print("Predictions vs Actual")
print("===================================")
print(f"Predictions: {all_preds[:10]}")
print(f"Labels:      {all_spreads[:10]}")
print("===================================")

Predictions vs Actual
Predictions: [0 0 0 0 0 0 0 0 0]
Labels:      [0 0 0 0 1 1 1 1 0]
