In [4]:
#%pip install transformers torch

In [None]:

# import the bert tokenizer
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
import yfinance as yf
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import time

# load financial data
def load_data(ticker1, ticker2, start_date, end_date):
    # Helper function to retry downloads
    def download_retry(ticker, max_retries=5, delay=5):
        attempts = 0
        while attempts < max_retries:
            try:
                data = yf.download(ticker, start=start_date, end=end_date)
                if not data.empty:
                    return data
                else:
                    print(f"Empty data returned for {ticker}, retrying...")
            except Exception as e:
                print(f"Error downloading {ticker}: {e}")
            
            attempts += 1
            if attempts < max_retries:
                print(f"Waiting {delay} seconds before retry {attempts+1}/{max_retries}")
                time.sleep(delay)
        
        raise ValueError(f"Failed to download data for {ticker} after {max_retries} attempts")
    
    # download the data retrying if necessary
    print(f"Downloading data for {ticker1}...")
    data1 = download_retry(ticker1)
    print(f"Downloading data for {ticker2}...")
    data2 = download_retry(ticker2)
    
    # Align the data by date
    common_dates = data1.index.intersection(data2.index)
    data1 = data1.loc[common_dates]
    data2 = data2.loc[common_dates]
    
    # Combine the data into a single DataFrame
    data = pd.DataFrame({
        'Date': common_dates,
        'Spread': data1['Adj Close'] - data2['Adj Close']
    }).set_index('Date')
    
    print(f"Successfully loaded spread data with {len(data)} rows")
    return data

# example data
data = load_data('KO', 'PEP', '2022-01-01', '2023-01-01')

# prepare data for BERT
data["Spread_Change"] = data["Spread"].diff().fillna(0)
texts = data["Spread_Change"].astype(str).tolist()
labels = (data["Spread_Change"] > 0).astype(int).tolist() # 1 for positive change, 0 for negative change

# tokenize and encode the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt', max_length=128)
labels = torch.tensor(labels)

# loading the BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# fine tune the model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5) # learning rate
model.train()

for epoch in range(3): # number of epochs
    optimizer.zero_grad() # clear previous gradients
    outputs = model(**inputs, labels=labels) # forward pass
    loss = outputs.loss # compute the loss
    loss.backward() # backpropagation
    optimizer.step() # update the weights
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}") # print the loss

# Generate Predictions
model.eval() # set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs) # forward pass
    predictions = torch.argmax(outputs.logits, dim=1) # get the predicted labels

# evaluate performance
mse = mean_squared_error(labels, predictions)
r2 = r2_score(labels, predictions)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Downloading data for KO...


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['KO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Empty data returned for KO, retrying...
Waiting 5 seconds before retry 2/5


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['KO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Empty data returned for KO, retrying...
Waiting 5 seconds before retry 3/5


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['KO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Empty data returned for KO, retrying...
Waiting 5 seconds before retry 4/5


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['KO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Empty data returned for KO, retrying...
Waiting 5 seconds before retry 5/5


[*********************100%***********************]  1 of 1 completed

1 Failed download:
['KO']: YFRateLimitError('Too Many Requests. Rate limited. Try after a while.')


Empty data returned for KO, retrying...


ValueError: Failed to download data for KO after 5 attempts