In [7]:
import pandas as pd
from dateutil.relativedelta import relativedelta



df = pd.read_csv("outputs/spreads.csv")


df["formatted_date"] = pd.to_datetime(df["Date"]).dt.strftime("on %B %-d, %Y")
df.rename(columns={"Ticker Pair": "Ticker_Pair"}, inplace=True)


df[["tick1", "tick2"]] = df["Ticker_Pair"].str.split("-", expand=True)


df["chronobert_text"] = (
    df["formatted_date"] + ", the pairwise spread between " +
    df["tick1"] + " and " + df["tick2"] +
    " closed at " + df["Spread"].astype(str) + "."
)


df.drop(columns=["formatted_date", "tick1", "tick2"], inplace=True)
df.tail()

Unnamed: 0,Date,Ticker_Pair,Spread,Return,chronobert_text
40,2019-12-31,NVDA-TSLA,-0.135089,-0.182264,"on December 31, 2019, the pairwise spread betw..."
41,2020-01-31,NVDA-TSLA,-1.808124,-0.550358,"on January 31, 2020, the pairwise spread betwe..."
42,2020-02-29,NVDA-TSLA,-1.397503,0.11619,"on February 29, 2020, the pairwise spread betw..."
43,2020-03-31,NVDA-TSLA,-0.560452,0.1916,"on March 31, 2020, the pairwise spread between..."
44,2020-04-30,NVDA-TSLA,-1.70784,-0.395445,"on April 30, 2020, the pairwise spread between..."


In [8]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder


device = 'cpu'  
model_name = "manelalab/chrono-bert-v1-19991231"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()



def embed_text(texts, model, tokenizer, device='cpu'):
    """Convert a list of sentences into [CLS] embeddings."""
    embeddings = []
    for text in tqdm(texts, desc="Embedding"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        cls = outputs.last_hidden_state[:, 0, :]  
        embeddings.append(cls.squeeze().cpu().numpy())
    
    return np.stack(embeddings)


def predict_text_spread(text, model, tokenizer, model_reg, pair_str=None, encoder=None, device='cpu'):
    """Predict the spread for a single new sentence, with optional pair encoding and fallback."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

    if pair_str and encoder:
        try:
            pair_encoded = encoder.transform([[pair_str]])
        except ValueError:
            
            n_pairs = sum(len(cat) for cat in encoder.categories_)
            pair_encoded = np.zeros((1, n_pairs))
        
        full_embedding = np.concatenate([embedding, pair_encoded.squeeze()])
    else:
        full_embedding = embedding

    return model_reg.predict(full_embedding.reshape(1, -1))[0]




texts = df["chronobert_text"].tolist()
y = df["Spread"].values


pair_encoder = OneHotEncoder(sparse_output=False)
pair_onehot = pair_encoder.fit_transform(df[["Ticker_Pair"]])  


X_embed = embed_text(texts, model, tokenizer, device)


X = np.concatenate([X_embed, pair_onehot], axis=1)


window_size = 10  
predictions, actuals = [], []

for i in range(window_size, len(X)):
    X_train, y_train = X[:i], y[:i]
    X_test, y_test_actual = X[i].reshape(1, -1), y[i]

    model_reg = Ridge(alpha=1.0)
    model_reg.fit(X_train, y_train)

    y_pred = model_reg.predict(X_test)[0]
    predictions.append(y_pred)
    actuals.append(y_test_actual)


mse = mean_squared_error(actuals, predictions)
print(f"Rolling Ridge MSE (ticker-pair aware): {mse:.6f}")
















Embedding: 100%|██████████| 45/45 [00:05<00:00,  7.71it/s]

Rolling Ridge MSE (ticker-pair aware): 0.573311





In [12]:
import calendar

def predict_future_spreads(df, ticker_pair, steps=12, time_unit="day"):
    last_date = pd.to_datetime(df["Date"]).max()
    future_preds = []
    t1, t2 = ticker_pair.split("-")
    pair_str = ticker_pair  

    for i in range(1, steps + 1):
        if time_unit == "day":
            future_date = last_date + timedelta(days=i)
        elif time_unit == "week":
            future_date = last_date + timedelta(weeks=i)
        elif time_unit == "month_end":
            target_month = last_date + relativedelta(months=i)
            last_day = calendar.monthrange(target_month.year, target_month.month)[1]
            future_date = target_month.replace(day=last_day)
        else:
            raise ValueError(f"Unsupported time_unit: {time_unit}")

        date_str = future_date.strftime("%B %-d, %Y")
        future_text = f"on {date_str}, the spread between {t1} and {t2} was 0.0000."

        predicted_spread = predict_text_spread(
            text=future_text,
            model=model,
            tokenizer=tokenizer,
            model_reg=model_reg,
            pair_str=pair_str,
            encoder=pair_encoder,
            device=device
        )

        future_preds.append({
            "date": future_date.strftime("%Y-%m-%d"),
            "ticker_pair": ticker_pair,
            "CHRONOBERT_spread": predicted_spread
        })

    return pd.DataFrame(future_preds)



pairs = ["AMD-META", "NVDA-TSLA", "AMD-NVDA"]
future_all = pd.concat(
    [predict_future_spreads(df, pair, steps=36, time_unit="month_end") for pair in pairs],
    ignore_index=True
)






In [14]:

future_all.to_csv("outputs/CHRONOBERT_spreads.csv", index=False)
future_all

Unnamed: 0,date,ticker_pair,CHRONOBERT_spread
0,2020-05-31,AMD-META,0.920550
1,2020-06-30,AMD-META,0.902704
2,2020-07-31,AMD-META,1.004044
3,2020-08-31,AMD-META,0.873997
4,2020-09-30,AMD-META,0.984757
...,...,...,...
103,2022-12-31,AMD-NVDA,0.579656
104,2023-01-31,AMD-NVDA,0.596021
105,2023-02-28,AMD-NVDA,0.471082
106,2023-03-31,AMD-NVDA,0.396769
