In [1]:
import pandas as pd
from dateutil.relativedelta import relativedelta



df = pd.read_csv("outputs/spreads_weekly_large.csv")


df["formatted_date"] = pd.to_datetime(df["Date"]).dt.strftime("on %B %-d, %Y")
df.rename(columns={"Ticker Pair": "Ticker_Pair"}, inplace=True)


df[["tick1", "tick2"]] = df["Ticker_Pair"].str.split("-", expand=True)


df["chronobert_text"] = (
    df["formatted_date"] + ", the pairwise spread between " +
    df["tick1"] + " and " + df["tick2"] +
    " closed at " + df["Spread"].astype(str) + "."
)


df.drop(columns=["formatted_date", "tick1", "tick2"], inplace=True)
df.tail()

Unnamed: 0,Date,Ticker_Pair,Spread,Return,chronobert_text
1555,2018-12-02,TSLA-NVDA,-0.145262,-0.052578,"on December 2, 2018, the pairwise spread betwe..."
1556,2018-12-09,TSLA-NVDA,0.07863,0.118171,"on December 9, 2018, the pairwise spread betwe..."
1557,2018-12-16,TSLA-NVDA,0.131947,0.02948,"on December 16, 2018, the pairwise spread betw..."
1558,2018-12-23,TSLA-NVDA,0.110505,-0.010357,"on December 23, 2018, the pairwise spread betw..."
1559,2018-12-30,TSLA-NVDA,0.132623,0.012605,"on December 30, 2018, the pairwise spread betw..."


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder


device = 'cpu'  
model_name = "manelalab/chrono-bert-v1-19991231"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)
model.eval()



def embed_text(texts, model, tokenizer, device='cpu'):
    """Convert a list of sentences into [CLS] embeddings."""
    embeddings = []
    for text in tqdm(texts, desc="Embedding"):
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        cls = outputs.last_hidden_state[:, 0, :]  
        embeddings.append(cls.squeeze().cpu().numpy())
    
    return np.stack(embeddings)


def predict_text_spread(text, model, tokenizer, model_reg, pair_str=None, encoder=None, device='cpu'):
    """Predict the spread for a single new sentence, with optional pair encoding and fallback."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    embedding = outputs.last_hidden_state[:, 0, :].squeeze().cpu().numpy()

    if pair_str and encoder:
        try:
            pair_encoded = encoder.transform([[pair_str]])
        except ValueError:
            
            n_pairs = sum(len(cat) for cat in encoder.categories_)
            pair_encoded = np.zeros((1, n_pairs))
        
        full_embedding = np.concatenate([embedding, pair_encoded.squeeze()])
    else:
        full_embedding = embedding

    return model_reg.predict(full_embedding.reshape(1, -1))[0]




texts = df["chronobert_text"].tolist()
y = df["Spread"].values


pair_encoder = OneHotEncoder(sparse_output=False)
pair_onehot = pair_encoder.fit_transform(df[["Ticker_Pair"]])  


X_embed = embed_text(texts, model, tokenizer, device)


X = np.concatenate([X_embed, pair_onehot], axis=1)


window_size = 10  
predictions, actuals = [], []

for i in range(window_size, len(X)):
    X_train, y_train = X[:i], y[:i]
    X_test, y_test_actual = X[i].reshape(1, -1), y[i]

    model_reg = Ridge(alpha=1.0)
    model_reg.fit(X_train, y_train)

    y_pred = model_reg.predict(X_test)[0]
    predictions.append(y_pred)
    actuals.append(y_test_actual)


mse = mean_squared_error(actuals, predictions)
print(f"Rolling Ridge MSE (ticker-pair aware): {mse:.6f}")
















  from .autonotebook import tqdm as notebook_tqdm
Embedding: 100%|██████████| 1560/1560 [02:52<00:00,  9.03it/s]


Rolling Ridge MSE (ticker-pair aware): 0.242920


In [4]:
import calendar
from datetime import datetime, timedelta


def predict_future_spreads(df, ticker_pair, steps=12, time_unit="day"):
    last_date = pd.to_datetime(df["Date"]).max()
    future_preds = []
    t1, t2 = ticker_pair.split("-")
    pair_str = ticker_pair  

    for i in range(1, steps + 1):
        if time_unit == "day":
            future_date = last_date + timedelta(days=i)
        elif time_unit == "week":
            future_date = last_date + timedelta(weeks=i)
        elif time_unit == "month_end":
            target_month = last_date + relativedelta(months=i)
            last_day = calendar.monthrange(target_month.year, target_month.month)[1]
            future_date = target_month.replace(day=last_day)
        else:
            raise ValueError(f"Unsupported time_unit: {time_unit}")

        date_str = future_date.strftime("%B %-d, %Y")
        future_text = f"on {date_str}, the spread between {t1} and {t2} was 0.0000."

        predicted_spread = predict_text_spread(
            text=future_text,
            model=model,
            tokenizer=tokenizer,
            model_reg=model_reg,
            pair_str=pair_str,
            encoder=pair_encoder,
            device=device
        )

        future_preds.append({
            "date": future_date.strftime("%Y-%m-%d"),
            "ticker_pair": ticker_pair,
            "CHRONOBERT_spread": predicted_spread
        })

    return pd.DataFrame(future_preds)



pairs = ["AMD-META", "NVDA-TSLA", "AMD-NVDA"]
future_all = pd.concat(
    [predict_future_spreads(df, pair, steps=52, time_unit="week") for pair in pairs],
    ignore_index=True
)






In [5]:

future_all.to_csv("outputs/CHRONOBERT_spreads_weekly.csv", index=False)
future_all

Unnamed: 0,date,ticker_pair,CHRONOBERT_spread
0,2019-01-06,AMD-META,1.662015
1,2019-01-13,AMD-META,1.488941
2,2019-01-20,AMD-META,1.582295
3,2019-01-27,AMD-META,1.530764
4,2019-02-03,AMD-META,1.719117
...,...,...,...
151,2019-12-01,AMD-NVDA,0.913022
152,2019-12-08,AMD-NVDA,1.084414
153,2019-12-15,AMD-NVDA,1.017953
154,2019-12-22,AMD-NVDA,1.014233
