In [None]:
# Reload imported modules before each cell runs, so edits to the project
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Using Article Embeddings to model Asset Returns: Informer

Multistep forecasting.

In [None]:
import os
import sys
from pathlib import Path

sys.path.append(
    Path.cwd().parents[0].as_posix()
)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.optim as optim
from accelerate import Accelerator
from datasets import Dataset, DatasetDict
from gluonts.dataset.field_names import FieldName
from gluonts.dataset.common import ListDataset
from gluonts.dataset.loader import as_stacked_batches
from gluonts.time_feature import time_features_from_frequency_str
from gluonts.transform import (
    AddAgeFeature, 
    AddTimeFeatures, 
    Chain, 
    InstanceSplitter, 
    RenameFields,
    VstackFeatures
)
from gluonts.transform.sampler import ExpectedNumInstanceSampler
from sklearn.manifold import TSNE
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from torch.utils.data import DataLoader
from transformers import InformerConfig, InformerForPrediction

from finnews.datasets import InformerFusionDataset
from finnews.models import InformerFusionModel

In [None]:
# Accelerate picks the compute device for the current environment; `device`
# is reused below as the map_location when the embeddings are loaded.
accelerator = Accelerator()
device = accelerator.device

# Defines

In [None]:
# Folder holding the pre-processed inputs (prices, article metadata, embeddings)
root_input_dir = os.path.join("data", "processed")

# Split boundaries: train is everything before `val_start`, validation is
# [val_start, test_start), test is everything from `test_start` onwards.
train_start, val_start, test_start = "2010-01-01", "2023-06-01", "2023-09-01"

# Number of daily steps the position is held; reused as the forecast horizon
holding_period = 14

# Data

## Raw

In [None]:
# Price history ordered by symbol, then date, with a fresh 0..n-1 index so that
# later positional alignment against derived columns is safe.
prices_path = os.path.join(root_input_dir, "prices.parquet")
df_prices = pd.read_parquet(prices_path).sort_values(
    by=["symbol", "date"], ignore_index=True
)

In [None]:
# Article metadata; only the columns needed downstream are read from disk
articles_path = os.path.join(root_input_dir, "articles.parquet")
df_articles_meta = pd.read_parquet(articles_path, columns=["date", "symbol", "article"])

In [None]:
# Pre-computed article embeddings, loaded straight onto the selected device.
# weights_only=True restricts unpickling to tensors/primitive containers.
embeddings_path = os.path.join(root_input_dir, "embeddings.pt")
articles = torch.load(embeddings_path, map_location=device, weights_only=True)

In [None]:
# There must be exactly one embedding row per article metadata row
n_meta_rows = df_articles_meta.shape[0]
n_embeddings = articles.shape[0]
assert n_meta_rows == n_embeddings

## Returns

Open-to-open, one-day-ahead log returns:

- Strategy implies we buy and sell on market open (simplifying assumption)
- Easier for multiscale (just add up returns)
- Returns on non-business days are 0 (model will handle this via time embeddings)
- Trimming to coincide with the start of articles

In [None]:
# Log open price on the day we buy (today) ...
log_open_today = np.log(df_prices["open"])
# ... and on the day we sell (the next available row of the same symbol).
# The last row of every symbol has no successor, so its return is NaN here.
log_open_next = np.log(df_prices.groupby('symbol')["open"].shift(-1))

df_ret = df_prices[["date", "symbol"]].copy()
df_ret["ret"] = log_open_next - log_open_today

In [None]:
# Adding non-business days and setting their returns to 0
df_ret = df_ret.set_index(
    ['symbol', 'date']
).sort_index().groupby(
    level="symbol"
).apply(
    lambda x: x.reset_index(
        level="symbol", 
        drop=True
    ).reindex(
        pd.date_range(
            start=x.index.get_level_values("date").min(),
            end=x.index.get_level_values("date").max(),
        ),
        level="date"
    ),
).fillna(0)

df_ret.index.rename([df_ret.index.names[0], "date"], inplace=True)
df_ret = df_ret.reset_index()

In [None]:
# Trimming to articles: earliest article date per symbol
start_dates = df_articles_meta.groupby("symbol")["date"].min().to_dict()


def _from_first_article(symbol_rows):
    """Keep the rows dated on or after the symbol's first article.

    Raises KeyError for a symbol that has returns but no articles.
    """
    return symbol_rows.loc[symbol_rows["date"] >= start_dates[symbol_rows.name]]


df_ret = (
    df_ret
    .groupby('symbol', as_index=True)
    .apply(_from_first_article, include_groups=False)
    # Drop the original row labels, keeping only the symbol level
    .droplevel(level=-1)
    .reset_index()
)

## Articles

- Multiple articles on any given day are averaged
- Timestamps are shifted forward to align with returns
- Zero embedding is used to denote days where there were no news articles

In [None]:
# For every (date, symbol) pair, collect the index labels of its articles in
# df_articles_meta; they are used later to index into `articles`.
article_rows_per_day = df_articles_meta.groupby(["date", "symbol"]).apply(
    lambda grp: grp.index.tolist(),
    include_groups=False
)

df_articles_meta_agg = (
    article_rows_per_day
    .to_frame("indices")
    .reset_index()
    .sort_values(["symbol", "date"], ignore_index=True)
)

# We can respond to today's news tomorrow
df_articles_meta_agg["date"] = df_articles_meta_agg["date"] + pd.Timedelta("1d")

# Conforming to df_ret: one row per (symbol, date) of df_ret, in df_ret's order.
# Days without articles end up with NaN in "indices".
ret_index = df_ret.set_index(["symbol", "date"]).index
df_articles_meta_agg = df_articles_meta_agg.set_index(
    ["symbol", "date"]
).reindex(ret_index)

In [None]:
# The aggregated article metadata is later consumed row-for-row alongside
# df_ret, so enforce the alignment instead of merely displaying the comparison.
assert df_articles_meta_agg.shape[0] == df_ret.shape[0], (
    "df_articles_meta_agg must have exactly one row per row of df_ret"
)

In [None]:
# Placeholder embedding for days without any article. torch.stack copies its
# inputs, so one shared zero tensor can be reused for every such day.
no_news_embedding = torch.zeros_like(articles[0])

# One embedding per row of df_articles_meta_agg:
# - days with news hold a list of row indices into `articles`; their
#   embeddings are averaged;
# - days without news hold the NaN left by the reindex against df_ret and get
#   the zero embedding.
# The branch is a plain conditional, not a bitwise `~` on the `.all()` result,
# which only acts as logical negation while that result is a NumPy bool.
articles_agg = torch.stack(
    [
        no_news_embedding if np.isnan(idx).all() else articles[idx].mean(dim=0)
        for idx in df_articles_meta_agg["indices"].tolist()
    ],
    dim=0
)

# Model Config

In [None]:
# --- Forecast geometry ---
prediction_length = holding_period  # forecast horizon equals the holding period
context_length = 64  # how far back (in steps of `freq`) the article history reaches
freq = "1d"

# --- Target series ---
input_size = 1  # univariate model
lags_sequence = [1]  # not adding lags as additional features
scaling = None  # data is already scaled
distribution_output = "normal"

# --- Network ---
d_model = articles.shape[-1]  # set to the article-embedding width for ease of use
encoder_layers = 4
decoder_layers = 4
dropout = 0.1

# --- Sampling / training ---
num_parallel_samples = 25
epochs = 10  # NOTE: reassigned to 3 in the training section before it is used

In [None]:
# Time-feature functions GluonTS associates with the sampling frequency `freq`;
# only their count is used below, to size `num_time_features`.
time_features = time_features_from_frequency_str(freq)

In [None]:
model_config = InformerConfig(
    # Use the same `prediction_length` that InformerFusionDataset receives, so
    # the model and the data pipeline cannot drift apart if the horizon is
    # changed in the config cell.
    prediction_length=prediction_length,
    context_length=context_length,
    input_size=input_size,
    lags_sequence=lags_sequence,
    scaling=scaling,
    # One slot more than the calendar features - presumably for the age
    # feature added by the data pipeline; TODO confirm against InformerFusionDataset.
    num_time_features=len(time_features) + 1,
    dropout=dropout,
    encoder_layers=encoder_layers,
    decoder_layers=decoder_layers,
    d_model=d_model,
    distribution_output=distribution_output,
    num_parallel_samples=num_parallel_samples
)

# Data Loaders

In [None]:
# Batch size shared by the train, validation and test loaders
batch_size = 64

In [None]:
# The dataset builder is configured with the same hyper-parameters as the model
dataset_kwargs = {
    "input_size": input_size,
    "d_model": d_model,
    "context_length": context_length,
    "prediction_length": prediction_length,
    "dropout": dropout,
    "encoder_layers": encoder_layers,
    "decoder_layers": decoder_layers,
    "lags_sequence": lags_sequence,
    "distribution_output": distribution_output,
    "scaling": scaling,
    "freq": freq,
}
idc = InformerFusionDataset(**dataset_kwargs)

In [None]:
# Chronological split masks over df_ret:
# train = before val_start, validation = [val_start, test_start), test = from test_start
in_train = df_ret["date"] < val_start
in_val = (df_ret["date"] >= val_start) & (df_ret["date"] < test_start)
in_test = (df_ret["date"] >= test_start)

# Every loader receives the full `articles_agg`; only the returns frame is split
ds_train = idc.create_dataloader(
    df=df_ret.loc[in_train],
    articles_agg=articles_agg,
    mode="train",
    num_instances=500,
    batch_size=batch_size
)

ds_val = idc.create_dataloader(
    df=df_ret.loc[in_val],
    articles_agg=articles_agg,
    mode="validation",
    batch_size=batch_size
)

ds_test = idc.create_dataloader(
    df=df_ret.loc[in_test],
    articles_agg=articles_agg,
    mode="test",
    batch_size=batch_size
)

# Model: `InformerFusionModel`

A modified implementation of `InformerForPrediction` from the `transformers` package:
- Adds an option to feed article embeddings.

In [None]:
from torch.optim import AdamW
from finnews.models.train import train_informer_fusion

In [None]:
# Build the fusion model from the Informer configuration defined above
model = InformerFusionModel(config=model_config)

In [None]:
# NOTE: this overrides the `epochs = 10` assigned in the Model Config section
epochs = 3

# One output directory per run, named after the launch time
# (day_month_year_hour_minute_second)
cur_timestamp = pd.Timestamp.now().strftime('%d_%m_%Y_%H_%M_%S')
model_save_dir = os.path.join("models", "InformerFusion", cur_timestamp)
os.makedirs(model_save_dir, exist_ok=True)

In [None]:
# AdamW over all model parameters, with decoupled weight decay
optimizer = AdamW(
    model.parameters(),
    lr=5e-4,
    betas=(0.9, 0.95),
    weight_decay=1e-1
)

In [None]:
# Train for `epochs` epochs on the training and validation loaders;
# `save_dir` is the per-run directory created above.
train_informer_fusion(
    model=model,
    optimizer=optimizer,
    epochs=epochs,
    dataloader_train=ds_train,
    dataloader_val=ds_val,
    save_dir=model_save_dir
)