In [1]:
import sys
sys.path.append('../..')  # Adjust the path as needed
import numpy as np
import pandas as pd
import json
import torch
import torch.nn as nn
import re
import time
from einops import rearrange
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from MultimodalForcast.data_loader.data_loader import process_bitcoin_data, split_series, TimeSeriesDataset, collate_fn

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# test dataloader
# Smoke test of the data pipeline on a short date range: build the three
# datasets/loaders, pull a single training batch and check its layout.
start_date = '2020-01-01'
end_date = '2020-03-31'
ts_file_path = '../data/bitcoin/bitcoin_daily.csv'
news_file_path = '../data/bitcoin/bitcoin_news.json'
lookback = 10
predict = 10

df_filtered = process_bitcoin_data(ts_file_path, news_file_path, start_date, end_date)
train_data, val_data, test_data = split_series(df_filtered, lookback)

train_dataset = TimeSeriesDataset(train_data, lookback, predict)
val_dataset = TimeSeriesDataset(val_data, lookback, predict)
test_dataset = TimeSeriesDataset(test_data, lookback, predict)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, drop_last=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_fn)

# Get one batch
batch = next(iter(train_loader))

texts = batch['text']      # List of article sequences
values = batch['value']    # Tensor of value sequences
targets = batch['target']  # Tensor of target values

# Fail fast if the batch layout is not what the model cells below rely on:
# one article per lookback step, and tensors laid out as [batch, time, features].
assert len(texts) == values.shape[0] == targets.shape[0], "text/value/target batch sizes differ"
assert all(len(sample_texts) == lookback for sample_texts in texts), "expected `lookback` articles per sample"
assert values.shape[1] == lookback, "value sequences must span the lookback window"
assert targets.shape[1] == predict, "target sequences must span the prediction window"

# Print shapes and sample content
print("Batch shapes:")
print(f"Texts: {len(texts)} sequences, each with {len(texts[0])} articles")
print(f"Values shape: {values.shape}")  # [batch_size, lookback, num_features]
print(f"Targets shape: {targets.shape}")  # [batch_size, predict, num_features]

# Print first sequence's content
print("\nFirst sequence:")
# print("Texts:", texts[0])
print("Values:", values[0])
print("Targets:", targets[0])

There are 91 rows in the filtered bitcoin dataframe
Batch shapes:
Texts: 8 sequences, each with 10 articles
Values shape: torch.Size([8, 10, 1])
Targets shape: torch.Size([8, 10, 1])

First sequence:
Values: tensor([[ 9233.],
        [ 9421.],
        [ 9706.],
        [ 9798.],
        [ 9823.],
        [10083.],
        [ 9932.],
        [ 9968.],
        [10331.],
        [10289.]])
Targets: tensor([[10272.],
        [10152.],
        [ 9916.],
        [ 9723.],
        [ 9840.],
        [10091.],
        [ 9612.],
        [ 9688.],
        [ 9666.],
        [ 9850.]])


  return x_text, torch.tensor(x_value, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [3]:
# # Draft of the pretrained BERT text encoder (superseded by the class defined below).
# # bert-base-uncased hidden size: 768

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# # tocuda
# model = BertModel.from_pretrained("bert-base-uncased")

# texts = df_filtered['full_article'].values.tolist()
# # Remove special characters (keep only alphanumerics and spaces).
# # Missing values (NaN/None) and anything that is not a str/float/int are replaced by "NA".
# texts = [
#     re.sub(r'[^a-zA-Z0-9 ]+', "", str(text)) 
#     if pd.notna(text) and isinstance(text, (str, float, int)) 
#     else "NA" 
#     for text in texts
# ]


# batch_size = 8 # number of text sequences in one batch
# all_embeddings = []

# # Process each batch
# for i in range(0, len(texts), batch_size):
#     batch_texts = texts[i:i+batch_size]

#     # Tokenize the batch
#     # padding=True, truncation=True, max_length=512
#     # tocuda
#     # inputs: [batch_size, sequence_length]
#     inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)

#     # Get the embeddings
#     with torch.no_grad():
#         # outputs: [batch_size, sequence_length, bert_output_hidden_size]
#         outputs = model(**inputs)

#     # [CLS]
#     # batch_embeddings: [batch_size, bert_output_hidden_size]
#     batch_embeddings = outputs.last_hidden_state[:, 0, :]
#     all_embeddings.append(batch_embeddings)

# # Combine all batches    
# if all_embeddings:
#     # all_embeddings: [total_number_of_texts, bert_output_hidden_size]
#     all_embeddings = torch.cat(all_embeddings, dim=0)

# print(all_embeddings.shape)

# # text encoder
# # input: list of list
# # output: [batch size, look_back, bert_hidden_size]

In [4]:
class Pretrained_Bert_Encoder(nn.Module):
    """Encode nested lists of article strings into BERT [CLS] embeddings.

    Input to ``forward`` is a list of ``batch_size`` samples, each a list of
    ``lookback`` article strings; output is a tensor of shape
    ``[batch_size, lookback, bert_hidden_size]`` (768 for bert-base-uncased).

    :param device: Device the BERT weights are moved to.
    :param finetune: If False (default) BERT is frozen: its parameters do not
        require grad, it is kept in eval mode (no dropout) and the forward pass
        runs without building an autograd graph. If True, gradients flow
        through BERT whenever autograd is enabled in the calling context.
    """

    def __init__(self, device, finetune=False):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.device = device
        # Remember the mode: forward() and train() both depend on it.
        self.finetune = bool(finetune)
        self.bert = self.bert.to(device)

        # Freeze BERT parameters
        if not self.finetune:
            for param in self.bert.parameters():
                param.requires_grad = False

    def train(self, mode=True):
        """Switch train/eval mode, but keep a frozen BERT in eval mode.

        Without this, a parent ``model.train()`` would turn BERT's dropout on
        and make the (frozen) text features stochastic.
        """
        super().train(mode)
        if not self.finetune:
            self.bert.eval()
        return self

    def preprocess_text(self, texts):
        """Clean a flat list of texts.

        Non-missing str/float/int values are stringified and stripped of every
        character other than ASCII letters, digits and spaces; anything else
        (e.g. None/NaN) becomes the placeholder ``"NA"``.
        """
        return [
            re.sub(r'[^a-zA-Z0-9 ]+', "", str(text))
            if pd.notna(text) and isinstance(text, (str, float, int))
            else "NA"
            for text in texts
        ]

    def forward(self, texts):
        """Return [CLS] embeddings shaped ``[batch_size, lookback, hidden]``.

        :param texts: list of lists of articles, ``[batch_size][lookback]``.
        :raises ValueError: if ``texts`` is empty or the samples do not all
            contain the same number of articles.
        """
        batch_size = len(texts)
        if batch_size == 0:
            raise ValueError("texts must contain at least one sample")
        lookback = len(texts[0])
        # A ragged batch could otherwise be reshaped "successfully" into a
        # tensor with the wrong hidden size (reshape(..., -1) absorbs the error).
        if any(len(sample_texts) != lookback for sample_texts in texts):
            raise ValueError("every sample must contain the same number of articles")

        # Flatten the list of lists into a single list and clean it
        flat_texts = [article for sample_texts in texts for article in sample_texts]
        flat_texts = self.preprocess_text(flat_texts)

        # Tokenize and get embeddings
        inputs = self.tokenizer(flat_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(self.bert.device) for k, v in inputs.items()}

        # Track gradients only when fine-tuning AND the caller has autograd on
        # (so an outer torch.no_grad() during validation is still respected).
        with torch.set_grad_enabled(self.finetune and torch.is_grad_enabled()):
            outputs = self.bert(**inputs)

        # Get [CLS] token embeddings
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # [total_articles, hidden]

        # Reshape back to [batch_size, lookback, bert_output_dim]
        return cls_embeddings.reshape(batch_size, lookback, -1)

In [5]:
# test encoder
# Invoke the module itself rather than .forward() so that nn.Module.__call__
# (and any registered hooks) runs, which is how the model calls it later on.
encoder = Pretrained_Bert_Encoder(device="cpu")
encoder([["article1", "article2"], ["article2", "article3"], ["article3", "article4"]]).shape

torch.Size([3, 2, 768])

In [6]:
#  RevIN (Reversible Instance Normalization)

class RevIn(nn.Module):
    """Reversible instance normalization for ``[batch, seq_len, num_features]`` input.

    ``forward(x, 'norm')`` stores per-instance statistics of ``x`` and returns
    the normalized tensor; ``forward(y, 'denorm')`` re-applies the statistics
    stored by the most recent 'norm' call to ``y``.
    """

    def __init__(self, num_features: int, eps=1e-5, affine=True, subtract_last=False):
        """
        :param num_features: Number of input features
        :param eps: Value added for numerical stability
        :param affine: If True, RevIN has learnable per-feature scale and shift
        :param subtract_last: If True, centre on the last time step instead of the mean
        """
        super().__init__()

        self.num_features = num_features
        self.eps = eps
        self.affine = affine
        self.subtract_last = subtract_last

        if affine:
            self._init_params()

    def forward(self, x, mode: str):
        """Dispatch on ``mode``: 'norm' or 'denorm'; anything else raises NotImplementedError."""
        if mode == 'denorm':
            return self._denormalize(x)
        if mode != 'norm':
            raise NotImplementedError
        self._get_statistics(x)
        return self._normalize(x)

    def _init_params(self):
        """Create the learnable affine parameters, each of shape (num_features,)."""
        self.affine_weight = nn.Parameter(torch.ones(self.num_features))
        self.affine_bias = nn.Parameter(torch.zeros(self.num_features))

    def _get_statistics(self, x):
        """Cache the centring term and standard deviation of ``x``."""
        # Every axis except the first (batch) and the last (features) is reduced;
        # for a 3-D input that is just the sequence axis: (1,).
        reduce_dims = tuple(range(1, x.ndim - 1))
        if self.subtract_last:
            self.last = x[:, -1, :].unsqueeze(1)
        else:
            self.mean = x.mean(dim=reduce_dims, keepdim=True).detach()
        variance = torch.var(x, dim=reduce_dims, keepdim=True, unbiased=False)
        self.stdev = torch.sqrt(variance + self.eps).detach()

    def _normalize(self, x):
        """Centre and scale ``x`` with the cached statistics, then apply the affine map."""
        centre = self.last if self.subtract_last else self.mean
        x = (x - centre.to(x.device)) / self.stdev.to(x.device)
        if self.affine:
            x = x * self.affine_weight.to(x.device) + self.affine_bias.to(x.device)
        return x

    def _denormalize(self, x):
        """Invert ``_normalize`` using the cached statistics."""
        if self.affine:
            # eps*eps keeps the division finite if a learned weight reaches zero
            x = (x - self.affine_bias.to(x.device)) / (self.affine_weight.to(x.device) + self.eps * self.eps)
        x = x * self.stdev.to(x.device)
        centre = self.last if self.subtract_last else self.mean
        return x + centre.to(x.device)

In [7]:
# # detach() example
# w = nn.Parameter(torch.tensor([2.0]), requires_grad=True)

# # Case 1: No detach
# result = 2*w + w  # = 3w
# loss = some_function(result)
# loss.backward()
# # Gradient flows: loss -> result -> w
# # w.grad will include factor of 3 (both paths contribute)
# # If loss gradient is 1, w.grad would be 3

# # Case 2: Partial detach
# result = (2*w).detach() + w  # = 2(constant) + w
# loss = some_function(result)
# loss.backward()
# # Gradient flows only through the 'w' term
# # 2*w is treated as constant
# # If loss gradient is 1, w.grad would be 1

In [8]:
class multimodal_gpt4mts(nn.Module):
    """GPT-2 based forecaster over patched time-series values and patched news embeddings.

    Per-step text embeddings (from ``Pretrained_Bert_Encoder``) are mean-pooled
    into patches and projected by ``prompt_layer``; the value series is
    normalized, cut into the same patches and projected by ``in_layer``. Text
    tokens followed by value tokens are fed to GPT-2; the hidden states of the
    last ``patch_num`` tokens are flattened and mapped to ``config.predict`` steps.

    Config attributes read: ``patch_size``, ``stride``, ``lookback``, ``revin``,
    ``model_hidden_dim``, ``predict``, ``num_features``, ``pretraingpt2`` and
    (only when ``pretraingpt2`` is truthy) ``finetunedecoder``.

    :param config: object exposing the attributes listed above.
    :param device: device the sub-modules and the input series are moved to.
    """

    def __init__(self, config, device):
        super().__init__()

        # Patching parameters
        self.patch_size = config.patch_size
        self.stride = config.stride
        # The sequence is right-padded by `stride` before unfolding (see
        # padding_patch_layer), which yields one extra patch: "+ 2", not "+ 1".
        self.patch_num = (config.lookback - self.patch_size) // self.stride + 2
        self.revin = config.revin
        self.device = device

        # Original sequence: [1, 2, 3, 4, 5]
        # With (0, 2) padding:
        # - Add 0 elements at start
        # - Add 2 elements at end by replicating last value
        # Result: [1, 2, 3, 4, 5, 5, 5]

        # If we used (2, 2):
        # - Add 2 elements at start by replicating first value
        # - Add 2 elements at end by replicating last value
        # Result: [1, 1, 1, 2, 3, 4, 5, 5, 5]
        self.padding_patch_layer = nn.ReplicationPad1d((0, config.stride))

        # encoder decoder
        self.in_layer = nn.Linear(config.patch_size, config.model_hidden_dim)
        self.prompt_layer = nn.Linear(config.model_hidden_dim, config.model_hidden_dim)
        self.out_layer = nn.Linear(config.model_hidden_dim * (self.patch_num), config.predict)
        self.relu = nn.ReLU()
        self.rev_in = RevIn(config.num_features)

        # text encoder, currently frozen
        self.text_encoder = Pretrained_Bert_Encoder(finetune=False, device=device)

        if config.pretraingpt2:
            self.decoder = GPT2Model.from_pretrained('gpt2')
        else:
            self.decoder = GPT2Model(GPT2Config())

        if config.pretraingpt2 and config.finetunedecoder:
            # Only fine-tunes layer normalization and positional embeddings
            # Layer norms help adapt to new data distributions
            # layer norm operates on the last dimension - 768 for each patch
            for name, param in self.decoder.named_parameters():
                param.requires_grad = 'ln' in name or 'wpe' in name

        # rev_in is included so its affine parameters live on the same device
        # as the other layers without relying on a later model.to(device).
        for layer in (self.decoder, self.in_layer, self.out_layer, self.prompt_layer, self.rev_in):
            layer.to(device=device)

    def get_patch_text_embeddings(self, texts):
        """Encode ``texts`` and mean-pool the embeddings per patch.

        :param texts: list of lists of article strings, ``[batch_size][lookback]``.
        :return: tensor ``[batch_size, num_patches, 768]``.
        """
        text_embeddings = self.text_encoder(texts)
        text_embeddings = rearrange(text_embeddings, 'b l m -> b m l') # [batch_size, 768, lookback]
        text_embeddings = self.padding_patch_layer(text_embeddings) # [batch_size, 768, lookback + stride]

        # patch along the last dimension
        text_embeddings = text_embeddings.unfold(dimension=-1, size=self.patch_size, step=self.stride) #[batch_size, 768, num_patches, patch_size]
        # average text embedding of each patch
        # NOTE: no .squeeze() here. mean(dim=-1) already removes the patch axis,
        # and a bare squeeze() would also drop the batch axis for a batch of one
        # (or the patch axis for a single patch), breaking the rearrange below.
        text_embeddings = text_embeddings.mean(dim=-1) # [batch_size, 768, num_patches]
        text_embeddings = rearrange(text_embeddings, 'b l m -> b m l') # [batch_size, num_patches, 768]
        return text_embeddings

    def get_ts_patch_embeddings(self, ts):
        """Cut the series into overlapping patches.

        :param ts: tensor ``[batch_size, lookback, num_features]``.
        :return: tensor ``[batch_size * num_features, num_patches, patch_size]``.
        """
        ts = rearrange(ts, 'b l m -> b m l')  #[batch_size, num_features, lookback]
        ts = self.padding_patch_layer(ts) #[batch_size, num_features, lookback+stride]
        ts = ts.unfold(dimension=-1, size=self.patch_size, step=self.stride) #[batch_size, num_features, num_patches, patch_size]
        # Combines batch and feature dimensions
        # num_features has to be 1 (channel independence); otherwise the leading
        # dimension no longer matches the batch size of the text embeddings
        ts = rearrange(ts, 'b m n p -> (b m) n p')  # [batch_size*num_features, num_patches, patch_size]
        return ts

    def forward(self, ts, texts):
        """Forecast ``config.predict`` steps.

        :param ts: tensor ``[batch_size, lookback, num_features]`` (num_features == 1).
        :param texts: list of lists of article strings, ``[batch_size][lookback]``.
        :return: tensor ``[batch_size, predict, 1]`` on the original value scale.
        """
        # Make sure the series is on the model's device on both normalization paths.
        ts = ts.to(self.device)

        text_embeddings = self.get_patch_text_embeddings(texts) # [batch_size, num_patches, 768]
        text_embeddings = self.relu(self.prompt_layer(text_embeddings)) # [batch_size, num_patches, model_hidden_dim]

        # normalize time series over the whole lookback window
        if self.revin:
            # ts: [batch_size, lookback, num_features]
            ts = self.rev_in(ts, 'norm')
        else:
            # plain per-instance standardization; statistics are detached so no
            # gradient flows through them
            means = ts.mean(1, keepdim=True).detach()
            ts = ts - means
            stdev = torch.sqrt(torch.var(ts, dim=1, keepdim=True, unbiased=False) + 1e-5).detach()
            ts = ts / stdev

        ts_embeddings = self.get_ts_patch_embeddings(ts) # [batch_size*num_features, num_patches, patch_size]
        ts_embeddings = self.in_layer(ts_embeddings) # [batch_size*num_features, num_patches, model_hidden_dim]
        x_all = torch.cat((text_embeddings, ts_embeddings), dim=1) # [batch_size, num_patches*2, model_hidden_dim]

        outputs = self.decoder(inputs_embeds=x_all).last_hidden_state # [batch_size, num_patches*2, hidden]
        # Keep only the last patch_num tokens, i.e. the positions of the value patches
        outputs = outputs[:, -self.patch_num:, :]

        # Flatten to [batch_size, num_patches * hidden] for the linear head
        a, b, c = outputs.shape
        outputs = self.out_layer(outputs.reshape(a, b*c)) # [batch_size, predict]
        outputs = outputs.unsqueeze(2) # [batch_size, predict, 1] 1 for one channel

        # undo the input normalization
        if self.revin:
            outputs = self.rev_in(outputs, 'denorm')
        else:
            outputs = outputs * stdev + means
        return outputs



In [9]:
class Config:
    """Plain attribute container: each keyword argument becomes an instance attribute."""

    def __init__(self, **kwargs):
        for name in kwargs:
            setattr(self, name, kwargs[name])

In [10]:
# test forward path
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_config = Config(
    lookback=4,
    predict=2,
    num_features=1,
    patch_size=2,
    stride=1,
    model_input_dim=768,
    model_hidden_dim=768,
    finetunedecoder=True,
    pretraingpt2=True,
    revin=True
)

# The dummy series is created directly on `device` so that it lives on the same
# device as the model's layers, whichever normalization path is used.
multimodal_gpt4mts(model_config, device)(
    torch.randn(3, 4, 1, device=device), # [batch_size, lookback, num_features]
    [["article1", "article2", "article3", "article4"], 
     ["article2", "article3", "article4", "article5"], 
     ["article3", "article4", "article5", "article6"]])

tensor([[[-0.0322],
         [-0.3327]],

        [[-0.0573],
         [-0.3371]],

        [[ 0.2202],
         [ 0.1671]]], device='cuda:0', grad_fn=<AddBackward0>)

In [11]:
# Experiment setup: model/training/data configuration, data loaders, model,
# optimizer and loss used by the training loop.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_config = Config(
    lookback=15,
    predict=7,
    num_features=1,
    patch_size=8,
    stride=4,
    pretraingpt2=True,
    model_input_dim=768,
    model_hidden_dim=768,
    finetunedecoder=True,
    revin=True
)

# training parameters
learning_rate = 0.001
patience = 3
train_epochs = 10
batch_size = 16
weight_decay = 0
model_save_path = "multimodal_gpt4mts_iter0.pth"

# data parameters
start_date = '2018-01-01'
end_date = '2022-04-22'
ts_file_path = '../data/bitcoin/bitcoin_daily.csv'
news_file_path = '../data/bitcoin/bitcoin_news_with_summaries.json'


df_filtered = process_bitcoin_data(ts_file_path, news_file_path, start_date, end_date, text_col='summary')
train_data, val_data, test_data = split_series(df_filtered, model_config.lookback)

# dataloader
train_dataset = TimeSeriesDataset(train_data, model_config.lookback, model_config.predict, text_col='summary')
val_dataset = TimeSeriesDataset(val_data, model_config.lookback, model_config.predict, text_col='summary')
test_dataset = TimeSeriesDataset(test_data, model_config.lookback, model_config.predict, text_col='summary')

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=False, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, drop_last=False, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, drop_last=False, collate_fn=collate_fn)

model = multimodal_gpt4mts(model_config, device)
model.to(device)
# Hand only the trainable tensors to the optimizer (frozen weights never receive
# gradients anyway); a list, unlike the parameters() generator, can be reused.
params = [p for p in model.parameters() if p.requires_grad]
# Use the weight_decay setting defined above instead of a second hardcoded 0.
model_optim = torch.optim.Adam(params, lr=learning_rate, weight_decay=weight_decay)
criterion = nn.MSELoss()

print(f"Device: {device}")


There are 1236 rows in the filtered bitcoin dataframe
Device: cuda


In [12]:
# model training
# Each epoch: one pass over train_loader (with optimizer steps), one pass over
# val_loader (no gradients), checkpoint on the best validation MSE, and early
# stopping once the validation MSE has risen for `patience` consecutive epochs.

best_val_mse = float('inf')
best_epoch = -1
prev_val_mse = None    # validation MSE of the previous epoch (None before epoch 0)
consec_increase = 0    # consecutive epochs in which validation MSE increased
for epoch in range(train_epochs):
    # Enter training mode at the START of every epoch so that epoch 0 runs in
    # the same mode (dropout etc.) as all later epochs.
    model.train()
    train_loss = []
    all_train_preds = []
    all_train_targets = []
    epoch_time = time.time()
    for i, train_loader_item in enumerate(train_loader):
        texts = train_loader_item['text']
        ts_values = train_loader_item['value'].float().to(device)
        targets = train_loader_item['target'].float().to(device)
        model_optim.zero_grad()

        outputs = model(ts_values, texts)
        all_train_preds.append(outputs.detach().cpu().numpy())
        all_train_targets.append(targets.detach().cpu().numpy())

        loss = criterion(outputs, targets)
        train_loss.append(loss.item())

        loss.backward()
        model_optim.step()

    # Epoch-level MSE over all samples (not an average of per-batch means, which
    # would over-weight a smaller final batch).
    all_train_preds = np.concatenate(all_train_preds, axis=0)
    all_train_targets = np.concatenate(all_train_targets, axis=0)
    train_mse = np.mean((all_train_preds - all_train_targets) ** 2)

    # validation
    model.eval()
    val_loss = []    # per-batch losses of this epoch only
    all_val_preds = []
    all_val_targets = []
    with torch.no_grad():
        for i, val_loader_item in enumerate(val_loader):
            texts = val_loader_item['text']
            ts_values = val_loader_item['value'].float().to(device)
            targets = val_loader_item['target'].float().to(device)
            outputs = model(ts_values, texts)
            loss = criterion(outputs, targets)
            val_loss.append(loss.item())

            all_val_preds.append(outputs.detach().cpu().numpy())
            all_val_targets.append(targets.detach().cpu().numpy())

    all_val_preds = np.concatenate(all_val_preds, axis=0)
    all_val_targets = np.concatenate(all_val_targets, axis=0)
    val_mse = np.mean((all_val_preds - all_val_targets) ** 2)

    print(f"""
    Epoch {epoch}:
    Train MSELoss: {train_mse:.4f}, Val MSELoss: {val_mse:.4f}, 
    Training RMSE: {np.sqrt(train_mse):.4f}, Val RMSE: {np.sqrt(val_mse):.4f}, 
    Training time: {(time.time() - epoch_time) / 60:.2f} minutes
    """)

    # Save the best model
    if val_mse < best_val_mse:
        best_val_mse = val_mse
        best_epoch = epoch
        torch.save(model.state_dict(), model_save_path)
        print(f"New best model saved at epoch {epoch} with Val RMSE: {np.sqrt(val_mse):.4f}")

    # Early stopping: compare this epoch's validation MSE with the previous
    # EPOCH's value. (val_loss holds per-batch losses of the current epoch, so
    # comparing its last two entries would compare two batches, not two epochs.)
    if prev_val_mse is not None and val_mse > prev_val_mse:
        consec_increase += 1
    else:
        consec_increase = 0
    prev_val_mse = val_mse

    if consec_increase >= patience:
        print(f"Early stopping at epoch {epoch+1} as val loss has been increasing for {consec_increase} epochs")
        break
    


    Epoch 0:
    Train MSELoss: 659531.0625, Val MSELoss: 292673.7500, 
    Training RMSE: 812.1152, Val RMSE: 540.9933, 
    Training time: 2.39 minutes
    
New best model saved at epoch 0 with Val RMSE: 540.9933

    Epoch 1:
    Train MSELoss: 495931.2812, Val MSELoss: 296300.2188, 
    Training RMSE: 704.2239, Val RMSE: 544.3347, 
    Training time: 2.46 minutes
    

    Epoch 2:
    Train MSELoss: 432079.3750, Val MSELoss: 266626.8125, 
    Training RMSE: 657.3275, Val RMSE: 516.3592, 
    Training time: 2.46 minutes
    
New best model saved at epoch 2 with Val RMSE: 516.3592


KeyboardInterrupt: 

In [24]:
# naive baseline
# Two reference forecasts on the validation set, compared with the model's best
# validation MSE: (1) repeat the last observed value, (2) repeat the mean of the
# lookback window, each for every step of the prediction horizon.
horizon = model_config.predict
last_point_batches = []
avg_window_batches = []
target_batches = []

for val_loader_item in val_loader:
    history = val_loader_item['value'].float()

    # Last point baseline: final time step, tiled along the time axis
    last_point_batches.append(history[:, -1:, :].repeat(1, horizon, 1))

    # Average of lookback window baseline: mean over time has shape
    # [batch, 1, features], tiled along the time axis
    avg_window_batches.append(history.mean(dim=1, keepdim=True).repeat(1, horizon, 1))

    # Collect targets
    target_batches.append(val_loader_item['target'].float())

# Concatenate all batches
all_last_point_forecasts = torch.cat(last_point_batches, dim=0)
all_avg_window_forecasts = torch.cat(avg_window_batches, dim=0)
all_targets = torch.cat(target_batches, dim=0)

# Compute MSE and RMSE
last_point_mse = ((all_last_point_forecasts - all_targets) ** 2).mean().item()
avg_window_mse = ((all_avg_window_forecasts - all_targets) ** 2).mean().item()

print(f"""
    Model Val MSELoss: {best_val_mse:.4f}, 
    Last Point Baseline MSELoss: {last_point_mse:.4f}, 
    Average Window Baseline MSELoss: {avg_window_mse:.4f}, 
    Model RMSE: {np.sqrt(best_val_mse):.4f}, 
    Last Point Baseline RMSE: {np.sqrt(last_point_mse):.4f}, 
    Average Window Baseline RMSE: {np.sqrt(avg_window_mse):.4f}
""")


    Model Val MSELoss: 245810.2656, 
    Last Point Baseline MSELoss: 263000.5938, 
    Average Window Baseline MSELoss: 727225.7500, 
    Model RMSE: 495.7926, 
    Last Point Baseline RMSE: 512.8358, 
    Average Window Baseline RMSE: 852.7753

