In [15]:
import numpy as np
import pandas as pd
import pyarrow
from datetime import datetime
import matplotlib.pyplot as plt
import torch
from sklearn.preprocessing import MinMaxScaler
import os
import re

In [2]:
def create_dataset(data, lookback, window_size=50, val_step=1, test_step=7, dates=None):
    '''
    This function creates a dataset for time series forecasting, with a rolling window of lookback. 
    Note that the first column need to be the daily price movement
    
    Parameters
    data: a 2D numpy array with shape (# of days, # of features)
    lookback: an integer of how many trading days to lookback to
    window_size: number of days to include in each block, window_size = train_step + test_step
    test_step: number of days to predict
    dates: spcific dates provided for splitting train and val set

    Returns
    X_train, y_train, X_val, y_val: dict of n_blocks as keys and 3D tensor of shape (n_data, lookback, n_feat) as values
    y_train, y_val: dict of n_blocks as keys and 2D tensor of shape (n_data, 1) as values
    X_test: 2D tensor of shape (n_data, n_feat)
    y_test: 2D tensor of shape (n_data, 1)
    '''
    assert type(data) == np.ndarray and len(data.shape) == 2, 'Input data needs to be a 2D numpy array'

    n_data, n_feat = data.shape
    loop = n_data - lookback - 1
    
    # X and y are 3D arrays
    X = np.empty((loop, lookback, n_feat))
    y = np.empty((loop, 1))
    for i in range(loop):
        X[i] = data[i:i+lookback] # all features of the past lookback days
        y[i] = data[i+lookback+1, 0] # price movement of the next day
    
    X = torch.from_numpy(X).float()
    y = torch.from_numpy(y)
    #split data into train and val, also keep 7 days at the end as holdout test set
    X_train, X_val, X_test = time_series_split(X, window_size=window_size, val_step=val_step, test_step=test_step)
    y_train, y_val, y_test = time_series_split(y, window_size=window_size, val_step=val_step, test_step=test_step)
    
    return X_train, X_val, X_test, y_train, y_val, y_test
    
def time_series_split(data, window_size=50, val_step=1, test_step=7):
    """
    Split data using rolling-window (block) split
    data: a 3D array([number of data, number of lookbacks, number of features]) X or a 2D array([number of data, 1]) y
    window_size: number of days to include in each block, window_size = train_step + val_step
    test_step: number of days to predict
    
    return: dict of 3D tensor(train/val, # of lookbacks, # of features) and a 3D tensor(test, # of lookbacks, # of features)
    """
    assert len(data) >= window_size, 'Data length needs to be longer than window size'

    n_data = data.shape[0]
    n_block = n_data // window_size
    test = data[-test_step:] # holdout test set
    data = data[:-test_step] # remove holdout part then split
    train, val= dict(), dict()

    if val_step != 0:
        for i in range(n_block):
            init = i * window_size
            if init + window_size <= n_data:
                block = data[init:init + window_size]
                train[f'block_{i}'] = block[:-val_step]
                val[f'block_{i}'] = block[-val_step:]
            else:
                # Handle the last block which might be smaller
                block = data[init:]
                train[f'block_{i}'] = block[:-val_step] if len(block) > val_step else block
                val[f'block_{i}'] = block[-val_step:] if len(block) > val_step else torch.tensor([])

        return train, val, test
    
    else: # after tuning, we will train the final model on the complete data excluding the holdout test set without splitting into blocks
        return data, torch.tensor([]), test

In [24]:
embed

Unnamed: 0,Title_Embedding,date
0,"[[1.017, 1.0, 1.5383, 2.0, 6.7358, 2.0, 9.7041...",2021-03-18
1,"[[8.6033, 2.0, 5.9632, 3.0, 4.0968, 2.0, 2.373...",2021-03-19
2,"[[9.4112, 2.0, 5.5919, 3.0, 6.2575, 2.0, 1.750...",2021-03-22
3,"[[8.6203, 2.0, 1.1078, 2.0, 5.542, 2.0, 2.0828...",2021-03-23
4,"[[1.0256, 1.0, 2.2014, 2.0, 6.7714, 2.0, 1.309...",2021-03-24
...,...,...
792,"[[8.8404, 2.0, 8.7555, 3.0, 5.8778, 2.0, 2.063...",2023-09-21
793,"[[1.0754, 1.0, 9.7573, 3.0, 6.5648, 2.0, 1.328...",2023-09-22
794,"[[1.0516, 1.0, 1.5294, 2.0, 7.2295, 2.0, 1.113...",2023-09-25
795,"[[8.588, 2.0, 4.9078, 4.0, 4.0654, 2.0, 1.3243...",2023-09-26


In [18]:
def tensor_string_to_numpy(tensor_str):
    if pd.isna(tensor_str):
        return np.array([])
    nums = re.findall(r'[\d.\d]+', tensor_str)
    nums = [float(num) for num in nums]
    return np.array([nums])

In [41]:
stock_path = '../data/daily_price_movement.csv'
daily = pd.read_csv(stock_path, header=0)

embed_path = '../data/BERT_embedding'
files = os.listdir(embed_path)
embed = pd.DataFrame(columns=['Date', 'Title_Embedding'])
for f in files:
    embed = pd.concat([embed,pd.read_csv(embed_path+'/'+f, header=0)], ignore_index=True)

embed['date'] = embed['Date'].apply(lambda x: x.split(' ')[0])
embed['Title_Embedding'] = embed['Title_Embedding'].apply(tensor_string_to_numpy)
temp = pd.merge(daily, embed, on='date', how='left')
data = np.append(temp['price_movement'].to_numpy().reshape(-1, 1), np.vstack(temp['Title_Embedding']), axis=1)

In [44]:
data[:10]

array([[1.    , 9.4837, 2.    , ..., 1.    , 3.067 , 3.    ],
       [1.    , 9.0773, 2.    , ..., 1.    , 2.8119, 2.    ],
       [1.    , 1.0125, 1.    , ..., 1.    , 2.666 , 3.    ],
       ...,
       [1.    , 1.1286, 1.    , ..., 1.    , 3.8511, 2.    ],
       [0.    , 1.0391, 1.    , ..., 1.    , 3.8911, 3.    ],
       [1.    , 1.136 , 1.    , ..., 1.    , 1.2183, 2.    ]])

In [28]:
daily_movement = daily.to_numpy()[:, 1].reshape(-1, 1)
X_train, X_val, X_test, y_train, y_val, y_test = create_dataset(daily_movement, lookback=5, window_size=50, val_step=1, test_step=7)

In [29]:
list(X_val.keys())

['block_0',
 'block_1',
 'block_2',
 'block_3',
 'block_4',
 'block_5',
 'block_6',
 'block_7',
 'block_8',
 'block_9',
 'block_10',
 'block_11',
 'block_12',
 'block_13',
 'block_14']

In [31]:
[len(v) for v in X_train.values()]

[49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49]

In [47]:
torch.tensor([])

tensor([])