In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
import tensorflow_probability as tfp
import warnings
import time
import gc
import torch
import torch.nn as nn
import lightgbm as lgb
import statsmodels.api as sm
import warnings

from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.validation import _deprecate_positional_args
from torch.utils.data import Dataset, DataLoader, TensorDataset
from joblib import dump, load
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import r2_score, mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf

# Load the training data and keep the rows with date_id <= 200.
# .copy() detaches the filtered frame from the frame read from disk, so the
# column assignments done later on train_df operate on an independent object
# instead of a boolean-mask selection (which pandas flags as chained assignment).
train_df = pd.read_csv('trading_the_close_data/train.csv')
train_df = train_df[train_df['date_id'] <= 200].copy()
plt.rcParams.update({'font.size': 16})

# Silence pandas' PerformanceWarning for the rest of the session.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Pick the torch device used by the training and inference cells.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is available and being used")
else:
    device = torch.device("cpu")
    print("GPU is not available, using CPU instead")

2024-03-04 23:47:54.252068: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-03-04 23:47:54.275466: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-04 23:47:54.461532: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-04 23:47:54.462647: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


GPU is not available, using CPU instead


utilities

In [2]:
def set_seeds(seed):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch, so that re-running the notebook with the same ``seed`` repeats
    the same random draws.

    Parameters
    ----------
    seed : int
        Value passed to each generator's seeding function.
    """
    # Local import: the notebook's import cell does not import ``random``.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    
class CFG:
    # Central place for the hyperparameters and switches read by the cells below.
    BATCH_SIZE = 32          # batch size of the train/validation DataLoaders
    N_EPOCHS = 500           # upper bound on epochs per fold (early stopping may end sooner)
    LEARNING_RATE = 0.0001   # 'lr' of the AdamW parameter group
    N_FOLDS = 5              # number of CV splits and of per-fold checkpoints to load
    TARGET_COLS = ['target'] # column(s) passed as y to the cross-validation splitter
    SEED = 2023              # value handed to set_seeds()
    N_ASSETS = train_df['stock_id'].nunique()  # evaluated once, when the class body runs
    SCALER = MinMaxScaler()  # fitted in preprocess(mode='train'), reused in mode='test'
    WEIGHT_DECAY = 0.4       # 'weight_decay' of the AdamW parameter group
    DEBUG = True             # when True, the exploratory plotting cells are skipped

# Rebinds the name CFG from the class to an instance of it.
CFG = CFG()
set_seeds(CFG.SEED)

In [3]:
# Schema check: column dtypes and memory footprint of the filtered training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2163205 entries, 0 to 2163204
Data columns (total 17 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   stock_id                 int64  
 1   date_id                  int64  
 2   seconds_in_bucket        int64  
 3   imbalance_size           float64
 4   imbalance_buy_sell_flag  int64  
 5   reference_price          float64
 6   matched_size             float64
 7   far_price                float64
 8   near_price               float64
 9   bid_price                float64
 10  bid_size                 float64
 11  ask_price                float64
 12  ask_size                 float64
 13  wap                      float64
 14  target                   float64
 15  time_id                  int64  
 16  row_id                   object 
dtypes: float64(11), int64(5), object(1)
memory usage: 297.1+ MB


In [4]:
# Number of missing values per column.
train_df.isna().sum()

stock_id                         0
date_id                          0
seconds_in_bucket                0
imbalance_size                  55
imbalance_buy_sell_flag          0
reference_price                 55
matched_size                    55
far_price                  1197578
near_price                 1179955
bid_price                       55
bid_size                         0
ask_price                       55
ask_size                         0
wap                             55
target                          31
time_id                          0
row_id                           0
dtype: int64

In [5]:
# Summary statistics of the numeric columns.
train_df.describe()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id
count,2163205.0,2163205.0,2163205.0,2163150.0,2163205.0,2163150.0,2163150.0,965627.0,983250.0,2163150.0,2163205.0,2163150.0,2163205.0,2163150.0,2163174.0,2163205.0
mean,98.99331,100.5831,270.0,5865240.0,-0.02106689,1.000067,47085690.0,1.004494,0.999789,0.9997929,48734.93,1.000346,51174.64,1.000066,-0.07369144,5559.072
std,57.9962,57.9376,158.7451,20468400.0,0.8671006,0.002587448,145974000.0,1.123478,0.011679,0.002560397,119205.8,0.002570721,134944.8,0.002564237,9.778475,3186.608
min,0.0,0.0,0.0,0.0,-1.0,0.935285,18733.2,7.7e-05,0.879117,0.934915,0.0,0.939827,0.0,0.938008,-319.7599,0.0
25%,48.0,51.0,130.0,5043.557,-1.0,0.998819,5553255.0,0.996681,0.997339,0.998579,6790.52,0.999094,7249.78,0.998835,-4.659891,2809.0
50%,98.0,101.0,270.0,1041975.0,0.0,0.999995,13126560.0,0.999901,0.999909,0.999754,20993.18,1.000233,22032.5,1.0,-0.07987022,5572.0
75%,149.0,151.0,410.0,4115766.0,1.0,1.001208,32703750.0,1.003118,1.002491,1.000931,53345.4,1.001456,55681.0,1.001188,4.470348,8318.0
max,199.0,200.0,540.0,1192694000.0,1.0,1.053535,7713682000.0,437.9531,1.148104,1.050635,30287840.0,1.057996,46626580.0,1.053216,446.0704,11054.0


In [6]:
def plot_hist(df):
    """Draw one line plot per feature column against ``time_id``, coloured by stock.

    Despite the name, this draws seaborn line plots, not histograms.  Every
    column of ``df`` except the identifier/target columns listed below gets
    its own subplot; the number of subplots follows the number of such
    columns instead of being fixed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'time_id', 'stock_id' and the excluded columns below.
    """
    excluded = ['stock_id', 'date_id', 'seconds_in_bucket', 'target', 'row_id',
                'imbalance_buy_sell_flag', 'time_id']
    cols = list(df.columns.drop(excluded))
    if not cols:
        return  # nothing to plot; plt.subplots(0, 1) would raise
    # 6 inches of height per panel reproduces the original 35x60 figure for 10 columns.
    # squeeze=False keeps `axes` a 2-D array even when there is a single column.
    fig, axes = plt.subplots(len(cols), 1, sharex=True,
                             figsize=(35, 6 * len(cols)), squeeze=False)
    for col, ax in zip(cols, axes.ravel()):
        sns.lineplot(data=df, x='time_id', y=col, hue='stock_id', ax=ax)
        ax.legend(loc='best')
# The plot is expensive; it is only drawn when CFG.DEBUG is False.
if not CFG.DEBUG:
    plot_hist(train_df)

In [7]:
def plot_target(df):
    """Plot the distribution of the 'target' column as a histogram with a KDE overlay."""
    fig, ax = plt.subplots(figsize=(35, 10))
    sns.histplot(data=df, x='target', kde=True, ax=ax)
# Only drawn when CFG.DEBUG is False.
if not CFG.DEBUG:
    plot_target(train_df)

In [8]:
def plot_auto_corr(df, stock_id=1, lags=50, max_rows=500000):
    """Plot the autocorrelation function of each feature column for one stock.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'stock_id' and the excluded columns listed below.
    stock_id : int, default=1
        Stock whose rows are analysed.
    lags : int, default=50
        Number of lags passed to ``plot_acf``.
    max_rows : int, default=500000
        Only the first ``max_rows`` rows of the selected stock are used.
    """
    excluded = ['stock_id', 'date_id', 'seconds_in_bucket', 'target', 'row_id',
                'imbalance_buy_sell_flag', 'time_id']
    cols = list(df.columns.drop(excluded))
    if not cols:
        return  # nothing to plot; plt.subplots(0, 1) would raise
    # The stock filter does not depend on the column, so compute it once.
    stock = df[df['stock_id'] == stock_id]
    # One panel per column; squeeze=False keeps `axes` 2-D for a single column.
    fig, axes = plt.subplots(len(cols), 1, sharex=True,
                             figsize=(35, 6 * len(cols)), squeeze=False)
    for col, ax in zip(cols, axes.ravel()):
        plot_acf(stock[col].iloc[:max_rows], lags=lags, ax=ax)
        ax.set_title(str(col))
# Only drawn when CFG.DEBUG is False.
if not CFG.DEBUG:
    plot_auto_corr(train_df)

pre-process

In [9]:
# Display the training frame as loaded, before preprocess() adds lag features and scaling.
train_df


Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,1.338028e+07,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1.642214e+06,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1.819368e+06,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,1.838975e+07,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,1.786061e+07,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163200,195,200,540,1640897.40,-1,1.001304,4.785947e+07,0.999900,1.000201,1.001204,4591.26,1.001304,9982.00,1.001235,7.580519,11054,200_540_195
2163201,196,200,540,98538.52,-1,1.001487,8.296411e+06,1.000155,1.000688,1.001487,7516.00,1.001754,38492.16,1.001531,11.090040,11054,200_540_196
2163202,197,200,540,0.00,0,1.000842,2.075511e+07,1.000842,1.000842,1.000842,382.76,1.001260,7658.40,1.000861,-0.360012,11054,200_540_197
2163203,198,200,540,10265079.13,-1,1.000602,1.600525e+08,0.999414,0.999414,1.000364,20549.68,1.000602,959240.88,1.000369,5.580187,11054,200_540_198


In [10]:
def preprocess(df, mode='train'):
    """Add lag features, fill missing values with 0 and min-max scale the features.

    The input frame is left untouched; a new frame is returned.

    Steps
    -----
    1. Drop 'imbalance_buy_sell_flag', 'far_price' and 'near_price'.
    2. Add 'log_return' = log(wap).  NOTE: this is the log of the WAP level,
       not a difference of logs.
    3. Add lags 1..9 of imbalance_size, reference_price, matched_size,
       bid_price, ask_price and wap, plus lag 1 of bid_size and ask_size.
       Lags are taken per 'stock_id' in row order; the grouping does not
       include 'date_id', so a lag can reach into the previous day.
    4. Replace every NaN (in any column, including 'target') with 0.
    5. Scale every column except stock_id, time_id, date_id, row_id and
       target with ``CFG.SCALER``: fitted in 'train' mode, reused in 'test' mode.
       NOTE(review): in 'train' mode the scaler is fitted on all rows passed
       in, i.e. before any cross-validation split is made.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame with the columns used above.
    mode : {'train', 'test'}, default='train'
        'train' fits ``CFG.SCALER``; 'test' applies the already fitted scaler.

    Returns
    -------
    pandas.DataFrame
        The processed frame.

    Raises
    ------
    ValueError
        If ``mode`` is neither 'train' nor 'test' (previously the unscaled
        frame was returned after printing a message).
    """
    if mode not in ('train', 'test'):
        raise ValueError(f"Unknown mode {mode!r}; expected 'train' or 'test'.")

    print(df.columns)
    # drop() returns a new frame, so nothing below touches the caller's object.
    df = df.drop(['imbalance_buy_sell_flag', 'far_price', 'near_price'], axis=1)
    df['log_return'] = np.log(df['wap'])

    # Build all lag columns first and attach them in one concat; the insertion
    # order of the dict fixes the column order (and therefore the feature order).
    by_stock = df.groupby('stock_id')
    lagged = {}
    for i in range(1, 10):
        for col in ('imbalance_size', 'reference_price', 'matched_size',
                    'bid_price', 'ask_price'):
            lagged[f'{col}_lag_{i}'] = by_stock[col].shift(i)
        lagged[f'wap_{i}'] = by_stock['wap'].shift(i)
    for col in ('bid_size', 'ask_size'):
        lagged[f'{col}_lag_1'] = by_stock[col].shift(1)
    df = pd.concat([df, pd.DataFrame(lagged)], axis=1)

    df = df.fillna(0)

    # Standardize
    cols_to_norm = df.columns.drop(['stock_id', 'time_id', 'date_id', 'row_id', 'target'])
    if mode == 'train':
        print(df.columns)
        df[cols_to_norm] = CFG.SCALER.fit_transform(df[cols_to_norm])
    else:
        df[cols_to_norm] = CFG.SCALER.transform(df[cols_to_norm])
    return df

train_df = preprocess(train_df)

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')
Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'reference_price', 'matched_size', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id', 'log_return',
       'imbalance_size_lag_1', 'reference_price_lag_1', 'matched_size_lag_1',
       'bid_price_lag_1', 'ask_price_lag_1', 'wap_1', 'imbalance_size_lag_2',
       'reference_price_lag_2', 'matched_size_lag_2', 'bid_price_lag_2',
       'ask_price_lag_2', 'wap_2', 'imbalance_size_lag_3',
       'reference_price_lag_3', 'matched_size_lag_3', 'bid_price_lag_3',
       'ask_price_lag_3', 'wap_3', 'imbalance_size_lag_4',
       'reference_price_lag_4', 'matched_size_lag_4', 'bid_price_lag_4

In [11]:
# Display the frame returned by preprocess(): lag features added, feature columns scaled.
train_df

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,reference_price,matched_size,bid_price,bid_size,ask_price,ask_size,...,ask_price_lag_8,wap_8,imbalance_size_lag_9,reference_price_lag_9,matched_size_lag_9,bid_price_lag_9,ask_price_lag_9,wap_9,bid_size_lag_1,ask_size_lag_1
0,0,0,0.0,0.002667,0.949007,0.001735,0.951626,0.002003,0.945208,0.000182,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,1,0,0.0,0.000140,0.949087,0.000213,0.951706,0.000107,0.945807,0.000442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,2,0,0.0,0.000254,0.948769,0.000236,0.951237,0.001253,0.945465,0.000407,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,3,0,0.0,0.009992,0.949348,0.002384,0.951804,0.000077,0.945385,0.010274,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,4,0,0.0,0.000375,0.948741,0.002315,0.951229,0.000544,0.945198,0.000009,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2163200,195,200,1.0,0.001376,0.950423,0.006204,0.952951,0.000152,0.946416,0.000214,...,0.946795,0.950958,0.002642,0.951185,0.006012,0.953811,0.947364,0.951600,0.004400,0.000617
2163201,196,200,1.0,0.000083,0.950597,0.001076,0.953221,0.000248,0.946841,0.000826,...,0.946337,0.950385,0.000167,0.949838,0.001060,0.952460,0.946085,0.950296,0.000254,0.000943
2163202,197,200,1.0,0.000000,0.949985,0.002691,0.952607,0.000013,0.946374,0.000164,...,0.945979,0.950034,0.000000,0.950083,0.002677,0.952308,0.946177,0.950240,0.000009,0.000084
2163203,198,200,1.0,0.008607,0.949757,0.020749,0.952152,0.000678,0.945752,0.020573,...,0.945976,0.950137,0.008527,0.949982,0.020616,0.952604,0.946201,0.950316,0.006001,0.010953


cross-validation split

In [12]:

class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Group-wise time-series cross-validator with a purge gap before validation.

    Samples are grouped by the ``groups`` labels given to :meth:`split`;
    groups are ordered by first appearance in ``groups``.  With
    ``group_val_size = min(n_groups // (n_splits + 1), max_val_group_size)``,
    the last ``n_splits * group_val_size`` groups are cut into ``n_splits``
    consecutive validation windows.  For each window the training set consists
    of the groups that end ``val_group_gap`` groups before the window starts,
    limited to the most recent ``max_train_group_size`` groups.  Shuffling is
    disabled.

    Parameters
    ----------
    n_splits : int, default=5
        Number of train/validation splits to generate.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    max_val_group_size : int, default=Inf
        Maximum number of groups in a single validation set.
    max_test_group_size : int, default=Inf
        Stored on the instance; not used by :meth:`split`.
    val_group_gap : int, default=None
        Number of groups left out between the end of the training set and the
        start of the validation set.  ``None`` is treated as 0.
    test_group_gap : int, default=None
        Stored on the instance; not used by :meth:`split`.
    verbose : bool, default=False
        Stored on the instance; currently has no effect.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_val_group_size=np.inf,
                 max_test_group_size=np.inf,
                 val_group_gap=None,
                 test_group_gap=None,
                 verbose=False
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.max_val_group_size = max_val_group_size
        self.max_test_group_size = max_test_group_size
        self.val_group_gap = val_group_gap
        self.test_group_gap = test_group_gap
        self.verbose = verbose

    def split(self, X, y=None, groups=None):
        """Generate train/validation index lists; see :meth:`split_standard`."""
        return self.split_standard(X, y, groups)

    @staticmethod
    def _gather(index_chunks):
        """Merge per-group index arrays into one sorted integer array."""
        if not index_chunks:
            return np.empty(0, dtype=np.intp)
        return np.sort(np.concatenate(index_chunks))

    def split_standard(self, X, y=None, groups=None):
        """Generate indices to split data into training and validation set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/validation set.  Read positionally (a pandas index on
            ``groups`` is ignored).

        Yields
        ------
        train : list of int
            The training set indices for that split, sorted ascending.
        val : list of int
            The validation set indices for that split, sorted ascending.

        Raises
        ------
        ValueError
            If ``groups`` is None, or if there are fewer groups than
            ``n_splits + 1``.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        # None means "no gap"; without this the arithmetic below raises TypeError.
        group_gap = 0 if self.val_group_gap is None else self.val_group_gap
        max_val_group_size = self.max_val_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # Positional view of the labels, so a pandas Series with a
        # non-default index is not looked up by label.
        labels = np.asarray(groups)
        unique_labels, first_seen, inverse = np.unique(
            labels, return_index=True, return_inverse=True)
        n_groups = len(unique_labels)
        if n_folds > n_groups:
            raise ValueError(
                ("Cannot have number of folds={0} greater than"
                 " the number of groups={1}").format(n_folds,
                                                     n_groups))

        # rank_of_label[k] = position of unique_labels[k] in first-appearance order.
        appearance_order = np.argsort(first_seen)
        rank_of_label = np.empty(n_groups, dtype=np.intp)
        rank_of_label[appearance_order] = np.arange(n_groups)
        sample_rank = rank_of_label[inverse.reshape(-1)]
        # A stable sort keeps sample indices ascending inside every group.
        samples_by_group = np.argsort(sample_rank, kind='stable')
        boundaries = np.cumsum(np.bincount(sample_rank, minlength=n_groups))[:-1]
        group_indices = np.split(samples_by_group, boundaries)

        group_val_size = min(n_groups // n_folds, max_val_group_size)
        group_val_starts = range(n_groups - n_splits * group_val_size,
                                 n_groups, group_val_size)
        for group_val_start in group_val_starts:
            # Clamp at 0: a negative stop would be read as "count from the end"
            # by the slice below and pull later groups into the training set.
            train_stop = max(0, group_val_start - group_gap)
            train_start = max(0, train_stop - max_train_group_size)
            train_array = self._gather(group_indices[train_start:train_stop])

            val_array = self._gather(
                group_indices[group_val_start:group_val_start + group_val_size])

            # NOTE(review): this removes the first `group_gap` *samples* (not
            # groups) of the validation set, in addition to the group gap
            # already applied on the training side.  Kept as in the original
            # implementation; confirm that it is intended.
            val_array = val_array[group_gap:]

            yield train_array.tolist(), val_array.tolist()

model

In [13]:

def global_average_pooling(x):
    """Return the mean of ``x`` over its last dimension."""
    return x.mean(dim=(-1))


class CNN(nn.Module):
    """1-D convolutional network mapping a per-asset feature matrix to one value per asset.

    The input is treated as ``(batch, n_assets, n_features)``: the asset axis
    is the channel axis of the ``BatchNorm1d``/``Conv1d`` layers and the
    convolutions slide along the feature axis.  The seven kernel-size-3
    convolutions and two stride-2 average pools require the feature axis to
    have at least 40 entries.

    Parameters
    ----------
    n_assets : int, default=200
        Number of input channels and of output values.  The default matches
        the layer sizes the network was originally written with, so
        checkpoints saved from ``CNN()`` still load.
    """

    def __init__(self, n_assets=200):
        super(CNN, self).__init__()
        # Layers are created in the same order as before so that parameter
        # initialisation consumes the random generator identically.
        self.batchnorm1d = nn.BatchNorm1d(n_assets)
        self.conv1d1 = nn.Conv1d(n_assets, 8, 3)
        self.conv1d2 = nn.Conv1d(8, 8, 3)
        self.conv1d3 = nn.Conv1d(8, 16, 3)
        self.conv1d4 = nn.Conv1d(16, 16, 3)
        self.conv1d5 = nn.Conv1d(16, 32, 3)
        self.conv1d6 = nn.Conv1d(32, 32, 3)
        self.conv1d7 = nn.Conv1d(32, 32, 3)

        self.pool1d1 = nn.AvgPool1d(2)
        self.pool1d2 = nn.AvgPool1d(2)

        self.linear1 = nn.Linear(32, 32)
        self.linear2 = nn.Linear(32, n_assets)

    def forward(self, inputs):
        """Return a ``(batch, n_assets)`` tensor for ``(batch, n_assets, n_features)`` inputs."""
        x = self.batchnorm1d(inputs)
        # Note: there is no non-linearity between the convolutions.
        x = self.conv1d1(x)
        x = self.conv1d2(x)
        x = self.pool1d1(x)

        x = self.conv1d3(x)
        x = self.conv1d4(x)
        x = self.pool1d2(x)

        x = self.conv1d5(x)
        x = self.conv1d6(x)
        x = self.conv1d7(x)

        # Collapse the remaining feature axis: (batch, 32, L) -> (batch, 32).
        out = global_average_pooling(x)

        out = self.linear1(out)
        out = torch.relu(out)

        output = self.linear2(out)

        return output

In [14]:
def pad_df(df, n_assets=200):
    """Return ``df`` with an all-zero row for every missing id in ``range(n_assets)``.

    The frame is expected to be indexed by asset id.  Ids already present keep
    their rows; the result is sorted by index.  The input frame is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame indexed by asset id.
    n_assets : int, default=200
        Ids ``0 .. n_assets - 1`` are guaranteed to be present in the result.

    Returns
    -------
    pandas.DataFrame
        Padded frame, sorted by index.
    """
    missing_ids = pd.RangeIndex(n_assets).difference(df.index)
    if len(missing_ids) == 0:
        return df.sort_index()
    # One block of zero rows instead of enlarging the frame row by row.
    padding = pd.DataFrame(0, index=missing_ids.rename(df.index.name),
                           columns=df.columns)
    return pd.concat([df, padding]).sort_index()


def get_dataset(df, n_assets=200):
    """Turn a long frame into a ``TensorDataset`` with one sample per ``time_id``.

    For every ``time_id`` the rows are indexed by 'stock_id', padded with zero
    rows for absent stocks (their target is 0 as well) and sorted by stock id,
    so position ``k`` on the asset axis always refers to stock ``k``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'time_id', 'stock_id', 'date_id', 'row_id' and 'target';
        every other column is used as a feature.
    n_assets : int, default=200
        Number of assets per sample after padding.

    Returns
    -------
    torch.utils.data.TensorDataset
        Features of shape ``(n_time_ids, n_assets, n_features)`` and targets
        of shape ``(n_time_ids, n_assets)``, both float32.
    """
    X = []
    y = []
    for _, group in df.groupby('time_id'):
        # Always pad/sort, so groups that are complete but unsorted are
        # aligned to stock order too.
        group = pad_df(group.set_index('stock_id'), n_assets)
        feature_frame = group.drop(['time_id', 'date_id', 'row_id', 'target'], axis=1)
        X.append(torch.tensor(feature_frame.values, dtype=torch.float32))
        y.append(group['target'].values)
    X_ = torch.stack(X)
    y_ = torch.tensor(np.array(y), dtype=torch.float32)
    return TensorDataset(X_, y_)

In [15]:
class AverageMeter(object):
    """Tracks the latest value and the running weighted average of a series."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the weighted sum, the weight total and the average."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` with weight ``n`` and refresh the running average."""
        self.val = val
        weighted = val * n
        self.sum += weighted
        self.count += n
        self.avg = self.sum / self.count
        
def get_score(y_val, y_pred):
    """Return the mean, over the first axis, of the MAE between matching rows.

    Row ``i`` of ``y_val`` is compared with row ``i`` of ``y_pred`` using
    ``mean_absolute_error``; the per-row errors are then averaged.
    """
    per_row_mae = [
        mean_absolute_error(y_val[row], y_pred[row])
        for row in range(len(y_val))
    ]
    return np.mean(per_row_mae)
        
def train_fn(fold, train_dataloader, model, loss_fn, optimizer, epoch, device):
    """Run one optimisation pass over ``train_dataloader``.

    ``fold`` and ``epoch`` are accepted but not used.  Returns the
    batch-size-weighted average of the per-batch loss values.
    """
    model.train()
    running_loss = AverageMeter()
    for batch_inputs, batch_labels in train_dataloader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        batch_loss = loss_fn(model(batch_inputs), batch_labels)
        running_loss.update(batch_loss.item(), batch_labels.size(0))
        batch_loss.backward()
        optimizer.step()
    return running_loss.avg

def valid_fn(valid_dataloader, model, loss_fn, device):
    """Evaluate ``model`` on ``valid_dataloader`` without tracking gradients.

    Returns a tuple of the batch-size-weighted average loss and a NumPy array
    with the model outputs of all batches concatenated along the first axis.
    """
    model.eval()
    running_loss = AverageMeter()
    collected = []
    for features, targets in valid_dataloader:
        features = features.to(device)
        targets = targets.to(device)
        with torch.no_grad():
            batch_out = model(features)
            batch_loss = loss_fn(batch_out, targets)
        running_loss.update(batch_loss.item(), targets.size(0))
        collected.append(batch_out.to('cpu').numpy())
    return running_loss.avg, np.concatenate(collected)


def train_loop(train, val, fold):
    """Train a fresh ``CNN`` on one fold and return its best validation MAE.

    After every epoch the validation MAE (``get_score``) is computed.  When it
    improves (compared after rounding to 4 decimals) the model weights are
    saved to ``model_fold_{fold+1}`` in the working directory.  Training stops
    after ``patience`` epochs without improvement or after ``CFG.N_EPOCHS``.

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames accepted by ``get_dataset``.
    fold : int
        Zero-based fold number (used for logging and the checkpoint name).

    Returns
    -------
    float
        Best validation MAE seen.  It is returned on every exit path;
        previously ``None`` was returned when all epochs ran without
        triggering early stopping.
    """
    print(f'----------------- Fold: {fold+1} -----------------') 

    train_ds = get_dataset(train)
    val_ds = get_dataset(val)
    _, y_val = val_ds[:]

    # No shuffling: batches follow the order of the samples in the datasets.
    train_dataloader = DataLoader(
            train_ds,  
            batch_size = CFG.BATCH_SIZE 
    )

    val_dataloader = DataLoader(
            val_ds,  
            batch_size = CFG.BATCH_SIZE
    )

    model = CNN().to(device)
    optimizer_parameters = [
            {'params': list(model.parameters()),
             'lr': CFG.LEARNING_RATE, 'weight_decay': CFG.WEIGHT_DECAY},
        ]

    optimizer = torch.optim.AdamW(optimizer_parameters)
    # Optimised on MSE, while model selection below uses MAE.
    loss_fn = nn.MSELoss()
    min_score = np.inf
    wait = 0        # epochs since the last improvement
    patience = 10

    for epoch in range(CFG.N_EPOCHS):
        avg_loss = train_fn(fold, train_dataloader, model, loss_fn, optimizer, epoch, device)

        avg_val_loss, y_pred = valid_fn(val_dataloader, model, loss_fn, device)

        score = get_score(y_val, y_pred)

        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f} avg_val_loss: {avg_val_loss:.4f} MAE: {score:.4f}')

        wait += 1
        if round(score, 4) < round(min_score, 4):
            min_score = score
            wait = 0
            torch.save(model.state_dict(), f'model_fold_{fold+1}')
            print(f'﹂ saving model with score: {min_score:.4f}')
        if wait >= patience:
            print(f'Triggering Early Stopping on epoch {epoch+1}')
            break

    # Runs on both exit paths (early stop and epoch budget exhausted).
    gc.collect()
    return min_score

In [16]:
def train(df):
    """Cross-validate the model on ``df`` and print the mean of the fold scores.

    Folds come from ``PurgedGroupTimeSeriesSplit`` grouped by 'time_id'; each
    fold is trained with ``train_loop`` and its returned score is stored.
    """
    splitter = PurgedGroupTimeSeriesSplit(
        n_splits=CFG.N_FOLDS,
        max_train_group_size=10000,
        max_val_group_size=200,
        val_group_gap=10,
    )
    fold_scores = np.empty([CFG.N_FOLDS])
    time_groups = df['time_id']
    fold_indices = splitter.split(df, df[CFG.TARGET_COLS], groups=time_groups)
    for fold, (train_index, val_index) in enumerate(fold_indices):
        train_part = df.iloc[train_index].reset_index(drop=True)
        val_part = df.iloc[val_index].reset_index(drop=True)
        fold_scores[fold] = train_loop(train_part, val_part, fold)
    print(f'Average MAE across folds: {np.mean(fold_scores)}')

train(train_df)

----------------- Fold: 1 -----------------
Epoch 1 - avg_train_loss: 89.2945 avg_val_loss: 93.1047 MAE: 6.8994
﹂ saving model with score: 6.8994
Epoch 2 - avg_train_loss: 89.2852 avg_val_loss: 93.0827 MAE: 6.8983
﹂ saving model with score: 6.8983
Epoch 3 - avg_train_loss: 89.2757 avg_val_loss: 93.0598 MAE: 6.8972
﹂ saving model with score: 6.8972
Epoch 4 - avg_train_loss: 89.2654 avg_val_loss: 93.0403 MAE: 6.8962
﹂ saving model with score: 6.8962
Epoch 5 - avg_train_loss: 89.2554 avg_val_loss: 93.0233 MAE: 6.8953
﹂ saving model with score: 6.8953
Epoch 6 - avg_train_loss: 89.2464 avg_val_loss: 93.0088 MAE: 6.8945
﹂ saving model with score: 6.8945
Epoch 7 - avg_train_loss: 89.2384 avg_val_loss: 92.9966 MAE: 6.8938
﹂ saving model with score: 6.8938
Epoch 8 - avg_train_loss: 89.2316 avg_val_loss: 92.9868 MAE: 6.8932
﹂ saving model with score: 6.8932
Epoch 9 - avg_train_loss: 89.2256 avg_val_loss: 92.9787 MAE: 6.8927
﹂ saving model with score: 6.8927
Epoch 10 - avg_train_loss: 89.2206 avg

In [17]:
def get_models(model_dir='.'):
    """Load the per-fold checkpoints written by ``train_loop``.

    Parameters
    ----------
    model_dir : str, default='.'
        Directory containing ``model_fold_1`` .. ``model_fold_{CFG.N_FOLDS}``.
        The default is the working directory, which is where ``train_loop``
        saves them (previously a machine-specific absolute path was used).

    Returns
    -------
    list of CNN
        Models in evaluation mode, placed on ``device``.
    """
    # Local import: the notebook's import cell does not import ``os``.
    import os

    models = []
    for i in range(CFG.N_FOLDS):
        checkpoint_path = os.path.join(model_dir, f'model_fold_{i+1}')
        model = CNN().to(device)
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        # map_location lets weights saved on a GPU load on a CPU-only machine.
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        model.eval()
        models.append(model)
    return models

def inference(models, y_test):
    """Average the models' outputs for the first sample of a batch.

    Parameters
    ----------
    models : sequence of callables
        Each is called as ``model(y_test)`` and must return a tensor.
    y_test : torch.Tensor
        Despite the name, this is the *input* batch handed to the models.

    Returns
    -------
    numpy.ndarray
        Element-wise mean over the models of ``model(y_test)[0]``.
    """
    preds = []
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        for i, model in enumerate(models):
            print(f'Inferencing model {i+1}')
            pred = model(y_test)[0]
            # .cpu() so that outputs living on a GPU can be converted to NumPy.
            preds.append(pred.detach().cpu().numpy())
    return np.mean(preds, axis=0)

In [18]:
df_test = pd.read_csv('trading_the_close_data/example_test_files/test.csv')
df_test_targets = pd.read_csv('trading_the_close_data/example_test_files/revealed_targets.csv')[['stock_id', 'date_id', 'seconds_in_bucket', 'revealed_target']]

# Attach the revealed target to each test row, matching on stock_id, date_id and seconds_in_bucket.
df_test = df_test.merge(df_test_targets, on=['stock_id', 'date_id', 'seconds_in_bucket'], how='inner')
# Keep the ground truth separately and expose it under the training column name 'target'.
y_test = df_test['revealed_target']
df_test = (
    df_test
    .drop(columns=['revealed_target', 'currently_scored'])
    .assign(target=y_test)
)

print(len(df_test))

cols_during_training = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id']

# The test frame must have exactly the columns the training frame had.
assert not [c for c in df_test.columns if c not in cols_during_training]
assert not [c for c in cols_during_training if c not in df_test.columns]

# Put the columns in the same order as during training; .copy() gives
# preprocess() an independent frame rather than a column selection.
df_test = df_test[cols_during_training].copy()

new_df_processed = preprocess(df_test, mode='test')
new_dataset = get_dataset(new_df_processed)


33000
Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')


In [19]:
def predict_with_models(models, dataset):
    """Return the element-wise mean of the models' predictions on ``dataset``.

    Parameters
    ----------
    models : iterable of torch.nn.Module
        Models to evaluate; each one is run on the device its parameters
        live on.
    dataset : torch.utils.data.Dataset
        Yields ``(inputs, labels)`` pairs; the labels are ignored.

    Returns
    -------
    numpy.ndarray
        Average over the models of the concatenated model outputs.
    """
    # A single batch covering the dataset that was passed in (previously the
    # size was taken from the global `new_dataset`).
    dataloader = DataLoader(dataset, batch_size=len(dataset), shuffle=False)
    all_predictions = []
    for model in models:
        # Send the inputs to wherever this model's weights are, so models on
        # the CPU and on a GPU both work.
        first_param = next(model.parameters(), None)
        model_predictions = []
        for inputs, _ in dataloader:  # labels are not needed for prediction
            if first_param is not None:
                inputs = inputs.to(first_param.device)
            with torch.no_grad():
                outputs = model(inputs)
            model_predictions.append(outputs.cpu().numpy())
        all_predictions.append(np.concatenate(model_predictions, axis=0))
    # Average predictions across models
    avg_predictions = np.mean(all_predictions, axis=0)
    return avg_predictions

# Load the per-fold checkpoints and average their predictions on the test dataset.
models = get_models()
preds = predict_with_models(models, new_dataset)



In [25]:
# Flatten the prediction array to 1-D so it can be compared element by element
# with y_test.
# NOTE(review): this assumes the flattened order (one row of asset outputs per
# dataset sample) matches the row order of y_test and that both have the same
# length - confirm, especially if any time_id had to be padded.
preds = preds.flatten()
print(preds[0])
print("MAE", abs(preds - y_test).mean())


# Fraction of predictions whose sign matches the sign of the true target.
same_sign = np.sign(preds) == np.sign(y_test)
print("Sign Accuracy", same_sign.sum() / len(preds))

-0.0028205246
MAE 5.404275962407698
Sign Accuracy 0.48984848484848487


In [21]:
# Put the flattened predictions next to the true targets and write them to
# 'preds.csv' (in the working directory) without the index column.
df = pd.DataFrame({'preds': preds, 'y_test': y_test})
df.to_csv('preds.csv', index=False)