In [1]:
from dataclasses import dataclass
from typing import Dict, List
import copy
from tqdm.auto import tqdm

import polars as pl
import numpy as np
from numba import njit, prange
from scipy.stats import spearmanr

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.base import clone

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

from torch.profiler import profile, ProfilerActivity, record_function

from CONFIG import CONFIG
from PREPROCESSOR import PREPROCESSOR
from FEATURE_ENGINEERING import FEATURE_ENGINEERING

In [2]:
train_x = pl.scan_csv(CONFIG.TRAIN_X_PATH).filter(pl.col("date_id") <= CONFIG.MAX_TRAIN_DATE)

train_x = PREPROCESSOR(df=train_x)
train_x.clean()
train_x = train_x.transform().lazy()


train_x = FEATURE_ENGINEERING(df=train_x)
train_x = train_x.create_all_features().collect()

  for col in all.columns


In [3]:
def rank_correlation_sharpe(targets, predictions) -> float:
    """
    Calculates the rank correlation between predictions and target values,
    and returns its Sharpe ratio (mean / standard deviation).

    :param merged_df: DataFrame containing prediction columns (starting with 'prediction_')
                      and target columns (starting with 'target_')
    :return: Sharpe ratio of the rank correlation
    :raises ZeroDivisionError: If the standard deviation is zero
    """
    correlations = []

    for pred_row, target_row in zip(predictions, targets):
        if np.std(pred_row) == 0 or np.std(target_row) == 0:
            raise ZeroDivisionError("Zero standard deviation in a row.")

        rho = spearmanr(pred_row, target_row).correlation
        correlations.append(rho)

    daily_rank_corrs = np.array(correlations)
    std_dev = daily_rank_corrs.std(ddof=0)
    if std_dev == 0:
        raise ZeroDivisionError("Denominator is zero, unable to compute Sharpe ratio.")

    sharpe_ratio = daily_rank_corrs.mean() / std_dev
    return float(sharpe_ratio)


class RankCorrelationSharpeRatioLoss(torch.nn.Module):
    """
    PyTorch Loss Function for Rank Correlation Sharpe Ratio

    Loss = -Sharpe_Ratio = -(mean_daily_rank_correlation / std_daily_rank_correlation)

    Goal: Minimize loss = Maximize Sharpe ratio of daily rank correlations
    """

    def __init__(self, regularization_eps=1e-8, ranking_temperature=1.0):
        super(RankCorrelationSharpeRatioLoss, self).__init__()
        self.regularization_eps = regularization_eps
        self.ranking_temperature = ranking_temperature

    def forward(self, predictions, targets, mask=None):
        """
        Args:
            predictions: (batch_size, n_features) - daily predictions for each security
            targets: (batch_size, n_features) - daily targets for each security
            mask: (batch_size, n_features) - optional mask for valid securities (1=valid, 0=invalid)
                  Use this to handle trading halts, holidays, delistings

        Returns:
            loss: scalar tensor = -sharpe_ratio (negative for minimization)
        """
        batch_size, n_features = predictions.shape

        if batch_size == 1:
            # Cannot compute Sharpe ratio with single sample
            return torch.tensor(0.0, device=predictions.device, requires_grad=True)

        # Apply mask if provided (handle missing/invalid securities)
        if mask is not None:
            predictions = predictions * mask
            targets = targets * mask

        # Compute daily rank correlations for each sample in batch
        daily_rank_correlations = []

        for i in range(batch_size):
            pred_day = predictions[i]  # (n_features,)
            target_day = targets[i]  # (n_features,)

            if mask is not None:
                # Only consider valid securities for this day
                valid_mask = mask[i] > 0
                if valid_mask.sum() < 2:  # Need at least 2 for correlation
                    continue
                pred_day = pred_day[valid_mask]
                target_day = target_day[valid_mask]

            # Check for zero variance (all predictions/targets are the same)
            if self._has_zero_variance(pred_day) or self._has_zero_variance(target_day):
                continue  # Skip this day

            # Compute differentiable rank correlation
            rank_corr = self._differentiable_rank_correlation(pred_day, target_day)
            daily_rank_correlations.append(rank_corr)

        if len(daily_rank_correlations) == 0:
            # No valid correlations computed
            return torch.tensor(0.0, device=predictions.device, requires_grad=True)

        # Stack daily correlations
        daily_correlations = torch.stack(daily_rank_correlations)

        # Compute Sharpe ratio = mean / std
        mean_corr = daily_correlations.mean()
        std_corr = daily_correlations.std() + self.regularization_eps

        sharpe_ratio = mean_corr / std_corr

        # Return negative Sharpe ratio (minimize loss = maximize Sharpe ratio)
        return -sharpe_ratio

    def _has_zero_variance(self, x):
        """Check if tensor has zero variance (all values are the same)"""
        return (x.max() - x.min()) < self.regularization_eps

    def _differentiable_rank_correlation(self, x, y):
        """
        Compute differentiable rank correlation between two 1D tensors
        Uses Pearson correlation of differentiable ranks
        """
        # Get differentiable ranks
        x_ranks = self._differentiable_ranking(x)
        y_ranks = self._differentiable_ranking(y)

        # Compute Pearson correlation of ranks
        return self._pearson_correlation(x_ranks, y_ranks)

    def _differentiable_ranking(self, x):
        """
        Convert values to differentiable ranks using sigmoid approximation
        """
        n = x.shape[0]

        # Create pairwise comparison matrix
        x_expanded = x.unsqueeze(1)  # (n, 1)
        x_transposed = x.unsqueeze(0)  # (1, n)

        # Pairwise differences
        pairwise_diff = x_expanded - x_transposed  # (n, n)

        # Sigmoid approximation of step function
        sigmoid_comparisons = torch.sigmoid(pairwise_diff / self.ranking_temperature)

        # Sum to get ranks (how many elements this element is greater than)
        ranks = sigmoid_comparisons.sum(dim=1)

        return ranks

    def _pearson_correlation(self, x, y):
        """Compute Pearson correlation coefficient"""
        # Center the data
        x_centered = x - x.mean()
        y_centered = y - y.mean()

        # Compute correlation
        numerator = (x_centered * y_centered).sum()

        # Compute standard deviations with regularization
        x_std = torch.sqrt((x_centered**2).sum() + self.regularization_eps)
        y_std = torch.sqrt((y_centered**2).sum() + self.regularization_eps)

        correlation = numerator / (x_std * y_std)

        # Clamp to valid range
        return torch.clamp(correlation, -1.0, 1.0)

In [4]:
def flatten_collate_fn(batch: list) -> dict[str, torch.Tensor]:
    """
    Collate function for DataLoader to flatten the batch.

    Args:
        batch (list): List of tuples containing tensors.

        tuple[torch.Tensor]: Flattened tensors (type, instr, X, y).
    """
    continuous_batch = torch.stack([item["continuous"] for item in batch])
    categorical_batch = torch.stack([item["categorical"] for item in batch])
    target_batch = torch.stack([item["target"] for item in batch])

    return {
        "continuous": continuous_batch,
        "categorical": categorical_batch,
        "target": target_batch,
    }


class CustomTensorDataset(Dataset):
    def __init__(self, date_id, x, type, instr, y, sequence_length=10):
        """
        Args:
            data: DataFrame or numpy array of shape (275561, 70)
                  First 68 columns: continuous features
                  Last 2 columns: categorical features (should be integers)
            target: DataFrame or numpy array of shape (1927, 424)
                    Target values for each date
            sequence_length: Number of time steps to use for prediction
        """

        self.num_features = x.shape[-1]
        # Split continuous and categorical features
        self.dates = torch.tensor(date_id, dtype=torch.int16)
        self.continuous_data = torch.tensor(x, dtype=torch.float16)
        self.continuous_data = torch.nan_to_num(self.continuous_data, 0)
        self.continuous_data = (self.continuous_data - self.continuous_data.mean(dim=(0, 1), keepdim=True)) / self.continuous_data.std(
            dim=(0, 1), keepdim=True
        )
        self.continuous_data = torch.nan_to_num(self.continuous_data, 0)
        if torch.isnan(self.continuous_data).any() or torch.isinf(self.continuous_data).any():
            print("Input contains NaN or Inf values!")
        self.type = torch.tensor(type, dtype=torch.long)
        self.instr = torch.tensor(instr, dtype=torch.long)
        self.categorical_data = torch.stack((self.type, self.instr), dim=-1)
        self.y = torch.tensor(y, dtype=torch.float16)

        self.sequence_length = sequence_length
        self.n_tickers = CONFIG.N_INSTR
        self.unique_date, self.inverse_indices, self.counts = torch.unique(self.dates, return_inverse=True, return_counts=True)

        self.n_unique_dates = self.counts.sum().item()

        self.n_categorical_features = 2

        # Reshape data to (unique_date, n_tickers, n_features)
        self.reshaped_continuous = self.continuous_data.view(self.n_unique_dates, self.n_tickers, self.num_features)
        self.reshaped_categorical = self.categorical_data.view(self.n_unique_dates, self.n_tickers, self.n_categorical_features)

    def __len__(self):
        # We can create sequences from date indices, leaving room for sequence_length
        return max(0, self.n_unique_dates - self.sequence_length)

    def __getitem__(self, idx):
        # Get sequence of dates starting from idx
        continuous_seq = self.reshaped_continuous[idx : idx + self.sequence_length]  # (seq_len, 143, 68)
        categorical_seq = self.reshaped_categorical[idx : idx + self.sequence_length]  # (seq_len, 143, 2)
        # Target is the 424-dimensional vector for the next date
        if idx + self.sequence_length < self.n_unique_dates:
            target = self.y[idx + self.sequence_length]  # (424,)
        else:
            # For the last sequence, use the last available target
            target = self.y[-1]  # (424,)

        return {
            "continuous": continuous_seq,
            "categorical": categorical_seq,
            "target": target,
        }


# --- Model Definition ---
class TIMESERIES_NN(nn.Module):
    def __init__(
        self,
        num_numerical_features: int,
        unique_cats=[
            CONFIG.N_TYPES,
            CONFIG.N_INSTR,
        ],
        embedding_dims=[
            min(50, max(CONFIG.N_TYPES // 2, 4)),
            min(50, max(CONFIG.N_INSTR // 2, 4)),
        ],
        n_tickers: int = 143,
        sequence_length=30,
        hidden_size=128,
        rnn_layers=2,
        dropout=0.2,
        output_dim=424,
    ):
        super().__init__()

        # Store these for use in forward method
        self.n_tickers = n_tickers
        self.seq_len = sequence_length
        self.hidden_size = hidden_size

        # Create embedding layers for each categorical feature using ModuleDict
        self.embeddings = nn.ModuleList(
            [nn.Embedding(vocab_size, embed_dim, padding_idx=0) for vocab_size, embed_dim in zip(unique_cats, embedding_dims)]
        )

        self.input_dim = num_numerical_features + sum(embedding_dims)
        self.input_projection = nn.Linear(self.input_dim, hidden_size)

        # GRU layers for temporal modeling
        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=rnn_layers,
            batch_first=True,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=False,
        )

        # Attention mechanism for ticker aggregation
        self.attention = nn.MultiheadAttention(embed_dim=hidden_size, num_heads=8, dropout=dropout, batch_first=True)

        # Layer normalization
        self.layer_norm1 = nn.LayerNorm(hidden_size)
        self.layer_norm2 = nn.LayerNorm(hidden_size)

        # Output projection layers
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_size, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_dim),
        )

        self._init_weights()

    def _init_weights(self):
        """Initialize model weights"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.GRU):
                for name, param in module.named_parameters():
                    if "weight_ih" in name:
                        nn.init.xavier_uniform_(param.data)
                    elif "weight_hh" in name:
                        nn.init.orthogonal_(param.data)
                    elif "bias" in name:
                        nn.init.zeros_(param.data)

    def forward(self, batch):
        # Extract continuous and categorical data
        continuous = batch["continuous"]  # (batch_size, seq_len, n_tickers, n_continuous_features)
        categorical = batch["categorical"]  # (batch_size, seq_len, n_tickers, n_categorical_features)

        batch_size, seq_len, num_tickers, input_size = continuous.shape

        # Process embeddings if categorical features are provided
        embedded_features = []

        for i in range(len(CONFIG.CAT_COLS)):
            # Extract i-th categorical feature: (batch_size, seq_len, num_tickers)
            cat_feature = categorical[:, :, :, i]
            # Apply embedding: (batch_size, seq_len, num_tickers, embed_dim)
            embedded = self.embeddings[i](cat_feature)
            embedded_features.append(embedded)

        # Concatenate all embedded features along the last dimension
        embedded_concat = torch.cat(embedded_features, dim=-1)
        # Concatenate with continuous features
        x_combined = torch.cat([continuous, embedded_concat], dim=-1)

        # Reshape to process all tickers across all time steps
        # (batch_size * seq_len, num_tickers, total_input_size)
        x_reshaped = x_combined.view(batch_size * seq_len, num_tickers, self.input_dim)

        # Project input features
        # (batch_size * seq_len, num_tickers, hidden_size)
        x_proj = self.input_projection(x_reshaped)
        x_proj = torch.relu(x_proj)

        # Apply attention across tickers for each time step
        # This aggregates information across all tickers at each time point
        attn_output, _ = self.attention(x_proj, x_proj, x_proj)
        x_proj = self.layer_norm1(x_proj + attn_output)

        # Aggregate across tickers (mean pooling)
        # (batch_size * seq_len, hidden_size)
        x_agg = torch.mean(x_proj, dim=1)

        # Reshape back to sequence format for GRU
        # (batch_size, seq_len, hidden_size)
        x_seq = x_agg.view(batch_size, seq_len, self.hidden_size)

        # Apply GRU for temporal modeling
        self.gru.flatten_parameters()
        gru_out, hidden = self.gru(x_seq)

        # Use the last output from the sequence
        # (batch_size, hidden_size)
        last_output = gru_out[:, -1, :]
        last_output = self.layer_norm2(last_output)

        # Generate final prediction
        # (batch_size, output_size)
        output = self.fc_layers(last_output)

        return output

In [5]:
class NN:
    def __init__(
        self,
        lr: float = 0.001,
        batch_size: int = 1,
        epochs: int = 100,
        early_stopping_patience: int = 10,
        early_stopping: bool = True,
        lr_patience: int = 2,
        lr_factor: float = 0.5,
        lr_refit: float = 0.001,
        random_seed: int = CONFIG.RANDOM_STATE,
        **kwargs,
    ) -> None:
        self.lr = lr
        self.batch_size = batch_size
        self.epochs = epochs
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping = early_stopping
        self.lr_patience = lr_patience
        self.lr_factor = lr_factor
        self.lr_refit = lr_refit
        self.random_seed = random_seed

        self.criterion = nn.MSELoss()

        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.model = None
        self.optimizer = None
        self.best_epoch = None
        self.features = None
        self.kwargs = kwargs

    def fit(self, train_set: tuple, val_set: tuple, verbose: bool = False) -> None:
        """Fit the model on the training set and validate on the validation set.

        Args:
            train_set (tuple): A tuple containing input data, targets for training.
            val_set (tuple): A tuple containing input data, targets for validation.
            verbose (bool, optional): If True, prints training progress. Defaults to False.
        """
        torch.manual_seed(self.random_seed)

        train_dataset = CustomTensorDataset(*train_set)
        train_dataloader = DataLoader(
            train_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=flatten_collate_fn,
            pin_memory=True,
            # num_workers=2,
            # persistent_workers=True,
            # prefetch_factor=2,
        )

        val_dataset = CustomTensorDataset(*val_set)
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=self.batch_size,
            shuffle=False,
            collate_fn=flatten_collate_fn,
            pin_memory=True,
            # num_workers=6,
            # persistent_workers=True,
            # prefetch_factor=2,
        )

        self.model = TIMESERIES_NN(num_numerical_features=train_dataset.num_features, **self.kwargs).to(self.device)

        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr, weight_decay=0.01)

        train_sharpes, val_sharpes = [], []
        if verbose:
            print(f"Device: {self.device}")
            print(f"{'Epoch':^5} | {'Train Loss':^10} | {'Val Loss':^8} | {'Train sharpe':^9} | {'Val sharpe':^7} | {'LR':^7}")
            print("-" * 60)

        min_val_sharpe = -np.inf
        best_epoch = 0
        no_improvement = 0
        best_model = None
        for epoch in range(self.epochs):
            train_loss, train_sharpe = self.train_one_epoch(train_dataloader, verbose)
            val_loss, val_sharpe = self.validate_one_epoch(val_dataloader, verbose)
            lr_last = self.optimizer.param_groups[0]["lr"]

            train_sharpes.append(train_sharpe)
            val_sharpes.append(val_sharpe)

            if verbose:
                print(f"{epoch + 1:^5} | {train_loss:^10.4f} | {val_loss:^8.4f} | {train_sharpe:^9.4f} | {val_sharpe:^7.4f} | {lr_last:^7.5f}")

            if val_sharpe > min_val_sharpe:
                min_val_sharpe = val_sharpe
                best_model = copy.deepcopy(self.model.state_dict())
                no_improvement = 0
                best_epoch = epoch
            else:
                no_improvement += 1

            if self.early_stopping:
                if no_improvement >= self.early_stopping_patience + 1:
                    self.best_epoch = best_epoch + 1
                    if verbose:
                        print(f"Early stopping on epoch {best_epoch + 1}. Best score: {min_val_sharpe:.4f}")
                    break

        # Load the best model
        if self.early_stopping:
            self.model.load_state_dict(best_model)

    def train_one_epoch(self, train_dataloader: DataLoader, verbose: bool) -> None:
        """Train the model for one epoch.

        Args:
            train_dataloader (DataLoader): DataLoader for the training set.
            verbose (bool): If True, shows progress using tqdm.

        Returns:
            tuple[float, float]: A tuple containing:
                - Train loss (float).
                - Spearman Sharpe for the training set (float).
        """
        self.model.train()
        total_loss = 0.0

        y_total, preds_total = [], []

        itr = train_dataloader

        for batch in itr:
            batch = {key: value.to(self.device) for key, value in batch.items()}
            y_batch = batch["target"]

            self.optimizer.zero_grad()
            with torch.autocast(device_type="cuda"):
                out_y = self.model(batch)
                out_y = torch.nan_to_num(out_y)
                loss = self.criterion(out_y, y_batch)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)

            self.optimizer.step()

            total_loss += loss.item()

            y_total.append(batch["target"])
            preds_total.append(out_y.detach())

            torch.cuda.empty_cache()

        y_total = torch.cat(y_total).cpu().numpy()
        preds_total = torch.cat(preds_total).cpu().numpy()

        train_sharpe = rank_correlation_sharpe(y_total, preds_total)
        train_loss = total_loss / len(train_dataloader)

        return train_loss, train_sharpe

    def validate_one_epoch(self, val_dataloader: DataLoader, verbose=False) -> None:
        """Validate the model on the validation set.

        Args:
            val_dataloader (DataLoader): DataLoader for the validation set.
            verbose (bool, optional): If True, shows progress using tqdm. Defaults to False.

        Returns:
            tuple[float, float]: A tuple containing:
                - Validation loss (float).
                - Spearman Sharpe for the validation set (float).
        """
        model = copy.deepcopy(self.model)

        losses, all_y, all_preds = [], [], []

        itr = val_dataloader
        for batch in itr:
            batch = {key: value.to(self.device) for key, value in batch.items()}
            y_batch = batch["target"]

            # Predict
            with torch.inference_mode():
                model.eval()
                preds_batch = model(batch)
                preds_batch = torch.nan_to_num(preds_batch)
                loss = self.criterion(
                    preds_batch,
                    y_batch,
                )
                losses.append(loss.item())

                all_y.append(y_batch)
                all_preds.append(preds_batch)

            # Update weights
            if self.lr_refit > 0:
                optimizer = torch.optim.AdamW(model.parameters(), lr=self.lr_refit, weight_decay=0.01)
                optimizer.zero_grad()
                model.train()
                with torch.autocast(device_type="cuda"):
                    out_y = model(batch)
                    out_y = torch.nan_to_num(out_y)
                    loss = self.criterion(out_y, y_batch)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()
            torch.cuda.empty_cache()

        all_y = torch.cat(all_y).cpu().numpy()
        all_preds = torch.cat(all_preds).cpu().numpy()
        loss = np.mean(losses)
        sharpe = rank_correlation_sharpe(all_y, all_preds)

        return loss, sharpe

    def update(
        self,
        type_data: np.array,
        instr_data: np.array,
        X: np.array,
        y: np.array,
        n_times: int,
    ):
        """Update the model with new data.

        Args:
            X (np.array): Input data.
            y (np.array): Target variable.
            n_times (int): Number of time steps.
        """
        if self.lr_refit == 0.0:
            return

        N, K, num_features = X.shape
        continuous_data = torch.tensor(X, dtype=torch.float16, device=self.device)
        continuous_data = torch.nan_to_num(continuous_data, 0)
        continuous_data = (continuous_data - continuous_data.mean(dim=(0, 1), keepdim=True)) / continuous_data.std(dim=(0, 1), keepdim=True)
        continuous_data = torch.nan_to_num(continuous_data, 0)
        type = torch.tensor(type_data, dtype=torch.long, device=self.device)
        instr = torch.tensor(instr_data, dtype=torch.long, device=self.device)
        categorical_data = torch.stack((type, instr), dim=-1)
        y = torch.tensor(y, dtype=torch.float16, device=self.device)

        # Reshape data to (unique_date, n_tickers, n_features)
        reshaped_continuous = continuous_data.view(N, CONFIG.N_INSTR, num_features)
        reshaped_categorical = categorical_data.view(N, CONFIG.N_INSTR, len(CONFIG.CAT_COLS))

        batch = {
            "continuous": reshaped_continuous.unsqueeze(0),
            "categorical": reshaped_categorical.unsqueeze(0),
        }

        self.model.train()

        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.lr_refit, weight_decay=0.01)
        optimizer.zero_grad()
        with torch.autocast(device_type="cuda"):
            out_y = self.model(batch)
            out_y = torch.nan_to_num(out_y)
            loss = self.criterion(out_y, y)
            loss = torch.nan_to_num(loss, nan=-10.0)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)
        optimizer.step()

    def predict(
        self,
        type_data: np.array,
        instr_data: np.array,
        X: np.array,
        n_times: int = None,
        hidden: torch.Tensor | list | None = None,
    ) -> tuple[np.array, torch.Tensor | list]:
        """Predict the target variable for the given input data.

        Args:
            X (np.array): Input data.
            n_times (int, optional): Number of time steps. Defaults to None.
            hidden (torch.Tensor or list or None, optional): Initial hidden state. Defaults to None.

        Returns:
            tuple[np.array, torch.Tensor or list]: A tuple containing:
                - Predictions (np.array).
                - Hidden state (torch.Tensor or list).
        """
        N, K, num_features = X.shape
        continuous_data = torch.tensor(X, dtype=torch.float16, device=self.device)
        continuous_data = torch.nan_to_num(continuous_data, 0)
        continuous_data = (continuous_data - continuous_data.mean(dim=(0, 1), keepdim=True)) / continuous_data.std(dim=(0, 1), keepdim=True)
        continuous_data = torch.nan_to_num(continuous_data, 0)
        type = torch.tensor(type_data, dtype=torch.long, device=self.device)
        instr = torch.tensor(instr_data, dtype=torch.long, device=self.device)
        categorical_data = torch.stack((type, instr), dim=-1)

        # Reshape data to (unique_date, n_tickers, n_features)
        reshaped_continuous = continuous_data.view(N, CONFIG.N_INSTR, num_features)
        reshaped_categorical = categorical_data.view(N, CONFIG.N_INSTR, len(CONFIG.CAT_COLS))

        batch = {
            "continuous": reshaped_continuous.unsqueeze(0),
            "categorical": reshaped_categorical.unsqueeze(0),
        }

        self.model.eval()
        with torch.inference_mode():
            preds = self.model(batch)
            preds = torch.nan_to_num(preds)
        return preds.cpu().numpy()

In [6]:
# --- Prepare DataLoader ---
# Create the dataset

train_x = pl.scan_csv(CONFIG.TRAIN_X_PATH).filter(pl.col("date_id") <= CONFIG.MAX_TRAIN_DATE)
train_y = pl.scan_csv(CONFIG.TRAIN_Y_PATH).filter(pl.col("date_id") <= CONFIG.MAX_TRAIN_DATE).fill_null(0).collect()

train_x = PREPROCESSOR(df=train_x)
train_x.clean()
train_x = train_x.transform().lazy()

train_x = FEATURE_ENGINEERING(df=train_x)
train_x = train_x.create_all_features().select(set(CONFIG.IMPT_COLS + CONFIG.CAT_COLS + [CONFIG.DATE_COL])).collect()

cats = train_x.select(CONFIG.CAT_COLS)
type_encoder = LabelEncoder().fit(train_x.select(CONFIG.CAT_COLS[0]).to_numpy().flatten())
instr_encoder = LabelEncoder().fit(train_x.select(CONFIG.CAT_COLS[1]).to_numpy().flatten())

NN_model = NN(
    **CONFIG.NN_PARAMS,
    batch_size=CONFIG.BATCH_SIZE,
    lr=0.0001,
    lr_refit=0.0005,
)

  for col in all.columns


In [None]:
def make_data(df: pl.DataFrame, type_encoder: LabelEncoder, instr_encoder: LabelEncoder):
    type = type_encoder.transform(df.select(CONFIG.CAT_COLS[0]).to_numpy().flatten())
    instr = instr_encoder.transform(df.select(CONFIG.CAT_COLS[1]).to_numpy().flatten())
    x = df.drop([CONFIG.DATE_COL] + CONFIG.CAT_COLS).to_numpy()

    return (
        type.reshape(-1, CONFIG.N_INSTR),
        instr.reshape(-1, CONFIG.N_INSTR),
        x.reshape(-1, CONFIG.N_INSTR, x.shape[1]),
    )


# test_size = (
#     TEST_SIZE
#     if len(dates_unique) > TEST_SIZE * (n_splits + 1)
#     else len(dates_unique) // (n_splits + 1)
# )  # For testing purposes on small samples

dates_unique = train_x.select(pl.col(CONFIG.DATE_COL).unique().sort()).to_series().to_numpy()

cv = KFold(n_splits=CONFIG.N_FOLDS, shuffle=False)
cv_split = cv.split(dates_unique)

scores = []
models = []
for fold, (train_idx, valid_idx) in enumerate(cv_split):
    if CONFIG.VERBOSE:
        print("-" * 20 + f"Fold {fold}" + "-" * 20)
        print(f"Train dates from {dates_unique[train_idx].min()} to {dates_unique[train_idx].max()}")
        print(f"Valid dates from {dates_unique[valid_idx].min()} to {dates_unique[valid_idx].max()}")

    dates_train = dates_unique[train_idx]
    dates_valid = dates_unique[valid_idx]

    df_train = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(dates_train))

    type_train, instr_train, df_train = make_data(df=df_train, type_encoder=type_encoder, instr_encoder=instr_encoder)

    df_train_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(dates_train)).drop(CONFIG.DATE_COL).to_numpy()

    df_valid = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(dates_valid))
    type_valid, instr_valid, df_valid_arr = make_data(df=df_valid, type_encoder=type_encoder, instr_encoder=instr_encoder)
    df_valid_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(dates_train)).drop(CONFIG.DATE_COL).to_numpy()

    model_fold = copy.deepcopy(NN_model)

    model_fold.fit(
        train_set=(
            dates_train,
            df_train,
            type_train,
            instr_train,
            df_train_y,
            CONFIG.SEQ_LEN,
        ),
        val_set=(
            dates_valid,
            df_valid_arr,
            type_valid,
            instr_valid,
            df_valid_y,
            CONFIG.SEQ_LEN,
        ),
        verbose=CONFIG.VERBOSE,
    )

    models.append(model_fold)

    torch.save(
        model_fold.model.state_dict(),
        f"C:/Users/Admin/Desktop/Personal-Projects/Kaggle/MITSUI&CO. Commodity Prediction Challenge/sample_{fold}.pth",
    )

    preds = []
    cnt_dates = 0
    model_save = copy.deepcopy(model_fold)

    for date_id in tqdm(dates_valid):
        df_valid_date = df_valid.filter(pl.col(CONFIG.DATE_COL).is_in(range(date_id - CONFIG.SEQ_LEN, date_id)))
        avail_dates = df_valid_date["date_id"].unique().len()
        if avail_dates < CONFIG.SEQ_LEN:
            continue

        type_valid_date, instr_valid_date, df_valid_date = make_data(df=df_valid_date, type_encoder=type_encoder, instr_encoder=instr_encoder)

        if model_fold.lr_refit and (cnt_dates > 0):
            period = range((date_id - 1 - CONFIG.BATCH_SIZE * CONFIG.SEQ_LEN), date_id - 1)
            df_upd = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(period))
            df_upd_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(date_id - 1)).drop(CONFIG.DATE_COL).to_numpy()
            type_upd, instr_upd, df_upd = make_data(
                df=df_upd,
                type_encoder=type_encoder,
                instr_encoder=instr_encoder,
            )
            if len(df_upd) > 0:
                model_save.update(type_upd, instr_upd, df_upd, df_upd_y, 1)

        preds_i = model_save.predict(type_valid_date, instr_valid_date, df_valid_date, n_times=1)
        preds += list(preds_i[-1].reshape(-1, CONFIG.NUM_TARGET_COLUMNS))
        cnt_dates += 1
    preds = np.array(preds)

    score = rank_correlation_sharpe(df_valid_y, preds)
    scores.append(score)

    print(f"Sharpe: {score:.5f}")

--------------------Fold 0--------------------
Train dates from 367 to 1826
Valid dates from 2 to 366
Device: cuda:0
Epoch | Train Loss | Val Loss | Train sharpe | Val sharpe |   LR   
------------------------------------------------------------
  1   |   0.0853   |  0.0020  |  -0.0093  | -0.1048 | 0.00010
  2   |   0.0063   |  0.0018  |  -0.0079  | -0.0306 | 0.00010
  3   |   0.0031   |  0.0017  |  0.0424   | -0.0838 | 0.00010
  4   |   0.0022   |  0.0017  |  0.0045   | -0.0139 | 0.00010
  5   |   0.0018   |  0.0017  |  0.0162   | 0.0567  | 0.00010
  6   |   0.0016   |  0.0017  |  0.0182   | -0.0426 | 0.00010
  7   |   0.0014   |  0.0017  |  -0.0178  | 0.0069  | 0.00010
  8   |   0.0013   |  0.0017  |  0.0808   | 0.0193  | 0.00010
  9   |   0.0012   |  0.0017  |  0.1019   | 0.0424  | 0.00010
 10   |   0.0012   |  0.0017  |  0.1007   | 0.0288  | 0.00010
 11   |   0.0012   |  0.0017  |  0.0680   | 0.0440  | 0.00010
 12   |   0.0011   |  0.0017  |  0.0565   | 0.0677  | 0.00010
 13   |   

  0%|          | 0/365 [00:00<?, ?it/s]

Sharpe: 0.15766
--------------------Fold 1--------------------
Train dates from 2 to 1826
Valid dates from 367 to 731
Device: cuda:0
Epoch | Train Loss | Val Loss | Train sharpe | Val sharpe |   LR   
------------------------------------------------------------
  1   |   0.0847   |  0.0010  |  -0.0238  | -0.0386 | 0.00010
  2   |   0.0061   |  0.0007  |  -0.0532  | 0.1570  | 0.00010
  3   |   0.0029   |  0.0006  |  0.0093   | 0.0251  | 0.00010
  4   |   0.0020   |  0.0006  |  0.0053   | 0.1483  | 0.00010
  5   |   0.0016   |  0.0006  |  0.0385   | 0.0731  | 0.00010
  6   |   0.0013   |  0.0006  |  0.0323   | 0.0739  | 0.00010
  7   |   0.0012   |  0.0006  |  0.0254   | 0.1049  | 0.00010
  8   |   0.0011   |  0.0006  |  0.0927   | 0.1071  | 0.00010
  9   |   0.0010   |  0.0006  |  0.0895   | 0.1092  | 0.00010
 10   |   0.0010   |  0.0006  |  0.1213   | 0.1128  | 0.00010
 11   |   0.0009   |  0.0006  |  0.0722   | 0.0738  | 0.00010
 12   |   0.0009   |  0.0006  |  0.1021   | 0.0973  | 0.

  0%|          | 0/365 [00:00<?, ?it/s]

Sharpe: 0.12193
--------------------Fold 2--------------------
Train dates from 2 to 1826
Valid dates from 732 to 1096
Device: cuda:0
Epoch | Train Loss | Val Loss | Train sharpe | Val sharpe |   LR   
------------------------------------------------------------
  1   |   0.0853   |  0.0009  |  -0.0594  | -0.0744 | 0.00010
  2   |   0.0062   |  0.0007  |  -0.0431  | 0.1221  | 0.00010
  3   |   0.0030   |  0.0006  |  -0.0142  | 0.0393  | 0.00010
  4   |   0.0021   |  0.0006  |  0.0048   | 0.1152  | 0.00010
  5   |   0.0017   |  0.0006  |  0.0220   | 0.0348  | 0.00010
  6   |   0.0015   |  0.0006  |  0.0338   | 0.0895  | 0.00010
  7   |   0.0013   |  0.0006  |  0.0328   | 0.1250  | 0.00010
  8   |   0.0012   |  0.0006  |  0.0861   | 0.1244  | 0.00010
  9   |   0.0012   |  0.0006  |  0.1041   | 0.1272  | 0.00010
 10   |   0.0011   |  0.0006  |  0.1079   | 0.1135  | 0.00010
 11   |   0.0011   |  0.0006  |  0.1182   | 0.0956  | 0.00010
 12   |   0.0010   |  0.0006  |  0.1179   | 0.1144  | 0

  0%|          | 0/365 [00:00<?, ?it/s]

Sharpe: 0.00880
--------------------Fold 3--------------------
Train dates from 2 to 1826
Valid dates from 1097 to 1461
Device: cuda:0
Epoch | Train Loss | Val Loss | Train sharpe | Val sharpe |   LR   
------------------------------------------------------------
  1   |   0.0854   |  0.0009  |  -0.0544  | -0.0560 | 0.00010
  2   |   0.0062   |  0.0007  |  -0.0284  | 0.0245  | 0.00010
  3   |   0.0031   |  0.0006  |  -0.0387  | 0.0399  | 0.00010


In [None]:
# --- Prepare DataLoader ---
# Create the dataset

train_x = pl.scan_csv(CONFIG.TRAIN_X_PATH)
train_x = PREPROCESSOR(df=train_x)
train_x.clean()
train_x = train_x.transform().lazy()

train_x = FEATURE_ENGINEERING(df=train_x)
train_x = train_x.create_all_features().collect()

train_y = pl.scan_csv(CONFIG.TRAIN_Y_PATH).fill_null(0).collect()

test_x = train_x.filter(pl.col("date_id") > CONFIG.MAX_TRAIN_DATE)

NameError: name 'pl' is not defined

In [None]:
model = NN(
    **CONFIG.NN_PARAMS,
    batch_size=CONFIG.BATCH_SIZE,
    lr=0.002,
    lr_refit=0.001,
)

model.model = TIMESERIES_NN(num_numerical_features=len(test_x.drop([CONFIG.DATE_COL] + CONFIG.CAT_COLS).columns), **CONFIG.NN_PARAMS)
model.model.load_state_dict(torch.load(f"C:/Users/Admin/Desktop/Personal-Projects/Kaggle/MITSUI&CO. Commodity Prediction Challenge/sample_{0}.pth"))

RuntimeError: Error(s) in loading state_dict for TIMESERIES_NN:
	size mismatch for feature_extractor.0.weight: copying a param with shape torch.Size([14157, 14157]) from checkpoint, the shape in current model is torch.Size([43186, 43186]).
	size mismatch for feature_extractor.0.bias: copying a param with shape torch.Size([14157]) from checkpoint, the shape in current model is torch.Size([43186]).
	size mismatch for feature_extractor.1.weight: copying a param with shape torch.Size([14157]) from checkpoint, the shape in current model is torch.Size([43186]).
	size mismatch for feature_extractor.1.bias: copying a param with shape torch.Size([14157]) from checkpoint, the shape in current model is torch.Size([43186]).
	size mismatch for feature_extractor.4.weight: copying a param with shape torch.Size([7078, 14157]) from checkpoint, the shape in current model is torch.Size([21593, 43186]).
	size mismatch for feature_extractor.4.bias: copying a param with shape torch.Size([7078]) from checkpoint, the shape in current model is torch.Size([21593]).
	size mismatch for feature_extractor.5.weight: copying a param with shape torch.Size([7078]) from checkpoint, the shape in current model is torch.Size([21593]).
	size mismatch for feature_extractor.5.bias: copying a param with shape torch.Size([7078]) from checkpoint, the shape in current model is torch.Size([21593]).
	size mismatch for feature_extractor.8.weight: copying a param with shape torch.Size([512, 7078]) from checkpoint, the shape in current model is torch.Size([512, 21593]).

In [None]:
preds = []
test_dates = test_x.select(CONFIG.DATE_COL).unique().to_series()
for date_id in tqdm(test_dates):
    predict_x = train_x.filter(pl.col(CONFIG.DATE_COL) == date_id)
    type_predict_x, instr_predict_x, df_predict_x = make_data(
        df=predict_x,
        type_encoder=type_encoder,
        instr_encoder=instr_encoder,
    )
    preds = model.predict(type_predict_x, instr_predict_x, df_predict_x, n_times=1)
    if model.lr_refit and (cnt_dates > 0):
        df_upd = train_x.filter(pl.col(CONFIG.DATE_COL).is_in(range(date_id - 1 - CONFIG.SEQ_LEN, date_id - 1)))
        df_upd_y = train_y.filter(pl.col(CONFIG.DATE_COL).is_in(range(date_id - 1 - CONFIG.SEQ_LEN, date_id - 1))).drop(CONFIG.DATE_COL).to_numpy()
        type_upd, instr_upd, df_upd = make_data(
            df=df_upd,
            type_encoder=type_encoder,
            instr_encoder=instr_encoder,
        )
        if len(df_upd) > 0:
            model_save.update(type_upd, instr_upd, df_upd, df_upd_y, 1)
            preds_i = model_save.predict(type_valid_date, instr_valid_date, df_valid_date, n_times=1)
    preds += list(preds_i[-1].reshape(-1, CONFIG.NUM_TARGET_COLUMNS))

preds = np.array(preds)

score = rank_correlation_sharpe(train_y.filter(pl.col(CONFIG.DATE_COL) > CONFIG.MAX_TRAIN_DATE).to_numpy(), preds)
scores.append(score)

print(f"Sharpe: {score:.5f}")

  0%|          | 0/90 [00:00<?, ?it/s]

shape: (90,)
Series: 'date_id' [i64]
[
	1840
	1843
	1846
	1837
	1849
	â€¦
	1914
	1905
	1834
	1828
	1831
]
