# This notebook has two main functions:

1. Pickle our pre-trained models to submit to the NumerAI Leaderboard
2. Create an ensemble of our two best models - Autoencoder & LGBM

Submitting these pickled models lets us measure their true out-of-sample performance on live data, i.e. how they would fare if used to make stock trading decisions.

## 0. Explanation of Model Submission
To participate in the tournament, you must submit live predictions every Tuesday-Saturday.

To automate this process, you can simply:
- Define your prediction pipeline as a function
- Serialize your function using the `cloudpickle` library
- Upload your model pickle file to Numerai
- Let Numerai run your model to submit live predictions every day

Read more about Model Uploads and other self-hosted automation options in our [docs](https://docs.numer.ai/numerai-tournament/submissions#automation).


In [None]:
# Install all dependencies
!pip install -r requirements.txt --quiet

## 1. Original Models


1.1 LightGBM

In [None]:
#load from model:

import lightgbm as lgb
import json
import pandas as pd
# Load the trained LightGBM booster from its saved model file.
model = lgb.Booster(model_file='LGBM_Full.txt')

# Read the feature metadata inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open("features.json") as features_file:
    feature_metadata = json.load(features_file)

# Column names of the "all" feature set; used to select the model inputs.
feature_set = feature_metadata["feature_sets"]['all']

# Define your prediction pipeline as a function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Score live features with the loaded LightGBM booster.

    Args:
        live_features: Frame containing (at least) every column listed in
            the module-level ``feature_set``.

    Returns:
        A one-column frame named ``"prediction"`` that shares the index of
        ``live_features``.
    """
    scores = model.predict(live_features[feature_set])
    return pd.DataFrame({"prediction": scores}, index=live_features.index)

# Use the cloudpickle library to serialize your function
import cloudpickle
p = cloudpickle.dumps(predict)

# The file name must match the one passed to files.download below;
# writing to a different name would make the Colab download fail.
with open("lgbm.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle simply stays on the local filesystem.
    pass
else:
    # Outside the try block so that a genuine download error is not hidden.
    files.download('lgbm.pkl')

1.2 Autoencoder

In [None]:
import torch
import torch.nn as nn
# Define the Swish activation function
class Swish(nn.Module):
    """Swish activation: every element is scaled by its own sigmoid."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x * sigmoid(x)`` element-wise."""
        gate = torch.sigmoid(x)
        return gate * x

# Define the Autoencoder
class Autoencoder(nn.Module):
    """Fully connected autoencoder with BatchNorm + Swish hidden blocks.

    The encoder maps ``feature_dim -> 1500 -> 1000 -> 500 -> encoding_dim``
    (the last layer is followed by BatchNorm but no activation); the decoder
    mirrors it back to ``feature_dim`` and ends in a plain linear layer.

    Args:
        feature_dim: Width of the input (and of the reconstruction).
        encoding_dim: Width of the encoded representation.
    """

    def __init__(self, feature_dim, encoding_dim):
        super().__init__()

        # Layers are created in the same order as their position in the
        # Sequential container, so parameter names (encoder.0, encoder.1, ...)
        # line up with previously saved state dicts.
        encoder_layers = []
        encoder_widths = [feature_dim, 1500, 1000, 500]
        for in_width, out_width in zip(encoder_widths[:-1], encoder_widths[1:]):
            encoder_layers += [
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                Swish(),
            ]
        encoder_layers += [
            nn.Linear(500, encoding_dim),
            nn.BatchNorm1d(encoding_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = []
        decoder_widths = [encoding_dim, 500, 1000, 1500]
        for in_width, out_width in zip(decoder_widths[:-1], decoder_widths[1:]):
            decoder_layers += [
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                Swish(),
            ]
        decoder_layers.append(nn.Linear(1500, feature_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the pair ``(encoded, decoded)`` for input ``x``."""
        encoded = self.encoder(x)
        return encoded, self.decoder(encoded)

# Define the MLP
class MLP(nn.Module):
    """Feed-forward regressor: ``input_dim -> 1024 -> 512 -> 256 -> 1``.

    Every hidden layer is a Linear / BatchNorm / Swish / Dropout block; the
    output layer is a single linear unit with no activation.

    Args:
        input_dim: Width of the input vector.
        dropout_rate: Drop probability used by every Dropout layer.
    """

    def __init__(self, input_dim, dropout_rate):
        super().__init__()

        # Modules are appended in creation order so that parameter names
        # (layers.0, layers.1, ...) stay aligned with saved state dicts.
        stack = []
        previous_width = input_dim
        for width in (1024, 512, 256):
            stack.extend([
                nn.Linear(previous_width, width),
                nn.BatchNorm1d(width),
                Swish(),
                nn.Dropout(dropout_rate),
            ])
            previous_width = width
        stack.append(nn.Linear(previous_width, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layers(x)

import torch
import torch.nn as nn

# Load Models
dropout_rate = 0.4
feature_dim = 2376  # Your feature dimension
encoding_dim = 1000  # Your encoding dimension
input_dim = feature_dim + encoding_dim  # Total input dimension for MLP

autoencoder = Autoencoder(feature_dim, encoding_dim)
mlp = MLP(input_dim, dropout_rate)

# Read the checkpoint from disk once and reuse it for both networks.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded
# on a machine without one.
checkpoint = torch.load('autoencoder.pth', map_location='cpu')
autoencoder.load_state_dict(checkpoint['autoencoder'])
mlp.load_state_dict(checkpoint['mlp'])

# Inference mode: BatchNorm uses its running statistics and Dropout is off.
autoencoder.eval()
mlp.eval()

import json
# Context manager so the file handle is closed after parsing.
with open("features.json") as features_file:
    feature_metadata = json.load(features_file)
feature_set = feature_metadata["feature_sets"]['all']

In [None]:
import pandas as pd
# Define your prediction pipeline as a function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Predict with the autoencoder + MLP pair.

    Each batch is encoded by ``autoencoder``; the encoding is concatenated
    with the raw features and passed to ``mlp``.

    Only the columns in ``feature_set`` are read. No ``target`` column is
    required: its values were never used for prediction, and live data is
    not expected to carry one.

    Args:
        live_features: Frame containing every column listed in
            ``feature_set``.

    Returns:
        A one-column float frame named ``"prediction"`` that shares the
        index of ``live_features`` (empty if ``live_features`` is empty).
    """
    from torch.utils.data import TensorDataset, DataLoader

    feature_tensor = torch.tensor(live_features[feature_set].values, dtype=torch.float32)
    loader = DataLoader(TensorDataset(feature_tensor), batch_size=128, shuffle=False)

    batch_outputs = []
    with torch.no_grad():
        # A single-tensor TensorDataset yields one-element batches; unpack them.
        for (batch,) in loader:
            encoded, _ = autoencoder(batch)  # Only encoded is needed for prediction
            batch_outputs.append(mlp(torch.cat((encoded, batch), dim=1)))

    # Flatten the (batch, 1) outputs into one flat list of Python floats.
    live_predictions = torch.cat(batch_outputs).reshape(-1).tolist() if batch_outputs else []

    submission = pd.Series(live_predictions, index=live_features.index, dtype="float64")
    return submission.to_frame("prediction")

# Serialize the prediction function (with the globals it references).
import cloudpickle
p = cloudpickle.dumps(predict)
with open("autoencoder.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle simply stays on the local filesystem.
    pass
else:
    # Outside the try block so that a genuine download error is not hidden.
    files.download('autoencoder.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

1.3 CNN

In [None]:
# !pip install torch

import torch
import torch.nn as nn
import numpy as np

# Define CNN Class
class CNN(nn.Module):
    """Small 1-D convolutional regressor.

    Two Conv1d -> ReLU -> BatchNorm stages (with a max-pool after the first),
    followed by adaptive average pooling to length 1 and a single linear
    unit. The output passes through a final ReLU, so it is never negative.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 1 -> 32 channels.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, stride=2, padding=1)
        self.batch_norm1 = nn.BatchNorm1d(num_features=32)
        self.maxpool1 = nn.MaxPool1d(kernel_size=3, stride=2)
        # Stage 2: 32 -> 64 channels.
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, stride=2, padding=1)
        self.batch_norm2 = nn.BatchNorm1d(num_features=64)
        # Head: collapse the length axis, then map 64 channels to one value.
        self.adaptive_avg_pool = nn.AdaptiveAvgPool1d(output_size=1)
        self.linear1 = nn.Linear(64, 1)

    def forward(self, x):
        """Map a batch with one input channel to one non-negative value per row."""
        stage1 = self.maxpool1(self.batch_norm1(torch.relu(self.conv1(x))))
        stage2 = self.batch_norm2(torch.relu(self.conv2(stage1)))
        pooled = torch.flatten(self.adaptive_avg_pool(stage2), 1)
        return torch.relu(self.linear1(pooled))
import json
# Context manager so the file handle is closed after parsing.
with open("features.json") as features_file:
    feature_metadata = json.load(features_file)
feature_set = feature_metadata["feature_sets"]['all']


# Load
# The path is relative to the working directory, like the other model
# artifacts in this notebook, rather than tied to Colab's /content folder.
# NOTE(review): torch.load unpickles arbitrary objects -- only load
# checkpoints you trust.
model = torch.load("cnn.pt", map_location=torch.device('cpu'))
model.eval()


def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Predict with the loaded CNN ``model``.

    Each batch of features gets a channel axis inserted at position 1
    before being passed to the model.

    Only the columns in ``feature_set`` are read. No ``target`` column is
    required: its values were never used for prediction, and live data is
    not expected to carry one.

    Args:
        live_features: Frame containing every column listed in
            ``feature_set``.

    Returns:
        A one-column float frame named ``"prediction"`` that shares the
        index of ``live_features`` (empty if ``live_features`` is empty).
    """
    from torch.utils.data import TensorDataset, DataLoader

    feature_tensor = torch.Tensor(live_features[feature_set].values)
    loader = DataLoader(TensorDataset(feature_tensor), batch_size=128, shuffle=False)

    batch_outputs = []
    with torch.no_grad():
        # A single-tensor TensorDataset yields one-element batches; unpack them.
        for (features,) in loader:
            # (batch, n_features) -> (batch, 1, n_features)
            batch_outputs.append(model(features.unsqueeze(1)))

    # Flatten the per-batch outputs into one flat list of Python floats.
    live_predictions = torch.cat(batch_outputs).reshape(-1).tolist() if batch_outputs else []

    submission = pd.Series(live_predictions, index=live_features.index, dtype="float64")
    return submission.to_frame("prediction")

# Serialize the prediction function (with the globals it references).
import cloudpickle
p = cloudpickle.dumps(predict)
with open("cnn.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle simply stays on the local filesystem.
    pass
else:
    # Outside the try block so that a genuine download error is not hidden.
    files.download('cnn.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
# Define your prediction pipeline as a function
def predict(live_features: pd.DataFrame) -> pd.DataFrame:
    """Predict with the autoencoder + MLP pair, using feature columns only.

    Each batch is encoded by ``autoencoder``; the encoding is concatenated
    with the raw features and passed to ``mlp``.

    Args:
        live_features: Frame containing every column listed in
            ``feature_set``.

    Returns:
        A one-column float frame named ``"prediction"`` that shares the
        index of ``live_features`` (empty if ``live_features`` is empty).
    """
    from torch.utils.data import TensorDataset, DataLoader

    feature_tensor = torch.tensor(live_features[feature_set].values, dtype=torch.float32)
    loader = DataLoader(TensorDataset(feature_tensor), batch_size=128, shuffle=False)

    batch_outputs = []
    with torch.no_grad():
        # Each batch from a single-tensor TensorDataset is a one-element
        # list, so it has to be unpacked before it is passed to the model.
        for (batch,) in loader:
            encoded, _ = autoencoder(batch)  # Only encoded is needed for prediction
            batch_outputs.append(mlp(torch.cat((encoded, batch), dim=1)))

    # Concatenate as tensors before flattening; a Python list has no reshape.
    live_predictions = torch.cat(batch_outputs).reshape(-1).tolist() if batch_outputs else []

    submission = pd.Series(live_predictions, index=live_features.index, dtype="float64")
    return submission.to_frame("prediction")

# Serialize the prediction function (with the globals it references).
import cloudpickle
p = cloudpickle.dumps(predict)
with open("predict.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle simply stays on the local filesystem.
    pass
else:
    # Outside the try block so that a genuine download error is not hidden.
    files.download('predict.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 2. Feature Neutral & Ensemble Models

2.0 Defining Models and Functions

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import lightgbm as lgb
import json
from torch.utils.data import TensorDataset, DataLoader



# Define the Swish activation function
class Swish(nn.Module):
    """Swish activation: every element is scaled by its own sigmoid."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x * sigmoid(x)`` element-wise."""
        gate = torch.sigmoid(x)
        return gate * x

# Define the Autoencoder
class Autoencoder(nn.Module):
    """Fully connected autoencoder with BatchNorm + Swish hidden blocks.

    The encoder maps ``feature_dim -> 1500 -> 1000 -> 500 -> encoding_dim``
    (the last layer is followed by BatchNorm but no activation); the decoder
    mirrors it back to ``feature_dim`` and ends in a plain linear layer.

    Args:
        feature_dim: Width of the input (and of the reconstruction).
        encoding_dim: Width of the encoded representation.
    """

    def __init__(self, feature_dim, encoding_dim):
        super().__init__()

        # Layers are created in the same order as their position in the
        # Sequential container, so parameter names (encoder.0, encoder.1, ...)
        # line up with previously saved state dicts.
        encoder_layers = []
        encoder_widths = [feature_dim, 1500, 1000, 500]
        for in_width, out_width in zip(encoder_widths[:-1], encoder_widths[1:]):
            encoder_layers += [
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                Swish(),
            ]
        encoder_layers += [
            nn.Linear(500, encoding_dim),
            nn.BatchNorm1d(encoding_dim),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = []
        decoder_widths = [encoding_dim, 500, 1000, 1500]
        for in_width, out_width in zip(decoder_widths[:-1], decoder_widths[1:]):
            decoder_layers += [
                nn.Linear(in_width, out_width),
                nn.BatchNorm1d(out_width),
                Swish(),
            ]
        decoder_layers.append(nn.Linear(1500, feature_dim))
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return the pair ``(encoded, decoded)`` for input ``x``."""
        encoded = self.encoder(x)
        return encoded, self.decoder(encoded)

# Define the MLP
class MLP(nn.Module):
    """Feed-forward regressor: ``input_dim -> 1024 -> 512 -> 256 -> 1``.

    Every hidden layer is a Linear / BatchNorm / Swish / Dropout block; the
    output layer is a single linear unit with no activation.

    Args:
        input_dim: Width of the input vector.
        dropout_rate: Drop probability used by every Dropout layer.
    """

    def __init__(self, input_dim, dropout_rate):
        super().__init__()

        # Modules are appended in creation order so that parameter names
        # (layers.0, layers.1, ...) stay aligned with saved state dicts.
        stack = []
        previous_width = input_dim
        for width in (1024, 512, 256):
            stack.extend([
                nn.Linear(previous_width, width),
                nn.BatchNorm1d(width),
                Swish(),
                nn.Dropout(dropout_rate),
            ])
            previous_width = width
        stack.append(nn.Linear(previous_width, 1))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layers(x)

import torch
import torch.nn as nn

# Load Models
dropout_rate = 0.4
feature_dim = 2376  # Your feature dimension
encoding_dim = 1000  # Your encoding dimension
input_dim = feature_dim + encoding_dim  # Total input dimension for MLP

autoencoder = Autoencoder(feature_dim, encoding_dim)
mlp = MLP(input_dim, dropout_rate)

# Read the checkpoint from disk once and reuse it for both networks.
# map_location='cpu' lets a checkpoint that was saved from a GPU be loaded
# on a machine without one.
checkpoint = torch.load('autoencoder.pth', map_location='cpu')
autoencoder.load_state_dict(checkpoint['autoencoder'])
mlp.load_state_dict(checkpoint['mlp'])

# Inference mode: BatchNorm uses its running statistics and Dropout is off.
autoencoder.eval()
mlp.eval()

#load from model:
# NOTE: this rebinds `lgb` from the lightgbm module to the loaded booster.
# predict_ensemble calls `lgb.predict`, so the name is kept; the module
# itself is no longer reachable as `lgb` after this line.
lgb = lgb.Booster(model_file='LGBM_Full.txt')

import json
# Parse the feature metadata once, closing the file handle afterwards.
with open("features.json") as features_file:
    feature_metadata = json.load(features_file)
feature_set = feature_metadata["feature_sets"]['all']

2.2 LGBM + Autoencoder ENSEMBLE

In [None]:

# Define your prediction pipeline as a function
def predict_ensemble(live_features: pd.DataFrame) -> pd.DataFrame:
    """Average the LightGBM and autoencoder + MLP predictions.

    Only the columns in ``feature_set`` are read. No ``target`` column is
    required: its values were never used for prediction, and live data is
    not expected to carry one.

    Args:
        live_features: Frame containing every column listed in
            ``feature_set``.

    Returns:
        A one-column frame named ``"prediction"`` holding the mean of the
        two models' predictions, indexed like ``live_features``.
    """
    # #1 - LGBM
    lgb_predictions = pd.Series(
        lgb.predict(live_features[feature_set]), index=live_features.index
    )

    # #2 - Autoencoder
    feature_tensor = torch.tensor(live_features[feature_set].values, dtype=torch.float32)
    loader = DataLoader(TensorDataset(feature_tensor), batch_size=128, shuffle=False)

    batch_outputs = []
    with torch.no_grad():
        # A single-tensor TensorDataset yields one-element batches; unpack them.
        for (batch,) in loader:
            encoded, _ = autoencoder(batch)  # Only encoded is needed for prediction
            batch_outputs.append(mlp(torch.cat((encoded, batch), dim=1)))

    # Flatten the (batch, 1) outputs into one flat list of Python floats.
    live_predictions = torch.cat(batch_outputs).reshape(-1).tolist() if batch_outputs else []
    auto_predictions = pd.Series(live_predictions, index=live_features.index, dtype="float64")

    # Equal-weight average; a value missing from one model is treated as 0.5.
    submission = lgb_predictions.add(auto_predictions, fill_value=0.5)/2

    return submission.to_frame("prediction")

# Serialize the ensemble prediction function (with the globals it references).
import cloudpickle
p = cloudpickle.dumps(predict_ensemble)
with open("ensemble.pkl", "wb") as f:
    f.write(p)

# Download file if running in Google Colab
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the pickle simply stays on the local filesystem.
    pass
else:
    # Outside the try block so that a genuine download error is not hidden.
    files.download('ensemble.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>