In [None]:
# Install dependencies
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle scipy==1.10.1

# Inline plots
%matplotlib inline

In [None]:
!pip install torch -q

In [None]:
!pip install torchsummary

In [None]:
!pip install wandb
import wandb
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [None]:
# Initialize NumerAPI - the official Python API client for Numerai
from numerapi import NumerAPI
napi = NumerAPI()

# List the datasets and the distinct versions they belong to.
# Sorted so the printed list is stable across runs (set order is arbitrary).
all_datasets = napi.list_datasets()
dataset_versions = sorted({d.split('/')[0] for d in all_datasets})
print("Available versions:\n", dataset_versions)

# Set data version to one of the latest datasets
DATA_VERSION = "v4.3"

# Print all files available for download for our version.
# Match on "<version>/" rather than the bare version string so that a short
# version such as "v4" does not also pick up "v4.1/...", "v4.3/...", etc.
current_version_files = [f for f in all_datasets if f.startswith(f"{DATA_VERSION}/")]
print("available", DATA_VERSION, "files:\n", current_version_files)

Available versions:
 ['v4.2', 'v4.3', 'v4', 'v4.1']
availbable v4.3 files:
 ['v4.3/features.json', 'v4.3/live_benchmark_models.parquet', 'v4.3/live_example_preds.csv', 'v4.3/live_example_preds.parquet', 'v4.3/live_int8.parquet', 'v4.3/meta_model.parquet', 'v4.3/train_benchmark_models.parquet', 'v4.3/train_int8.parquet', 'v4.3/validation_benchmark_models.parquet', 'v4.3/validation_example_preds.csv', 'v4.3/validation_example_preds.parquet', 'v4.3/validation_int8.parquet']


In [None]:
import json

# download the feature metadata file
napi.download_dataset(f"{DATA_VERSION}/features.json");

# Read the metadata. The context manager closes the file handle
# deterministically instead of leaving it to the garbage collector.
with open(f"{DATA_VERSION}/features.json", encoding="utf-8") as metadata_file:
    feature_metadata = json.load(metadata_file)

# Show each top-level metadata section and how many entries it holds.
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

2024-04-22 18:41:18,300 INFO numerapi.utils: starting download
v4.3/features.json: 1.12MB [00:00, 4.65MB/s]                            

feature_stats 2376
feature_sets 17
targets 41





In [None]:
# Named feature subsets defined by the metadata file.
feature_sets = feature_metadata["feature_sets"]

# Report the size of a few predefined sets. The loop variable is deliberately
# NOT called `feature_set`: that name holds the selected list of feature
# columns elsewhere in the notebook, and re-running this cell would otherwise
# overwrite it with the string "all".
for set_name in ["small", "medium", "all"]:
  print(set_name, len(feature_sets[set_name]))

small 42
medium 705
all 2376


In [None]:
import pandas as pd

# Feature columns to train on: the predefined "medium" set.
# Use feature_sets["all"] here instead to train on every feature.
feature_set = feature_sets["medium"]

# Fetch the training parquet file - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet");

# Read only the era, the target and the selected feature columns.
train = pd.read_parquet(
    f"{DATA_VERSION}/train_int8.parquet",
    columns=["era", "target", *feature_set],
)

# Keep every 4th era to reduce memory usage and speed up model training
# (suggested for Colab free tier).
# Comment out the line below to use all the data
train = train.loc[train["era"].isin(train["era"].unique()[::4])]

2024-04-22 18:41:42,978 INFO numerapi.utils: starting download
v4.3/train_int8.parquet: 2.09GB [00:51, 40.6MB/s]                            


In [None]:
# Display the era-downsampled training frame (pandas truncates the rendering).
train

Unnamed: 0_level_0,era,target,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_adam_incantational_winemaker,feature_additive_untrustworthy_hierologist,...,feature_witchy_orange_muley,feature_wombed_liberatory_malva,feature_won_stalwart_eisenstein,feature_wrathful_prolix_colotomy,feature_wrinkliest_unmaintainable_usk,feature_wrought_muckier_temporality,feature_yauld_antediluvial_subprefecture,feature_yelled_hysteretic_eath,feature_yoruban_unapplied_tawse,feature_zygodactyl_exponible_lathi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0001,0.25,0,4,0,4,2,0,2,1,...,0,0,2,4,2,4,3,2,2,3
n003bee128c2fcfc,0001,0.75,4,2,2,2,2,3,2,1,...,3,3,2,0,2,2,1,3,2,1
n0048ac83aff7194,0001,0.25,4,4,2,0,2,0,2,4,...,0,1,2,0,2,3,2,1,2,2
n00691bec80d3e02,0001,0.75,1,4,1,1,2,0,2,2,...,2,1,2,2,2,3,2,2,2,2
n00b8720a2fdc4f2,0001,0.50,0,2,0,0,2,0,2,3,...,1,1,2,0,2,0,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc2d5e4b79a7ae,0573,0.25,4,2,4,4,3,1,0,0,...,4,1,2,1,3,2,2,2,1,1
nffc7d24176548a4,0573,0.50,0,3,3,4,2,0,2,3,...,2,0,2,4,2,0,2,1,2,3
nffc9844c1c7a6a9,0573,0.50,4,1,1,2,0,3,2,4,...,1,3,0,3,0,2,0,0,1,2
nffd79773f4109bb,0573,0.50,0,0,1,3,1,4,2,1,...,2,4,1,2,0,1,1,0,1,0


In [None]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# Authenticate with Weights & Biases; this may prompt interactively for an API key.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jupyter/.netrc


True

In [None]:
import os

# Tell wandb which notebook this run belongs to via its environment variable.
os.environ["WANDB_NOTEBOOK_NAME"] = "AutoEncoder+MLP+complex.ipynb"

In [None]:
# Start a W&B run and register the hyperparameters used by the rest of the
# notebook (read back through `wandb.config`).
wandb.init(project="BA865_complexMLP", config={
    "learning_rate": 0.001,
    "epochs": 10,
    "batch_size": 64,
    # Derived from the selected feature list so the model input width can
    # never drift from the data (705 for the "medium" feature set).
    "feature_dim": len(feature_set),
    "encoding_dim": 300  # Example encoding dimension
})

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112427000001743, max=1.0…

In [None]:
# Shorthand handle to the hyperparameters registered with the current W&B run.
config = wandb.config

In [None]:
# Swish activation: the input scaled element-wise by its own sigmoid.
class Swish(nn.Module):
    """Parameter-free activation computing ``x * sigmoid(x)`` element-wise."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Return ``x`` multiplied by ``sigmoid(x)``; output has the shape of ``x``."""
        gate = torch.sigmoid(x)
        return x * gate

In [None]:
# Deep autoencoder: feature_dim -> 1500 -> 1000 -> 500 -> encoding_dim and back.
class Autoencoder(nn.Module):
    """Symmetric fully-connected autoencoder with BatchNorm and Swish activations.

    Args:
        feature_dim: width of the input (and of the reconstruction).
        encoding_dim: width of the bottleneck representation.
    """

    def __init__(self, feature_dim, encoding_dim):
        super().__init__()
        hidden_widths = (1500, 1000, 500)

        # Encoder: three Linear/BatchNorm/Swish stages, then a Linear + BatchNorm
        # bottleneck with no activation. Modules are appended in the same order
        # as a hand-written Sequential so the state_dict keys are unchanged.
        encoder_modules = []
        width_in = feature_dim
        for width_out in hidden_widths:
            encoder_modules.append(nn.Linear(width_in, width_out))
            encoder_modules.append(nn.BatchNorm1d(width_out))
            encoder_modules.append(Swish())
            width_in = width_out
        encoder_modules.append(nn.Linear(width_in, encoding_dim))
        encoder_modules.append(nn.BatchNorm1d(encoding_dim))
        self.encoder = nn.Sequential(*encoder_modules)

        # Decoder: mirror of the encoder, ending in a plain Linear projection
        # back to feature_dim.
        decoder_modules = []
        width_in = encoding_dim
        for width_out in reversed(hidden_widths):
            decoder_modules.append(nn.Linear(width_in, width_out))
            decoder_modules.append(nn.BatchNorm1d(width_out))
            decoder_modules.append(Swish())
            width_in = width_out
        decoder_modules.append(nn.Linear(width_in, feature_dim))
        self.decoder = nn.Sequential(*decoder_modules)

    def forward(self, x):
        """Return ``(encoded, decoded)``: the bottleneck code and the reconstruction."""
        code = self.encoder(x)
        reconstruction = self.decoder(code)
        return code, reconstruction

In [None]:
# Regression head: input_dim -> 1024 -> 512 -> 256 -> 1.
class MLP(nn.Module):
    """Fully-connected regressor with BatchNorm, Swish and Dropout per hidden stage.

    Args:
        input_dim: width of the input vector.
    """

    def __init__(self, input_dim):
        super().__init__()
        # (hidden width, dropout probability) for each hidden stage.
        stage_specs = ((1024, 0.5), (512, 0.5), (256, 0.4))

        # Modules are appended in the same order as a hand-written Sequential
        # so the state_dict keys are unchanged.
        modules = []
        width_in = input_dim
        for width_out, drop_prob in stage_specs:
            modules.append(nn.Linear(width_in, width_out))
            modules.append(nn.BatchNorm1d(width_out))
            modules.append(Swish())
            modules.append(nn.Dropout(drop_prob))
            width_in = width_out
        # Single-unit output layer, no activation.
        modules.append(nn.Linear(width_in, 1))
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the single-column output."""
        return self.layers(x)

In [None]:
# Show the training frame again for reference.
train

Unnamed: 0_level_0,era,target,feature_abating_unadaptable_weakfish,feature_ablest_mauritanian_elding,feature_acclimatisable_unfeigned_maghreb,feature_accommodable_crinite_cleft,feature_accretive_sorrier_skedaddle,feature_acetose_periotic_coronation,feature_adam_incantational_winemaker,feature_additive_untrustworthy_hierologist,...,feature_witchy_orange_muley,feature_wombed_liberatory_malva,feature_won_stalwart_eisenstein,feature_wrathful_prolix_colotomy,feature_wrinkliest_unmaintainable_usk,feature_wrought_muckier_temporality,feature_yauld_antediluvial_subprefecture,feature_yelled_hysteretic_eath,feature_yoruban_unapplied_tawse,feature_zygodactyl_exponible_lathi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0001,0.25,0,4,0,4,2,0,2,1,...,0,0,2,4,2,4,3,2,2,3
n003bee128c2fcfc,0001,0.75,4,2,2,2,2,3,2,1,...,3,3,2,0,2,2,1,3,2,1
n0048ac83aff7194,0001,0.25,4,4,2,0,2,0,2,4,...,0,1,2,0,2,3,2,1,2,2
n00691bec80d3e02,0001,0.75,1,4,1,1,2,0,2,2,...,2,1,2,2,2,3,2,2,2,2
n00b8720a2fdc4f2,0001,0.50,0,2,0,0,2,0,2,3,...,1,1,2,0,2,0,1,1,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffc2d5e4b79a7ae,0573,0.25,4,2,4,4,3,1,0,0,...,4,1,2,1,3,2,2,2,1,1
nffc7d24176548a4,0573,0.50,0,3,3,4,2,0,2,3,...,2,0,2,4,2,0,2,1,2,3
nffc9844c1c7a6a9,0573,0.50,4,1,1,2,0,3,2,4,...,1,3,0,3,0,2,0,0,1,2
nffd79773f4109bb,0573,0.50,0,0,1,3,1,4,2,1,...,2,4,1,2,0,1,1,0,1,0


In [None]:
# Seed torch's global RNG so weight initialisation (and, when the cells are
# run in order, batch shuffling and the denoising noise) is reproducible.
torch.manual_seed(42)

# The MLP consumes the raw features concatenated with the autoencoder's
# encoding, hence its input width of feature_dim + encoding_dim.
autoencoder = Autoencoder(config.feature_dim, config.encoding_dim)
mlp = MLP(config.feature_dim + config.encoding_dim)

# A single optimizer updates both networks, since they are trained jointly.
optimizer = torch.optim.Adam(
    list(autoencoder.parameters()) + list(mlp.parameters()),
    lr=config.learning_rate,
)

# Reconstruction loss for the autoencoder, regression loss for the MLP.
ae_criterion = nn.MSELoss()
mlp_criterion = nn.MSELoss()

In [None]:
from sklearn.model_selection import train_test_split

# Features are cast to float32 for the networks; the target is reshaped to a
# single column so it lines up with the MLP's one-unit output.
X_train_tensor = torch.tensor(train[feature_set].values, dtype=torch.float32)
y_train_tensor = torch.tensor(train['target'].values, dtype=torch.float32).view(-1, 1)

# Hold out 20% of the rows for validation (fixed seed for a repeatable split).
# NOTE(review): this is a row-level split, so rows from the same era can land
# on both sides - confirm that is intended.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_tensor, y_train_tensor, test_size=0.2, random_state=42)

# Batch size comes from the run config so the value logged to W&B is the one
# actually used.
train_dataset = TensorDataset(X_train, y_train)
train_dataloader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

val_dataset = TensorDataset(X_val, y_val)
val_dataloader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)

In [None]:
import copy

# Early-stopping state. These must be initialised here so the cell runs on a
# fresh kernel.
# NOTE(review): the original value of `patience` is not visible in the
# notebook; 5 is an assumption - confirm.
patience = 5
best_val_loss = float("inf")
trigger_times = 0
best_model = None

for epoch in range(config.epochs):
    autoencoder.train()
    mlp.train()
    for data, targets in train_dataloader:
        optimizer.zero_grad()
        # Denoising setup: the autoencoder sees a corrupted input (Gaussian
        # noise scaled by 0.1) but is scored against the clean input.
        noise = torch.randn_like(data) * 0.1
        noisy_data = data + noise

        encoded, decoded = autoencoder(noisy_data)
        ae_loss = ae_criterion(decoded, data)

        # The MLP predicts from the encoding concatenated with the clean features.
        combined_features = torch.cat((encoded, data), dim=1)
        predictions = mlp(combined_features)
        mlp_loss = mlp_criterion(predictions, targets)

        # Both networks are optimised jointly on the sum of the two losses.
        loss = ae_loss + mlp_loss
        loss.backward()
        optimizer.step()
        wandb.log({"train_loss": loss.item(), "ae_loss": ae_loss.item(), "mlp_loss": mlp_loss.item()})

    # Validation phase: only the MLP's prediction loss is measured, which is
    # why it is on a different scale from the combined training loss.
    val_loss = 0
    with torch.no_grad():
        autoencoder.eval()
        mlp.eval()
        for data, targets in val_dataloader:
            encoded, decoded = autoencoder(data)
            combined_features = torch.cat((encoded, data), dim=1)
            predictions = mlp(combined_features)
            batch_loss = mlp_criterion(predictions, targets).item()
            val_loss += batch_loss
            wandb.log({"val_batch_loss": batch_loss})

    # Mean of the per-batch validation losses.
    val_loss /= len(val_dataloader)
    wandb.log({"val_loss": val_loss})
    # The "Train Loss" printed here is the loss of the last training batch of the epoch.
    print(f'Epoch {epoch + 1}: Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}')

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() returns tensors that share storage with the live
        # parameters, so they must be deep-copied; otherwise the "best"
        # snapshot silently follows the weights as training continues.
        best_model = {
            'autoencoder': copy.deepcopy(autoencoder.state_dict()),
            'mlp': copy.deepcopy(mlp.state_dict()),
        }
        wandb.run.summary["best_val_loss"] = best_val_loss
        # Write the checkpoint before asking wandb to sync it, so the file exists.
        torch.save(best_model, 'best_model.pth')
        wandb.save('best_model.pth')
        trigger_times = 0
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping!")
            break

wandb.finish()

Epoch 1: Train Loss: 0.9413, Val Loss: 0.0497
Epoch 2: Train Loss: 0.8536, Val Loss: 0.0498
Epoch 3: Train Loss: 0.7325, Val Loss: 0.0498
Epoch 4: Train Loss: 0.8100, Val Loss: 0.0498
Epoch 5: Train Loss: 0.8539, Val Loss: 0.0496
Epoch 6: Train Loss: 0.7711, Val Loss: 0.0496
Epoch 7: Train Loss: 0.6396, Val Loss: 0.0499
Epoch 8: Train Loss: 0.6725, Val Loss: 0.0496
Epoch 9: Train Loss: 0.6925, Val Loss: 0.0496
Epoch 10: Train Loss: 0.7330, Val Loss: 0.0498


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
ae_loss,█▇▆▆▅▅▅▄▅▄▄▄▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▂▂▁▂▁▁▂▂▁▁▁▁▁
mlp_loss,▇▆▄▇▁▆▄▃▆▄▅▅▃▆▃▅▃▁▃▃▅▃█▂▄▆▄▄▄▂▄▅▄▅▆▇▃▃▄▄
train_loss,█▇▆▆▅▅▅▄▅▄▄▄▃▃▃▃▃▂▂▃▂▂▃▂▂▂▂▂▂▁▂▂▁▂▂▂▁▁▁▁
val_batch_loss,▄▁▃▄▃▂▄▅▅▃▃█▃▅▅▂▅▂▃▅▃▂▅▅▆▄▄▅▅▄▂▆▂▂▄▆▃▄▃▄
val_loss,▃▆▅▅▂▁█▂▂▄

0,1
ae_loss,0.69116
best_val_loss,0.04959
mlp_loss,0.04184
train_loss,0.73301
val_batch_loss,0.0451
val_loss,0.04975


In [None]:
# Persist the checkpoint dict (keys 'autoencoder' and 'mlp', each a state_dict) to disk.
torch.save(best_model, 'best_model.pth')

In [None]:
!pip install -U torchinfo



In [None]:
import torch
from torchinfo import summary
# Reload the checkpoint dict from disk, replacing the in-memory `best_model`.
best_model = torch.load('best_model.pth')

In [None]:
# Build fresh model instances with the configured dimensions; the MLP input
# width is feature_dim + encoding_dim.
autoencoder = Autoencoder(config.feature_dim, config.encoding_dim)
mlp = MLP(config.feature_dim + config.encoding_dim)

In [None]:
# Restore the autoencoder weights from the checkpoint.
autoencoder.load_state_dict(best_model['autoencoder'])

<All keys matched successfully>

In [None]:
# Restore the MLP weights from the checkpoint.
mlp.load_state_dict(best_model['mlp'])

<All keys matched successfully>

In [None]:
# Show the autoencoder's layers and their parameter counts (torchinfo).
summary(autoencoder)

Layer (type:depth-idx)                   Param #
Autoencoder                              --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       1,059,000
│    └─BatchNorm1d: 2-2                  3,000
│    └─Swish: 2-3                        --
│    └─Linear: 2-4                       1,501,000
│    └─BatchNorm1d: 2-5                  2,000
│    └─Swish: 2-6                        --
│    └─Linear: 2-7                       500,500
│    └─BatchNorm1d: 2-8                  1,000
│    └─Swish: 2-9                        --
│    └─Linear: 2-10                      150,300
│    └─BatchNorm1d: 2-11                 600
├─Sequential: 1-2                        --
│    └─Linear: 2-12                      150,500
│    └─BatchNorm1d: 2-13                 1,000
│    └─Swish: 2-14                       --
│    └─Linear: 2-15                      501,000
│    └─BatchNorm1d: 2-16                 2,000
│    └─Swish: 2-17                       --
│    └─Linear: 2-18  

In [None]:
# Show the MLP's layers and their parameter counts (torchinfo).
summary(mlp)

Layer (type:depth-idx)                   Param #
MLP                                      --
├─Sequential: 1-1                        --
│    └─Linear: 2-1                       1,030,144
│    └─BatchNorm1d: 2-2                  2,048
│    └─Swish: 2-3                        --
│    └─Dropout: 2-4                      --
│    └─Linear: 2-5                       524,800
│    └─BatchNorm1d: 2-6                  1,024
│    └─Swish: 2-7                        --
│    └─Dropout: 2-8                      --
│    └─Linear: 2-9                       131,328
│    └─BatchNorm1d: 2-10                 512
│    └─Swish: 2-11                       --
│    └─Dropout: 2-12                     --
│    └─Linear: 2-13                      257
Total params: 1,690,113
Trainable params: 1,690,113
Non-trainable params: 0