# Training Convolutional Neural Networks with Reconstructed Data Having Multiple Channels

In [None]:
# User switch: set to False to keep the model and batches on the CPU even when
# CUDA is available (the GPU is used only when both this flag and the CUDA check are true).
use_gpu = True

In [None]:
# Install torch
!pip install torch -q

In [None]:
# Install torchinfo
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
# Install dependencies
!pip install -q numerapi pandas pyarrow matplotlib lightgbm scikit-learn cloudpickle scipy==1.10.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Import necessary libraries and packages
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torchinfo import summary

# Report whether PyTorch can see a CUDA device.
print("GPU is available!" if torch.cuda.is_available() else "GPU is not available.")

GPU is available!


In [None]:
# Cache the CUDA availability check; later cells combine it with `use_gpu`
# to decide whether the model and batches are moved to the GPU.
gpu_available = torch.cuda.is_available()
gpu_available

True

In [None]:
# Initialize NumerAPI - the official Python API client for Numerai
from numerapi import NumerAPI
napi = NumerAPI()

# List the datasets and the available versions.
# Paths look like "<version>/<file>", so the version is the first path component.
# sorted() keeps the printed order stable between runs (plain set order is not).
all_datasets = napi.list_datasets()
dataset_versions = sorted({d.split('/')[0] for d in all_datasets})
print("Available versions:\n", dataset_versions)

# Set data version to one of the latest datasets
DATA_VERSION = "v4.3"

# Print all files available for download for our version
current_version_files = [f for f in all_datasets if f.startswith(DATA_VERSION)]
print("available", DATA_VERSION, "files:\n", current_version_files)

Available versions:
 ['v4.2', 'v4.1', 'v4.3', 'v4']
availbable v4.3 files:
 ['v4.3/features.json', 'v4.3/live_benchmark_models.parquet', 'v4.3/live_example_preds.csv', 'v4.3/live_example_preds.parquet', 'v4.3/live_int8.parquet', 'v4.3/meta_model.parquet', 'v4.3/train_benchmark_models.parquet', 'v4.3/train_int8.parquet', 'v4.3/validation_benchmark_models.parquet', 'v4.3/validation_example_preds.csv', 'v4.3/validation_example_preds.parquet', 'v4.3/validation_int8.parquet']


In [None]:
# Download the feature metadata file
napi.download_dataset(f"{DATA_VERSION}/features.json");

# Read the metadata; the context manager closes the file handle once parsing
# is done instead of leaving it open for the lifetime of the kernel.
with open(f"{DATA_VERSION}/features.json") as metadata_file:
  feature_metadata = json.load(metadata_file)

# Display each top-level section together with its number of entries
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

v4.3/features.json: 1.12MB [00:00, 1.75MB/s]                           

feature_stats 2376
feature_sets 17
targets 41





In [None]:
feature_sets = feature_metadata["feature_sets"]

# Feature-set sizes (table columns) and feature groups (table rows)
sizes = ["small", "medium", "all"]
groups = [
  "intelligence", "wisdom", "charisma",
  "dexterity", "strength", "constitution",
  "agility", "serenity", "all",
]

# For every (size, group) pair keep the features that belong to both sets
subgroups = {
    size: {
        group: set(feature_sets[size]) & set(feature_sets[group])
        for group in groups
    }
    for size in sizes
}

# Tabulate how many features each intersection holds, largest "all" count first
pd.DataFrame(subgroups).applymap(len).sort_values(by="all", ascending=False)

Unnamed: 0,small,medium,all
all,42,705,2376
constitution,2,134,335
charisma,3,116,290
agility,2,58,145
wisdom,3,56,140
strength,1,54,135
serenity,3,34,95
dexterity,4,21,51
intelligence,2,14,35


## Data Preprocessing

In [None]:
# Access the names of features belonging to these two groups.
# Both are taken from the "all" size, i.e. the full membership of each group;
# at this point they are still Python sets of column names.
charisma = subgroups['all']['charisma']
constitution = subgroups['all']['constitution']

In [None]:
# Feature columns to load: the "all" set, i.e. every feature
# (pick feature_sets["small"] or feature_sets["medium"] for a lighter load)
feature_set = feature_sets["all"]

# Fetch the training parquet file - this will take a few minutes
napi.download_dataset(f"{DATA_VERSION}/train_int8.parquet");

# Read the era, the target and the chosen features, then keep only the
# first 100 rows so the rest of the notebook runs quickly
train = pd.read_parquet(
    f"{DATA_VERSION}/train_int8.parquet",
    columns=["era", "target", *feature_set],
).iloc[:100]

v4.3/train_int8.parquet: 2.09GB [02:10, 16.1MB/s]                            


Unnamed: 0_level_0,era,target,feature_aaronic_unexampled_arguer,feature_abactinal_inventable_luminescence,feature_abating_unadaptable_weakfish,feature_abdominal_subtriplicate_fin,feature_abducent_unbeneficed_lithophyte,feature_abducted_euphonic_pipewort,feature_ablest_mauritanian_elding,feature_abreast_viscoelastic_commander,...,feature_yelled_hysteretic_eath,feature_yokelish_metapsychological_lunt,feature_yorkist_authenticated_lotted,feature_yoruban_purplish_directoire,feature_yoruban_unapplied_tawse,feature_zincky_unseemly_butt,feature_zincoid_peccant_greywacke,feature_zoophoric_underglaze_algin,feature_zygodactyl_exponible_lathi,feature_zymotic_roundabout_figuration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,1,0.25,2,2,0,2,3,2,4,2,...,2,2,2,2,2,2,2,2,3,2
n003bee128c2fcfc,1,0.75,3,2,4,2,1,2,2,2,...,3,2,2,2,2,2,2,2,1,2
n0048ac83aff7194,1,0.25,3,2,4,2,2,2,4,4,...,1,2,2,2,2,2,2,2,2,2
n00691bec80d3e02,1,0.75,1,2,1,2,2,2,4,2,...,2,2,2,2,2,2,2,2,2,2
n00b8720a2fdc4f2,1,0.5,1,2,0,2,1,2,2,3,...,1,2,2,2,2,2,2,2,1,2


In [None]:
# Reset the index of our dataset: the `id` index becomes an ordinary column
# and the rows get a default 0..n-1 integer index.
train_set = train.reset_index()

In [None]:
# Pull out the label column, then keep only the feature columns as model
# input (era, id and target are not features)
target = train_set.loc[:, 'target']
train = train_set.drop(['era', 'id', 'target'], axis=1)

In [None]:
# Keep at most the first 100 labels, matching the 100-row feature sample
target = target.head(100)

In [None]:
# Turn sets into lists.
# sorted() fixes the column order: iteration order of a set of strings can
# change from one kernel run to the next, and that order becomes the position
# of each feature along the axis the 1-D convolutions slide over.
charisma = sorted(charisma)
constitution = sorted(constitution)

In [None]:
# Slice out one frame per feature group ('charisma' and 'constitution')
charisma_prep = train.loc[:, charisma]
constitution_prep = train.loc[:, constitution]

In [None]:
# Define a function for padding
def padding(df, desired_features=335, padding_value=0):
  """Right-pad a feature frame with constant columns up to a fixed width.

  Args:
    df: DataFrame with one row per sample and one column per feature.
    desired_features: Number of columns the result must have. Defaults to 335,
      the width every channel is padded to in this notebook.
    padding_value: Constant written into every added column (default 0).

  Returns:
    A new DataFrame holding the columns of `df` followed by the padding
    columns, named ``feature_<position>``. `df` itself is not modified.

  Raises:
    ValueError: If `df` already has more than `desired_features` columns.
  """
  n_samples, n_features = df.shape
  padding_size = desired_features - n_features
  if padding_size < 0:
    raise ValueError(
        f"cannot pad {n_features} features down to {desired_features}"
    )
  padding_df = pd.DataFrame(
      np.full((n_samples, padding_size), padding_value, dtype=float),
      # Reuse df's index so concat lines the rows up one-to-one; a fresh
      # 0..n-1 index would misalign (and add NaN rows) for any other index.
      index=df.index,
      columns=[f'feature_{n_features + i}' for i in range(padding_size)],
  )
  return pd.concat([df, padding_df], axis=1)

In [None]:
# Bring both feature groups to the common width of 335 columns
charisma_channel, constitution_channel = (
    padding(frame) for frame in (charisma_prep, constitution_prep)
)

In [None]:
# Combine two channels together: stacking the two equally shaped frames gives
# one array laid out as (channel, sample, feature).
channel_data = np.array([charisma_channel, constitution_channel])

In [None]:
# Swap the first two axes: (channel, sample, feature) -> (sample, channel, feature)
channel_data_reshaped = channel_data.swapaxes(0, 1)

In [None]:
# Divide data into train, validation, and test sets.
# Step 1: hold out 30% of the samples; the other 70% is the training split.
channel_train, channel_remain, target_train, target_remain = train_test_split(
    channel_data_reshaped,
    target,
    test_size=0.3,
    random_state=42,
)
# Step 2: divide the held-out part - one third becomes the test split and
# two thirds the validation split.
channel_test, channel_val, target_test, target_val = train_test_split(
    channel_remain,
    target_remain,
    test_size=2/3,
    random_state=42,
)

In [None]:
# Turn sets into tensors.
# The training tensors come from the training split only. Building them from
# the full `channel_data` / `target` would put every validation and test row
# into the training set and make the held-out losses meaningless.
# `channel_data` is stored as (channel, sample, feature) here because the
# reshape cell that follows permutes it to (sample, channel, feature).
channel_data = torch.Tensor(channel_train).permute(1, 0, 2)
target_train = torch.Tensor(target_train.values)
channel_val = torch.Tensor(channel_val)
target_val = torch.Tensor(target_val.values)
channel_test = torch.Tensor(channel_test)
target_test = torch.Tensor(target_test.values)

In [None]:
# Exchange the first two axes of the training tensor so that samples come first.
# Note: re-running this cell swaps the axes back again.
channel_data = channel_data.transpose(0, 1)

In [None]:
# Give every target vector a trailing axis of size 1, (n,) -> (n, 1), so it
# has the same shape as the model's single-output predictions
target_train_resized = target_train[:, None]
target_val_resized = target_val[:, None]
target_test_resized = target_test[:, None]

In [None]:
# Pair each split's inputs with its targets
train_loader = TensorDataset(channel_data, target_train_resized)
validation_loader = TensorDataset(channel_val, target_val_resized)
test_loader = TensorDataset(channel_test, target_test_resized)

# Serve the pairs in batches of 10; only the training batches are shuffled
train_dataloader = DataLoader(dataset=train_loader, batch_size=10, shuffle=True)
val_dataloader = DataLoader(dataset=validation_loader, batch_size=10, shuffle=False)
test_dataloader = DataLoader(dataset=test_loader, batch_size=10, shuffle=False)

## Design Model Architecture

In [None]:
class CNN(nn.Module):
    """Small 1-D convolutional regressor for two-channel feature rows.

    Input is a float tensor of shape (batch, 2, length). Two
    Conv1d -> ReLU -> BatchNorm1d stages (with a max-pool after the first)
    are followed by global average pooling and one linear unit. The output
    has shape (batch, 1) and is non-negative because of the final ReLU.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 2 -> 32 channels, then normalisation and down-sampling
        self.conv1 = nn.Conv1d(2, 32, kernel_size=3, stride=2, padding=1)
        self.batch_norm1 = nn.BatchNorm1d(32)
        self.maxpool1 = nn.MaxPool1d(3, stride=2)
        # Stage 2: 32 -> 64 channels
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1)
        self.batch_norm2 = nn.BatchNorm1d(64)
        # Head: average over the length axis, then map 64 values to 1
        self.adaptive_avg_pool = nn.AdaptiveAvgPool1d(1)
        self.linear1 = nn.Linear(64, 1)

    def forward(self, x):
        """Return the (batch, 1) prediction for a (batch, 2, length) input."""
        stage1 = self.maxpool1(self.batch_norm1(torch.relu(self.conv1(x))))
        stage2 = self.batch_norm2(torch.relu(self.conv2(stage1)))
        pooled = self.adaptive_avg_pool(stage2).flatten(1)
        return torch.relu(self.linear1(pooled))


# Build the network; place it on the GPU only when one is present and requested
model = CNN().cuda() if (gpu_available and use_gpu) else CNN()
# Show the layer layout
print(model)

CNN(
  (conv1): Conv1d(2, 32, kernel_size=(3,), stride=(2,), padding=(1,))
  (batch_norm1): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (maxpool1): MaxPool1d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(32, 64, kernel_size=(3,), stride=(2,), padding=(1,))
  (batch_norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (adaptive_avg_pool): AdaptiveAvgPool1d(output_size=1)
  (linear1): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# Per-layer output shapes and parameter counts for a batch of 10 samples,
# each with 2 channels of 335 features
summary(model, input_size=(10, 2, 335))

Layer (type:depth-idx)                   Output Shape              Param #
CNN                                      [10, 1]                   --
├─Conv1d: 1-1                            [10, 32, 168]             224
├─BatchNorm1d: 1-2                       [10, 32, 168]             64
├─MaxPool1d: 1-3                         [10, 32, 83]              --
├─Conv1d: 1-4                            [10, 64, 42]              6,208
├─BatchNorm1d: 1-5                       [10, 64, 42]              128
├─AdaptiveAvgPool1d: 1-6                 [10, 64, 1]               --
├─Linear: 1-7                            [10, 1]                   65
Total params: 6,689
Trainable params: 6,689
Non-trainable params: 0
Total mult-adds (M): 2.99
Input size (MB): 0.03
Forward/backward pass size (MB): 1.29
Params size (MB): 0.03
Estimated Total Size (MB): 1.34

In [None]:
# Choose optimizer: Adam over all model parameters, with learning rate 1e-3
# and weight decay 1e-3.
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.001)

In [None]:
# Choose loss function: L1 loss, i.e. the mean absolute error between
# predictions and targets.
criterion = nn.L1Loss()

In [None]:
# Define a function to track the loss of each epoch
def get_loss(loader, model, loss_fn=None):
  """Return the mean per-batch loss of `model` over every batch in `loader`.

  The model is switched to evaluation mode for the measurement, so BatchNorm
  layers use their running statistics and those statistics are not changed by
  the evaluation data. The previous train/eval mode is restored afterwards.
  No gradients are recorded.

  Args:
    loader: Sized iterable yielding (features, targets) tensor pairs.
    model: The torch module to evaluate.
    loss_fn: Callable ``loss_fn(outputs, targets)``. Defaults to the
      notebook-level `criterion`.

  Returns:
    A 0-dimensional tensor: the sum of the batch losses divided by the number
    of batches (every batch has the same weight, whatever its size).

  Raises:
    ValueError: If `loader` contains no batches.
  """
  if loss_fn is None:
    loss_fn = criterion

  n_batches = len(loader)
  if n_batches == 0:
    raise ValueError("loader yields no batches; cannot average the loss")

  # Send batches to wherever the model's weights live, so the inputs can never
  # end up on a different device than the model.
  first_param = next(model.parameters(), None)
  device = first_param.device if first_param is not None else torch.device("cpu")

  was_training = model.training
  model.eval()
  try:
    total = 0
    with torch.no_grad():  # evaluation only: no graph is built
      for features, targets in loader:
        outputs = model(features.to(device))
        total = total + loss_fn(outputs, targets.to(device))
  finally:
    # Hand the model back in the mode the caller left it in
    model.train(was_training)
  return total / n_batches

In [None]:
# Define early stop function
class EarlyStopper:
    """Stop training when the validation loss stops improving.

    Every strictly lower validation loss resets the patience counter and saves
    a checkpoint of the model. Any other value - higher, equal, or NaN - counts
    as an epoch without improvement, so a diverged run also gets stopped.

    Args:
        patience: Number of non-improving epochs (since the last improvement)
            after which `early_stop` returns True.
        model: Module to checkpoint. When None, the notebook-level `model`
            is looked up at the moment a checkpoint is written.
        checkpoint_path: Where the best model is saved.
    """

    def __init__(self, patience=1, model=None, checkpoint_path=r"./best_model.pt"):
        self.patience = patience
        self.model = model
        self.checkpoint_path = checkpoint_path
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record one epoch's validation loss; return True when training should stop.

        Args:
            validation_loss: Loss for the epoch, as a float or a 0-dim tensor.

        Returns:
            True once `patience` epochs without improvement have accumulated
            since the last improvement, otherwise False.
        """
        # Store a plain float so no (GPU) tensor is kept alive between epochs
        validation_loss = float(validation_loss)

        if validation_loss < self.min_validation_loss:
            # New best: remember it, restart the countdown and checkpoint
            self.min_validation_loss = validation_loss
            self.counter = 0
            best_model = self.model if self.model is not None else model
            # Keep track of the best model by saving it on the hard drive.
            torch.save(best_model, self.checkpoint_path)
            return False

        # No improvement. NaN lands here too, because NaN < x is always False.
        self.counter += 1
        return self.counter >= self.patience

In [None]:
# Instantiate early stopping.
# patience=3: up to two worse-than-best epochs since the last improvement are
# tolerated; the third one makes early_stop() return True.
early_stopper = EarlyStopper(patience=3)

## Train CNN Model on the Reconstructed Data

In [None]:
# Run up to 5 epochs of mini-batch training with early stopping
for epoch in range(5):
    for batch_idx, (features, targets) in enumerate(train_dataloader):
        model.train()
        # Batches follow the model onto the GPU when it is in use
        if gpu_available and use_gpu:
          features, targets = features.cuda(), targets.cuda()

        # One optimisation step: clear old gradients, forward, backward, update
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Log every second batch
        if not batch_idx % 2:
          print("Epoch", epoch + 1, " batch", batch_idx+1, ". Training Loss: ", loss.item())

    # End-of-epoch losses on the training and validation loaders
    training_loss = get_loss(train_dataloader, model)
    validation_loss = get_loss(val_dataloader, model)
    print(f'Epoch [{epoch + 1}/{5}], Train Loss: {training_loss.item()}, Validation Loss: {validation_loss.item()}')

    # The stopper also checkpoints the model whenever validation loss improves
    should_stop = early_stopper.early_stop(validation_loss)
    if should_stop:
        print("Validation loss hasn't dropped. Early stopping!")
        break

Epoch 1  batch 1 . Training Loss:  0.6903325319290161
Epoch 1  batch 3 . Training Loss:  0.49306827783584595
Epoch 1  batch 5 . Training Loss:  0.49187278747558594
Epoch 1  batch 7 . Training Loss:  0.43141603469848633
Epoch 1  batch 9 . Training Loss:  0.43294939398765564
Epoch [1/5], Train Loss: 0.4084923267364502, Validation Loss: 0.37857121229171753
Epoch 2  batch 1 . Training Loss:  0.5049095153808594
Epoch 2  batch 3 . Training Loss:  0.482426255941391
Epoch 2  batch 5 . Training Loss:  0.23577530682086945
Epoch 2  batch 7 . Training Loss:  0.41562342643737793
Epoch 2  batch 9 . Training Loss:  0.4032500386238098
Epoch [2/5], Train Loss: 0.4043997824192047, Validation Loss: 0.356304407119751
Epoch 3  batch 1 . Training Loss:  0.4563234746456146
Epoch 3  batch 3 . Training Loss:  0.27815306186676025
Epoch 3  batch 5 . Training Loss:  0.38014858961105347
Epoch 3  batch 7 . Training Loss:  0.36539992690086365
Epoch 3  batch 9 . Training Loss:  0.2529497444629669
Epoch [3/5], Train L

## Evaluate Model Performance on the Test Set

In [None]:
# Once training is done, load the best model (might not be the last model due to early stopping)
# The checkpoint is a whole pickled module written earlier by this notebook,
# so it needs weights_only=False on PyTorch versions where weights-only
# loading is the default.
# NOTE: unpickling runs arbitrary code - only load checkpoints you created yourself.
model1 = torch.load(r"./best_model.pt", weights_only=False)
# The checkpoint was saved mid-training; switch to evaluation mode so that
# BatchNorm uses its running statistics for the test measurement.
model1.eval()
test_loss = get_loss(test_dataloader, model1)
print("best model performance Test set:", test_loss)

best model performance Test set: tensor(0.2328, device='cuda:0')
