In [1]:
# Notebook parameters, exposed as Colab form fields through the `#@param` markers.
# NOTE: UPLOAD_PREDS and SAVE_PREDS hold the *strings* "True"/"False", not booleans.
# Any non-empty string is truthy in Python, so compare them against "True"
# explicitly rather than testing them directly in an `if`.
UPLOAD_PREDS = "True" #@param ['False', 'True']
SAVE_PREDS = "True" #@param ['False', 'True']
# MODEL_NAME goes into the predictions filename; MODEL_ID is passed to the upload call.
MODEL_NAME = '<model_name_here>' #@param {type:"string"}
MODEL_ID = '<model_id_here>' #@param {type:"string"}

# Setup

## Connecting Google Drive

In [None]:
# Mount Google Drive under /content/gdrive/.
# The path constants defined in this notebook are relative (`gdrive/...`),
# so they resolve only while the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive/')

In [3]:
from pathlib import Path

# Project root on the mounted drive, plus one sub-folder each for data,
# source artefacts (e.g. saved weights) and results.
DIR = Path('gdrive') / 'MyDrive' / 'numerai'
DATADIR, SRCDIR, RESULTDIR = (DIR / sub for sub in ('data', 'src', 'results'))

In [4]:
# Copy .env from numerai folder to root dir
# (i.e. into the current working directory).
# NOTE(review): the source folder is `MyDrive/Data/numerai`, whereas DIR points
# at `MyDrive/numerai` -- confirm that this difference is intended.
!cp gdrive/MyDrive/Data/numerai/.env .env

## Installing and Importing Dependencies
First, we install and import the necessary packages. This cell is currently set *not* to print any output; if you run into any issues and need to check for error messages, comment out the `%%capture` line

In [5]:
%%capture
# install
# NOTE(review): `--no-input` only disables prompting; pip's flag for confirming
# an uninstall is `-y`/`--yes` -- verify that pandas is really removed here.
!pip uninstall --no-input pandas
# None of the packages below are version-pinned, so a fresh runtime installs
# whatever release is current at that time.
!pip install --upgrade python-dotenv fastai numerapi
# ipython-autotime provides the `autotime` extension that is loaded further down
!pip install ipython-autotime
!pip install torchmetrics

# import dependencies
import gc
import os
import csv
from dotenv import load_dotenv, find_dotenv
from getpass import getpass
import numerapi
from fastai.tabular.all import *
from pathlib import Path
from scipy.stats import spearmanr
import sklearn.linear_model

from tqdm import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import torch

In [6]:
# Render figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults
# (seaborn's base settings first, then the "ticks" style and "paper" context)
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

# Get number of CPUs
# CPUs is reused later as the DataLoader worker count.
import multiprocessing
CPUs = multiprocessing.cpu_count()
print(f"CPU count: {CPUs}")

# Load the autotime extension (installed above via ipython-autotime);
# it is what produces the "time: ..." line under each cell
%load_ext autotime

CPU count: 8
time: 265 µs (started: 2022-10-02 18:53:22 +00:00)


## Setting up numerapi
We will use the [numerapi](https://github.com/uuazed/numerapi) package to access the data and make submissions. For this to work, numerapi needs to use your API keys (which can be obtained [here](https://numer.ai/submit)). We will set up two main ways of passing these API keys to a numerapi instance:
1. Read a `.env` file using the `python-dotenv` package. This will require you to upload a `.env` file (which contains your secret key and should *not* be kept under version control). Using this method means you will not have to directly enter your keys each time you use this notebook, though you will need to re-upload the `.env` file.
2. Manually entering the API keys -- if you don't have access to, or don't want to mess with, your `.env` file.

If you have a `.env` file, upload it to the default working directory, `content`, now. In either case, run the cell below to set up the numerapi instance. See [Appendix A](#app_a) for instructions on generating and downloading a .env file.

In [7]:
# Load the numerapi credentials from .env or prompt for them if not available
def credential():
    """Make sure both Numerai API keys are present in ``os.environ``.

    Values are first loaded from a ``.env`` file (located with ``find_dotenv``).
    For each of ``NUMERAI_PUBLIC_KEY`` and ``NUMERAI_SECRET_KEY``, in that order,
    a confirmation is printed when the variable is set; otherwise the user is
    prompted for it with ``getpass`` and the answer is stored in the environment.
    """
    load_dotenv(find_dotenv())

    # (environment variable, word used in the messages) -- public key first
    for env_name, label in (("NUMERAI_PUBLIC_KEY", "Public"), ("NUMERAI_SECRET_KEY", "Secret")):
        if not os.getenv(env_name):
            os.environ[env_name] = getpass(
                f"Please enter your Numerai {label} Key. You can find your key here: https://numer.ai/submit -> "
            )
        else:
            print(f"Loaded Numerai {label} Key into Global Environment!")

# Populate the environment, then build the numerapi client from the two keys
credential()
public_key, secret_key = (
    os.getenv(env_name) for env_name in ("NUMERAI_PUBLIC_KEY", "NUMERAI_SECRET_KEY")
)
napi = numerapi.NumerAPI(public_id=public_key, secret_key=secret_key, verbosity="info")

Loaded Numerai Public Key into Global Environment!
Loaded Numerai Secret Key into Global Environment!
time: 3.72 ms (started: 2022-10-02 18:53:22 +00:00)


You can read up on the functionality of numerapi [here](https://github.com/uuazed/numerapi). You can use it to download the competition data, view other numerai users' public profiles, check submission status, manage your stake, and much more. In this case, we'll only be using it to download competition data and submit predictions.



# Data preparation


## Downloading Competition Data
In a more structured project, you'll probably want to keep the data in a separate directory from your scripts etc. You could also link Google Colab to your Google Drive and store the data there in order to avoid needing to download and process the data every time. In this case, however, we'll keep everything in `./content`, and download the data fresh each time.

In [8]:
# Ask the API whether a new round has opened, and report the answer
round_message = (
    "new round has started within the last 24hours!"
    if napi.check_new_round()
    else "no new round within the last 24 hours"
)
print(round_message)

no new round within the last 24 hours
time: 270 ms (started: 2022-10-02 18:53:22 +00:00)


In [9]:
# Fetch the current round's dataset only when its folder is not on disk yet
current_dataset_dir = f'{DATADIR}/numerai_dataset_{napi.get_current_round()}'
if os.path.exists(current_dataset_dir):
    print("Current round already downloaded")
else:
    napi.download_current_dataset(dest_path=DATADIR, unzip=True)

Current round already downloaded
time: 659 ms (started: 2022-10-02 18:53:22 +00:00)


## Reading the data into memory

If you look at the files we downloaded above, you'll see a `numerai_tournament_data.csv` file and a `numerai_training_data.csv` file. The "tournament" file contains many rows with targets which we can use for validation, so let's extract those and combine them with our training set. 

In [10]:
# Work on the current round (uncomment the override below to pin a specific one)
ROUND_NUM = napi.get_current_round()
# ROUND_NUM = 260
print(f"Round number {ROUND_NUM}")

# Both CSV files sit in the per-round dataset folder
train_file, tourn_file = (
    Path(f'{DATADIR}/numerai_dataset_{ROUND_NUM}/numerai_{split}_data.csv')
    for split in ('training', 'tournament')
)

# Only the header row of the training file is read: it supplies the column
# names needed to declare dtypes before loading any data
with open(train_file, 'r') as f:
    column_names = next(csv.reader(f))
    print(f"Detected {len(column_names)} columns")

# Feature and target columns are loaded as half-precision floats to save memory
DTYPES = dict.fromkeys(
    (name for name in column_names if name.startswith(('feature', 'target'))),
    'float16',
)

# Load the tournament file, indexed by row id
print('Reading tournament data...')
df_tourn = pd.read_csv(tourn_file, dtype=DTYPES, engine='c')
df_tourn = df_tourn.set_index("id")
# `df` is a second name for the same frame -- no copy is made
df = df_tourn

Round number 336
Detected 314 columns
Reading tournament data...
time: 1min 33s (started: 2022-10-02 18:53:23 +00:00)


In [11]:
# Every column whose name contains "feature" is treated as a model input
RAW_FEAT_COLS = [col for col in df.columns if "feature" in col]
# Same list object as RAW_FEAT_COLS: no engineered features have been added yet
FEAT_COLS = RAW_FEAT_COLS

# Bucket the raw features by the group name that follows the "feature_" prefix
FEAT_GROUPS = {
    group: list(filter(lambda col: col.startswith("feature_" + group), RAW_FEAT_COLS))
    for group in ("intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution")
}

time: 1.71 ms (started: 2022-10-02 18:54:57 +00:00)


# Evaluation Metrics

In this section, we will define two key evaluation metrics used to assess the performance of models before submitting to the tournament. These metrics are:
- Average Spearman Correlation per era: The sum of each era's Spearman correlation divided by the number of eras.
- Sharpe Ratio: The average correlation per era divided by the standard deviation of the correlations per era.

Both are defined in reasonable detail [here](https://wandb.ai/carlolepelaars/numerai_tutorial/reports/How-to-get-Started-With-Numerai--VmlldzoxODU0NTQ). The methods defined below are modified versions of the methods described in that post.

In [12]:
def score_corr(df: pd.DataFrame) -> np.float32:
    """
    Mean per-era Spearman correlation between targets and predictions.
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: The Spearman correlation of each era, averaged over all eras.
    """
    # One correlation per era; spearmanr returns (correlation, p-value)
    era_corrs = df.groupby("era").apply(
        lambda era_rows: spearmanr(era_rows["target"], era_rows["prediction"])[0]
    )
    return era_corrs.mean()

def score_spear(y_true, y_pred, axis=0):
    """Return the Spearman correlation of ``y_true`` and ``y_pred`` along ``axis``."""
    result = spearmanr(y_true, y_pred, axis=axis)
    # First element is the correlation; the p-value is discarded
    return result[0]

def score_sharpe(df: pd.DataFrame) -> np.float32:
    """
    Sharpe ratio of the per-era Spearman correlations.
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: Mean of the per-era correlations divided by their standard deviation.
    """
    # One correlation per era; spearmanr returns (correlation, p-value)
    era_corrs = df.groupby("era").apply(
        lambda era_rows: spearmanr(era_rows["target"], era_rows["prediction"])[0]
    )
    mean_corr, corr_spread = era_corrs.mean(), era_corrs.std()
    return mean_corr / corr_spread

def feature_exposures(df: pd.DataFrame, preds: pd.Series, feat_cols=None):
    """ Spearman correlation between the predictions and each feature column.
        https://forum.numer.ai/t/model-diagnostics-feature-exposure/899

    :param df: DataFrame holding the feature columns.
    :param preds: Predictions, one per row of ``df``.
    :param feat_cols: Columns to measure; the raw feature columns when omitted.
    :return: NumPy array with one exposure per column, in ``feat_cols`` order.
    """
    columns = RAW_FEAT_COLS if feat_cols is None else feat_cols
    # spearmanr returns (correlation, p-value); only the correlation is kept
    return np.array([spearmanr(preds, df[name])[0] for name in columns])

def scores(df: pd.DataFrame, verbose=False, feat_cols=None) -> (np.float32, np.float32, np.float32, np.float32):
    """ Score models across a variety of metrics.

    :param df: DataFrame with the columns "era", "target" and "prediction",
        plus every column listed in ``feat_cols``.
    :param verbose: When True, print the four metrics.
    :param feat_cols: Feature columns used for the exposure metrics; the raw
        feature columns when omitted.
    :return: ``(val_corr, val_sharpe, max_fe, rms_fe)`` -- mean per-era Spearman
        correlation, Sharpe ratio of the per-era correlations, largest absolute
        feature exposure, and root-mean-square feature exposure.
    """
    if feat_cols is None:
        feat_cols = RAW_FEAT_COLS
    val_sharpe = score_sharpe(df)
    val_corr = score_corr(df)
    fe = feature_exposures(df, df['prediction'], feat_cols=feat_cols)
    # Exposure is a signed correlation: a strongly negative exposure is just as
    # much of a dependency as a positive one, so the maximum is taken over
    # magnitudes (the RMS below is sign-agnostic in the same way).
    max_fe = np.max(np.abs(fe))
    rms_fe = np.sqrt(np.mean(np.square(fe)))

    if verbose:
        print(f'Spearman:\t{val_corr:.4f}')
        print(f'Sharpe:\t\t{val_sharpe:.4f}')
        print(f'Max exposure:\t{max_fe:.4f}')
        print(f'RMS exposure:\t{rms_fe:.4f}')

    return val_corr, val_sharpe, max_fe, rms_fe

def visualise_feat_exposure(df: pd.DataFrame, model, feat_cols: list, fe_cols: list):
    """ Visualises mean and max feature exposure for a trained model over eras

    :param df: DataFrame with an "era" column and the columns in ``feat_cols``
        and ``fe_cols``. It is copied, so the caller's frame is not modified.
    :param model: Object whose ``predict`` method is called on ``df[feat_cols]``.
    :param feat_cols: Columns passed to ``model.predict``.
    :param fe_cols: Columns whose exposure is measured and plotted.
    """
    _df = df.copy()
    _df['prediction'] = model.predict(_df[feat_cols])

    maxes, means, eras = [], [], _df.era.unique()
    for era in tqdm(eras):
        era_df = _df[_df.era == era]
        era_fe = feature_exposures(era_df, era_df['prediction'], feat_cols=fe_cols)
        # Exposures are signed correlations; the largest *magnitude* is the one
        # of interest, otherwise strong negative exposures are never shown.
        maxes.append(np.abs(era_fe).max())
        # The series labelled "Mean FE" is the root-mean-square exposure
        means.append(np.sqrt(np.mean(np.square(era_fe))))

    plt.plot(eras, means, marker='.', label='Mean FE')
    plt.plot(eras, maxes, marker='.', label='Max FE')
    plt.legend()
    plt.xlabel('Era')
    plt.ylabel('Exposure')
    plt.title(f'Exposure of {len(fe_cols)} features by era')


time: 6.03 ms (started: 2022-10-02 18:54:57 +00:00)


# Modeling the Data

### Autoencoder model

In [13]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.cuda.amp import autocast
from torchsummary import summary
from sklearn.model_selection import train_test_split

# Run on the first CUDA GPU when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('Device:', device)

# Write my own dataset object
class NumeraiDataset(Dataset):
    """Dataset yielding ``(features, target)`` float32 tensors, one row at a time.

    The feature columns (``FEAT_COLS``) are copied from the given frame and
    re-indexed positionally; the "target" column is kept as an array.
    """

    def __init__(self, df: pd.DataFrame):
        features = df[FEAT_COLS].copy()
        # Drop the original index so rows are addressed purely by position
        self.df = features.reset_index(drop=True)
        self.target = df['target'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Tensor indices are converted to plain Python values before indexing
        position = idx.tolist() if torch.is_tensor(idx) else idx

        X = torch.as_tensor(self.df.iloc[position, :].values, dtype=torch.float32)
        y = torch.as_tensor(self.target[position], dtype=torch.float32)
        return (X, y)


def train_net(net, trainloader, valloader, optimiser, enc_criterion, pred_criterion, epochs=1, checkpoints=None):
    """ Train the network for the specified number of epochs, tracking train
    and validation losses.

    Each batch contributes two losses: a reconstruction loss
    (``enc_criterion(net(inputs), inputs)``) and a prediction loss
    (``pred_criterion(net.predict(inputs), labels)``). Both are backpropagated
    before a single optimiser step.

    :param net: Model exposing ``__call__`` (reconstruction) and ``predict``.
    :param trainloader: DataLoader yielding ``(inputs, labels)`` training batches.
    :param valloader: DataLoader yielding ``(inputs, labels)`` validation batches.
    :param optimiser: Optimiser holding the parameters of ``net``.
    :param enc_criterion: Loss applied to the reconstruction.
    :param pred_criterion: Loss applied to the predictions.
    :param epochs: Number of passes over ``trainloader``.
    :param checkpoints: Optional directory; when given, the model and optimiser
        state are saved to ``best_checkpoint.tar`` whenever both validation
        losses are at their lowest value so far.
    :return: Four lists with one entry per completed epoch: training
        autoencoder loss, validation autoencoder loss, training predictor loss,
        validation predictor loss.

    A ``KeyboardInterrupt`` stops training early and the losses collected so
    far are still returned. The network is left in evaluation mode after the
    last completed validation pass.
    """

    if checkpoints is not None:
        checkpoints = Path(checkpoints)
        checkpoints.mkdir(parents=True, exist_ok=True)

    # Create lists for tracking loss and accuracy
    epoch_losses_enc, epoch_val_losses_enc, epoch_losses_pred, epoch_val_losses_pred = [], [], [], []

    # Defined up-front so the interrupt message can always report progress
    epoch = 0
    try:
        # loop over the dataset multiple times
        for epoch in tqdm(range(epochs), desc='Train'):
            # Training mode: BatchNorm layers use batch statistics and update
            # their running estimates
            net.train()
            running_loss_enc, running_loss_pred = 0.0, 0.0

            # Get a minibatch of data
            for inputs, labels in trainloader:

                # Move data to GPU (if available)
                inputs, labels = inputs.to(device), labels.to(device)

                # Reset gradients
                optimiser.zero_grad()

                # Forward pass (of autoencoder)
                outputs = net(inputs)

                # Calculate loss of autoencoder
                enc_loss = enc_criterion(outputs, inputs)
                # Calculate the loss of predictor
                preds = net.predict(inputs)
                pred_loss = pred_criterion(preds, labels)

                # Update and log training losses
                running_loss_enc += enc_loss.item()
                running_loss_pred += pred_loss.item()

                # Backpropagation and weight updating
                enc_loss.backward()
                pred_loss.backward()
                optimiser.step()

            # Calculate avg loss over epoch
            # NOTE(review): the per-batch losses are summed and divided by the
            # number of *samples*; with a mean-reducing criterion that is not a
            # per-sample average -- confirm the reduction the criteria use.
            epoch_losses_enc.append(running_loss_enc / len(trainloader.dataset))
            epoch_losses_pred.append(running_loss_pred / len(trainloader.dataset))

            print(f"Training loss | Autoencoder: {epoch_losses_enc[-1]:.4f}\tPredictor: {epoch_losses_pred[-1]:.4f}")

            # Calculate validation loss over epoch.
            # Evaluation mode: BatchNorm uses its running statistics and does not
            # update them, so validation data never leaks into the model.
            net.eval()
            val_loss_enc, val_loss_pred = 0.0, 0.0
            with torch.no_grad():
                for inputs, labels in valloader:
                    # Move data to GPU (if available)
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = net(inputs)
                    val_loss_enc += enc_criterion(outputs, inputs).item()
                    preds = net.predict(inputs)
                    val_loss_pred += pred_criterion(preds, labels).item()
            epoch_val_losses_enc.append(val_loss_enc / len(valloader.dataset))
            epoch_val_losses_pred.append(val_loss_pred / len(valloader.dataset))
            print(f"Validation loss | Autoencoder: {epoch_val_losses_enc[-1]:.4f}\tPredictor: {epoch_val_losses_pred[-1]:.4f}")

            # Checkpointing model states: only when BOTH validation losses are
            # at (or tied with) their best value so far
            if checkpoints is not None:
                if epoch_val_losses_enc[-1] <= np.min(epoch_val_losses_enc) and epoch_val_losses_pred[-1] <= np.min(epoch_val_losses_pred):
                    print("Best model. Checkpointing...")
                    torch.save({
                        'epoch': epoch,
                        'model_state_dict': net.state_dict(),
                        'optimizer_state_dict': optimiser.state_dict(),
                        'val_losses_pred': epoch_val_losses_pred,
                        'val_losses_enc': epoch_val_losses_enc,
                    }, checkpoints/'best_checkpoint.tar')


        print('\nTraining complete')

    except KeyboardInterrupt:
        print(f'\nKeyboard interrupt! Stopping training after {epoch} of {epochs}...')

    return epoch_losses_enc, epoch_val_losses_enc, epoch_losses_pred, epoch_val_losses_pred


class Autoencoder(nn.Module):
    # Based on https://github.com/astorfi/differentially-private-cgan/blob/master/UCI/autoencoder.py
    """1-D convolutional autoencoder with a fully-connected prediction head.

    ``encoder`` is a stack of five ``Conv1d`` layers, ``decoder`` a stack of four
    ``ConvTranspose1d`` layers ending in a sigmoid, and ``fc`` a three-layer
    perceptron ending in a sigmoid. ``fc`` consumes the flattened encoding
    concatenated with the raw input, and its input width is fixed at 320 + 310.

    Layer order (and therefore state-dict keys) must not change, otherwise
    previously saved weights can no longer be loaded.
    """

    def __init__(self):
        super().__init__()
        base = 4  # channels of the first convolution; deeper layers use multiples of it

        # Convolutions use no padding and unit dilation (the layer defaults)
        self.encoder = nn.Sequential(
            nn.Conv1d(1, base, kernel_size=5, stride=2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(base, 2 * base, kernel_size=5, stride=2),
            nn.BatchNorm1d(2 * base),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(2 * base, 4 * base, kernel_size=5, stride=3),
            nn.BatchNorm1d(4 * base),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(4 * base, 8 * base, kernel_size=5, stride=3),
            nn.BatchNorm1d(8 * base),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv1d(8 * base, 16 * base, kernel_size=3, stride=1),
            nn.ReLU(),
        )

        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(16 * base, 8 * base, kernel_size=5, stride=1),
            nn.LeakyReLU(0.2, inplace=True),
            nn.ConvTranspose1d(8 * base, 4 * base, kernel_size=5, stride=4),
            nn.BatchNorm1d(4 * base),
            nn.LeakyReLU(0.2, inplace=True),
            nn.ConvTranspose1d(4 * base, 2 * base, kernel_size=7, stride=4),
            nn.BatchNorm1d(2 * base),
            nn.LeakyReLU(0.2, inplace=True),
            nn.ConvTranspose1d(2 * base, 1, kernel_size=10, stride=2),
            nn.Sigmoid(),
        )

        # Prediction head: flattened encoding (320 values) + raw input (310 values)
        self.fc = nn.Sequential(
            nn.Linear(320 + 310, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Encode then decode ``x`` (batch, features); returns the reconstruction."""
        single_channel = x.view(-1, 1, x.shape[1])
        reconstruction = self.decoder(self.encoder(single_channel))
        return reconstruction.squeeze(dim=1)

    def encode(self, x):
        """Run only the encoder on ``x`` (batch, features)."""
        single_channel = x.view(-1, 1, x.shape[1])
        return self.encoder(single_channel).squeeze(dim=1)

    def decode(self, x):
        """Run only the decoder on an encoding ``x``."""
        return self.decoder(x).squeeze(dim=1)

    def predict(self, x):
        """Predict one value per row from the encoding of ``x`` plus ``x`` itself.

        The encoding is computed under ``torch.no_grad()``, so no gradient flows
        from the prediction head back into the encoder.
        """
        with torch.no_grad():
            code = self.encode(x)
        flat_code = code.view(-1, code.shape[1] * code.shape[2])
        head_input = torch.cat([flat_code, x], dim=1)
        return self.fc(head_input).squeeze(dim=1)

# Build the model on the selected device and print a layer-by-layer summary
# for an input with one value per feature column
net = Autoencoder()
net = net.to(device)
summary(net, (len(FEAT_COLS),))


Device: cuda:0
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 4, 153]              24
         LeakyReLU-2               [-1, 4, 153]               0
            Conv1d-3                [-1, 8, 75]             168
       BatchNorm1d-4                [-1, 8, 75]              16
         LeakyReLU-5                [-1, 8, 75]               0
            Conv1d-6               [-1, 16, 24]             656
       BatchNorm1d-7               [-1, 16, 24]              32
         LeakyReLU-8               [-1, 16, 24]               0
            Conv1d-9                [-1, 32, 7]           2,592
      BatchNorm1d-10                [-1, 32, 7]              64
        LeakyReLU-11                [-1, 32, 7]               0
           Conv1d-12                [-1, 64, 5]           6,208
             ReLU-13                [-1, 64, 5]               0
  ConvTranspose1d-14    

In [14]:
# Load a pytorch model
net = Autoencoder().to(device)
# map_location remaps the saved tensors onto the active device, so weights that
# were saved from a GPU session still load when only the CPU is available
net.load_state_dict(torch.load(SRCDIR / 'GTRUDA_autoencoder_statedict.pt', map_location=device))
# Switch to evaluation mode (BatchNorm layers use their running statistics);
# as the last expression of the cell this also displays the model structure
net.eval()

Autoencoder(
  (encoder): Sequential(
    (0): Conv1d(1, 4, kernel_size=(5,), stride=(2,))
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv1d(4, 8, kernel_size=(5,), stride=(2,))
    (3): BatchNorm1d(8, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv1d(8, 16, kernel_size=(5,), stride=(3,))
    (6): BatchNorm1d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv1d(16, 32, kernel_size=(5,), stride=(3,))
    (9): BatchNorm1d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.2, inplace=True)
    (11): Conv1d(32, 64, kernel_size=(3,), stride=(1,))
    (12): ReLU()
  )
  (decoder): Sequential(
    (0): ConvTranspose1d(64, 32, kernel_size=(5,), stride=(1,))
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): ConvTranspose1d(32, 16, kernel_size=(5,)

time: 533 ms (started: 2022-10-02 18:55:10 +00:00)


## Making Predictions with the final model

In [15]:
# PyTorch version
print(df_tourn.shape)
# Build the submission frame from the index alone; resetting the index of the
# whole frame would copy every feature column just to obtain the ids
df_predictions = pd.DataFrame({'id': df_tourn.index})
print(df_predictions.shape)

# shuffle=False keeps the predictions in the same row order as df_tourn
testloader = DataLoader(NumeraiDataset(df_tourn), batch_size=2**12, shuffle=False, num_workers=CPUs)
batch_preds = []
with torch.no_grad():
    # Labels are not needed for inference, so only the inputs go to the device
    for inputs, _labels in testloader:
        batch_preds.append(net.predict(inputs.to(device)).cpu().numpy())
# One contiguous array rather than a Python list with one object per row
preds = np.concatenate(batch_preds)
assert len(preds) == len(df_tourn), "expected exactly one prediction per tournament row"
df_tourn['prediction'] = preds
print(df_tourn.shape)

print('Scoring predictions on validation data...')
# Only rows marked as validation are scored
_df = df_tourn[df_tourn.data_type == 'validation']
val_corr, val_sharpe, max_fe, rms_fe = scores(_df, verbose=True)

# Free up memory
# del df_tourn

df_predictions['prediction'] = preds
print(df_predictions.shape)

(2104564, 313)
(2104564, 1)
(2104564, 314)
Scoring predictions on validation data...
Spearman:	0.0215
Sharpe:		0.9875
Max exposure:	0.1931
RMS exposure:	0.0816
(2104564, 2)
time: 1min 29s (started: 2022-10-02 18:55:11 +00:00)


In [16]:
# Summary statistics of the predictions, rounded to three decimals
df_predictions['prediction'].describe().round(3)

count    2104564.000
mean           0.499
std            0.029
min            0.180
25%            0.489
50%            0.498
75%            0.509
max            0.792
Name: prediction, dtype: float64

time: 85 ms (started: 2022-10-02 18:56:40 +00:00)


# Submission

In [17]:
def _flag_enabled(value) -> bool:
    """Interpret a notebook flag given as a real bool or as the string "True"/"False".

    The form fields at the top of the notebook produce strings, and a non-empty
    string such as "False" is truthy, so the flags must not be tested directly.
    """
    return str(value).strip().lower() == "true"


PRED_FILENAME = f"{RESULTDIR}/predictions_{MODEL_NAME}_{napi.get_current_round()}.csv"

if _flag_enabled(SAVE_PREDS):
    # Make sure the results folder exists before writing into it
    Path(RESULTDIR).mkdir(parents=True, exist_ok=True)
    df_predictions.to_csv(PRED_FILENAME, index=False)

# The upload reads PRED_FILENAME from disk, so it relies on the file having been
# written (in this run or an earlier one)
if _flag_enabled(UPLOAD_PREDS):
    napi.upload_predictions(PRED_FILENAME, model_id=MODEL_ID)

time: 17.1 s (started: 2022-10-02 18:56:40 +00:00)
