# Setup

In [1]:
# Human-readable model name; used further down to build the predictions file name.
MODEL_NAME = 'V3X'
# Numerai model id passed to `upload_predictions` in the submission cell.
# Replace the placeholder with a real id before submitting.
MODEL_ID = '<model_id_here>'

## Connecting Google Drive

In [None]:
# Mount Google Drive at /content/gdrive/. Later cells reach the Drive through relative
# 'gdrive/...' paths, which assumes the working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive/')

In [3]:
from pathlib import Path

# Root of the numerai folder on the mounted Drive. This is the same folder the `.env`
# file is copied from in the next cell, so credentials, data and results share one root.
DIR = Path('gdrive/MyDrive/Data/numerai')
DATADIR = DIR / 'data'        # downloaded round datasets
SRCDIR = DIR / 'src'          # persisted model files
RESULTDIR = DIR / 'results'   # prediction CSVs

In [4]:
# Copy the .env file (API keys) from the numerai folder on Drive into the working
# directory so the credential cell below can pick it up via find_dotenv().
!cp gdrive/MyDrive/Data/numerai/.env .env

## Installing and Importing Dependencies
First, we install and import the necessary packages. This cell is currently set *not* to print any output; if you run into any issues and need to check for error messages, comment out the `%%capture` line

In [5]:
%%capture
# install
# NOTE(review): `--no-input` only disables interactive prompts; `pip uninstall` normally
# needs `-y` to confirm the removal, and `%%capture` hides anything this line prints --
# confirm whether pandas is actually being uninstalled here.
!pip uninstall --no-input pandas
# Upgrade the third-party packages imported below
!pip install --upgrade python-dotenv fastai numerapi
# Provides the `autotime` extension that is loaded in the next cell
!pip install ipython-autotime

# import dependencies
import gc
import os
from dotenv import load_dotenv, find_dotenv
from getpass import getpass
import numerapi
from fastai.tabular.all import *
from pathlib import Path
from scipy.stats import spearmanr
import sklearn.linear_model

from tqdm import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns

In [6]:
# Render matplotlib figures inline, using the 'retina' figure format
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Set sensible defaults: seaborn defaults first, then the "ticks" style and "paper" context
sns.set()
sns.set_style("ticks")
sns.set_context('paper')

# Load the autotime extension (it produces the "time: ..." line shown under executed cells)
%load_ext autotime

time: 273 µs (started: 2023-01-07 17:02:27 +00:00)


## Setting up numerapi
We will use the [numerapi](https://github.com/uuazed/numerapi) package to access the data and make submissions. For this to work, numerapi needs to use your API keys (which can be obtained [here](https://numer.ai/submit)). We will set up two main ways of passing these API keys to a numerapi instance:
1. Read a `.env` file using the `python-dotenv` package. This will require you to upload a `.env` file (which contains your secret key and should *not* be kept under version control). Using this method means you will not have to directly enter your keys each time you use this notebook, though you will need to re-upload the `.env` file.
2. Manually entering the API keys -- if you don't have access to, or don't want to mess with, your `.env` file.

If you have a `.env` file, upload it to the default working directory, `content`, now. In either case, run the cell below to set up the numerapi instance. See [Appendix A](#app_a) for instructions on generating and downloading a .env file.

In [None]:
# Load the numerapi credentials from .env or prompt for them if not available
def credential():
    """Make sure both Numerai API keys are present in ``os.environ``.

    Variables from the ``.env`` file located by ``find_dotenv`` are loaded first.
    Each key that is still unset (or empty) afterwards is requested with ``getpass``
    and stored in the environment; keys that are already set are only reported.
    """
    load_dotenv(find_dotenv())

    # (environment variable, message when already set, prompt when missing)
    key_specs = (
        (
            "NUMERAI_PUBLIC_KEY",
            "Loaded Numerai Public Key into Global Environment!",
            "Please enter your Numerai Public Key. You can find your key here: https://numer.ai/submit -> ",
        ),
        (
            "NUMERAI_SECRET_KEY",
            "Loaded Numerai Secret Key into Global Environment!",
            "Please enter your Numerai Secret Key. You can find your key here: https://numer.ai/submit -> ",
        ),
    )
    for env_name, loaded_message, prompt in key_specs:
        if not os.getenv(env_name):
            os.environ[env_name] = getpass(prompt)
            continue
        print(loaded_message)

    # The model id is intentionally not read from the environment here; the notebook
    # takes it from the MODEL_ID constant instead.

# Populate the environment with the API keys, then build the numerapi client from them
credential()
public_key = os.getenv("NUMERAI_PUBLIC_KEY")
secret_key = os.getenv("NUMERAI_SECRET_KEY")
# The model id comes from the setup constant rather than from the environment
model_id = MODEL_ID
napi = numerapi.NumerAPI(
    public_id=public_key,
    secret_key=secret_key,
    verbosity="info",
)

You can read up on the functionality of numerapi [here](https://github.com/uuazed/numerapi). You can use it to download the competition data, view other numerai users' public profiles, check submission status, manage your stake, and much more. In this case, we'll only be using it to download competition data and submit predictions.



# Data preparation


## Downloading Competition Data
In a more structured project, you'll probably want to keep the data in a separate directory from your scripts etc. This notebook links Google Colab to Google Drive and stores the data there (under `DATADIR`), so the dataset only has to be downloaded when the current round is not already present.

In [8]:
# Report whether numerapi sees a newly started round
print(
    "new round has started within the last 24hours!"
    if napi.check_new_round()
    else "no new round within the last 24 hours"
)

new round has started within the last 24hours!
time: 882 ms (started: 2023-01-07 17:02:27 +00:00)


In [9]:
# Fetch and unzip the current round's dataset only when its folder is not in DATADIR yet
_dataset_dir = DATADIR / f'numerai_dataset_{napi.get_current_round()}'
if os.path.exists(_dataset_dir):
    print("Current round already downloaded")
else:
    napi.download_current_dataset(dest_path=DATADIR, unzip=True)

gdrive/MyDrive/Data/numerai/data/numerai_dataset_394.zip: 488MB [00:28, 17.3MB/s]                           


time: 1min 2s (started: 2023-01-07 17:02:28 +00:00)


## Reading the data into memory

If you look at the files we downloaded above, you'll see a `numerai_tournament_data.csv` file and a `numerai_training_data.csv` file. The "tournament" file contains many rows with targets which we can use for validation; this cell reads both files, and the validation rows are extracted into their own dataframe further below. Note that parsing these `csv` files is time-consuming, so avoid re-running this cell within the same session unless you have to.

Question: Does this differ _during_ a tournament?

In [10]:
# Get the current round
ROUND_NUM = napi.get_current_round()
print(f"Round number {ROUND_NUM}")

# Both CSV files sit next to each other in the folder of this round's dataset
train_file = DATADIR / f'numerai_dataset_{ROUND_NUM}' / 'numerai_training_data.csv'
tourn_file = train_file.with_name('numerai_tournament_data.csv')

# training data contains features and targets
print('Reading training data...')
df_train = pd.read_csv(train_file)
df_train = df_train.set_index("id")

# tournament data: rows are tagged by `data_type`; the 'validation' rows are split off
# in a later cell and scored against their targets
print('Reading tournament data...')
df_tourn = pd.read_csv(tourn_file)
df_tourn = df_tourn.set_index("id")


Round number 394
Reading training data...
Reading tournament data...
time: 2min 3s (started: 2023-01-07 17:03:30 +00:00)


In [11]:
# Get the names of the features
FEAT_COLS = [f for f in df_train.columns if "feature" in f]

# Get the groups of the features
FEAT_GROUPS = {
    g: [c for c in FEAT_COLS if c.startswith(f"feature_{g}")]
    for g in ["intelligence", "wisdom", "charisma", "dexterity", "strength", "constitution"]
}

time: 2.03 ms (started: 2023-01-07 17:05:34 +00:00)


## Making the dataframes more memory efficient

In [12]:
# Convert all features (and target) to float16 to save memory
for _df in tqdm([df_train, df_tourn]):
    _df[[*FEAT_COLS, 'target']] = _df[[*FEAT_COLS, 'target']].astype(np.float16)

# The loop variable would otherwise stay bound to the tournament frame, so a later
# `del df_tourn` could not release it while `_df` still referenced the same object.
del _df

100%|██████████| 2/2 [03:56<00:00, 118.29s/it]

time: 3min 56s (started: 2023-01-07 17:05:34 +00:00)





In [None]:
# Copy the validation rows of the tournament data into their own frame. The explicit
# .copy() makes df_val an independent frame, so assigning columns to it later does not
# raise pandas' SettingWithCopyWarning.
df_val = df_tourn[df_tourn['data_type'] == 'validation'].copy()
# Remove df_tourn from memory
del df_tourn
gc.collect();

In [None]:
def _era_to_int(era: pd.Series) -> pd.Series:
    """Turn 'eraN' labels into int32 N; return integer columns unchanged.

    Passing integer columns through makes this cell safe to re-run: the string
    accessor is only used while the column still holds the original labels.
    """
    if pd.api.types.is_integer_dtype(era):
        return era
    # Drop the three-character 'era' prefix and parse the remainder as a number
    return era.str[3:].astype('int32')

# Convert era to int32
pd.options.mode.chained_assignment = 'warn'
df_train['era'] = _era_to_int(df_train['era'])
df_val['era'] = _era_to_int(df_val['era'])

# Modeling the Data

In this section, we will define our evaluation metrics; run two different models (a linear regression model from `scikit-learn` and an era-wise ensemble of `XGBoost` regressors blended by a linear model); and generate submission dataframes from those models.

## Evaluation Metrics

In this section, we will define two key evaluation metrics used to assess the performance of models before submitting to the tournament. These metrics are:
- Average Spearman Correlation per era: The sum of each era's Spearman correlation divided by the number of eras.
- Sharpe Ratio: The average correlation per era divided by the standard deviation of the correlations per era.

Both are defined in reasonable detail [here](https://wandb.ai/carlolepelaars/numerai_tutorial/reports/How-to-get-Started-With-Numerai--VmlldzoxODU0NTQ). The methods defined below are modified versions of the methods described in that post.

In [None]:
def score_corr(df: pd.DataFrame) -> np.float32:
    """
    Average the Spearman correlation between target and prediction over all eras
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: The mean of the per-era correlations.
    """
    def _era_rho(era_rows: pd.DataFrame) -> np.float32:
        """ Spearman correlation of a single era, for use with groupby().apply """
        rho, _pvalue = spearmanr(era_rows["target"], era_rows["prediction"])
        return rho

    per_era = df.groupby("era").apply(_era_rho)
    return per_era.mean()

def score_spear(y_true, y_pred, axis=0):
    """Return the Spearman correlation of `y_true` and `y_pred` along `axis`."""
    rho, _pvalue = spearmanr(y_true, y_pred, axis=axis)
    return rho

def score_sharpe(df: pd.DataFrame) -> np.float32:
    """
    Sharpe ratio of the per-era Spearman correlations
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :return: Mean of the per-era correlations divided by their standard deviation.
    """
    def _era_rho(era_rows: pd.DataFrame) -> np.float32:
        """ Spearman correlation of a single era, for use with groupby().apply """
        rho, _pvalue = spearmanr(era_rows["target"], era_rows["prediction"])
        return rho

    per_era = df.groupby("era").apply(_era_rho)
    mean_rho = per_era.mean()
    spread = per_era.std()
    return mean_rho / spread

def scores(df: pd.DataFrame, verbose=False) -> (np.float32, np.float32):
    """
    Compute both validation metrics for a frame of predictions
    :param df: A Pandas DataFrame containing the columns "era", "target" and "prediction"
    :param verbose: When true, print both metrics with four decimals
    :return: Tuple (mean per-era Spearman correlation, Sharpe ratio)
    """
    val_corr, val_sharpe = score_corr(df), score_sharpe(df)
    if verbose:
        print(f'\nSpearman:\t{val_corr: .4f}\nSharpe:\t\t{val_sharpe: .4f}')
    return val_corr, val_sharpe

def validate_preds(preds, df_val: pd.DataFrame, verbose=True):
    """
    Score predictions against the targets of a validation frame
    :param preds: Predictions for the rows of `df_val`
    :param df_val: Frame providing the "target" and "era" columns
    :param verbose: Forwarded to `scores`; prints the metrics when true
    :return: Whatever `scores` returns for the assembled frame
    """
    columns = {
        'prediction': preds,
        'target': df_val.target,
        'era': df_val.era,
    }
    scored = pd.DataFrame(columns).reset_index()
    return scores(scored, verbose=verbose)


## Training models

### Linear baseline
This model closely follows the tutorial example [here](https://colab.research.google.com/github/numerai/example-scripts/blob/master/making-your-first-submission-on-numerai.ipynb). We will use the `scikit-learn` package, with which we can implement and fit our regression model in just a couple of lines of code.

In [None]:
# Linear baseline: regress `target` on all feature columns of the training frame
model = sklearn.linear_model.LinearRegression().fit(
    df_train[FEAT_COLS],
    df_train['target'],
)

In [17]:
# Predict the validation rows with the baseline and score the result per era
val_preds = model.predict(df_val[FEAT_COLS])
_df = pd.DataFrame(
    dict(prediction=val_preds, target=df_val['target'], era=df_val['era'])
).reset_index()

corr_val, sharpe_val = scores(_df, verbose=True)


Spearman:	 0.0162
Sharpe:		 0.5237
time: 1.01 s (started: 2023-01-07 17:09:44 +00:00)


## Ensemble model

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.base import BaseEstimator
from sklearn.metrics import mean_squared_error, make_scorer

class EraEnsemble(BaseEstimator):
    """Two-level ensemble trained on consecutive windows of eras.

    ``fit`` slices the training frame into era windows, fits one ``subalg`` model
    per window, and then fits a ``mainalg`` model on the matrix of sub-model
    predictions for the whole training frame. ``predict`` recomputes the
    sub-model predictions and feeds them to the fitted ``mainalg`` model.

    Feature columns are taken from the notebook-level ``FEAT_COLS`` list.

    :param n_subs: Divisor used to derive the window step (``n_eras // n_subs``).
    :param pca_frac: Passed to ``PCA(n_components=...)`` when it is below 1.0;
        ``None`` or a value >= 1.0 disables the PCA step.
    :param subalg: Estimator class instantiated once per era window.
    :param mainalg: Estimator class instantiated once for the blending model.
    """

    def __init__(self,
                 n_subs=10,
                 pca_frac=None,
                 subalg=XGBRegressor,
                 mainalg=LassoCV,
                 ):
        self.n_subs = n_subs
        self.pca_frac = pca_frac
        self.subalg = subalg
        self.mainalg = mainalg
        # Fitted state; rebuilt from scratch by every call to fit()
        self.submodels = []
        self.sub_preds = []
        self.transforms = []

    def get_params(self, *args, **kwargs):
        """Return the constructor parameters as a dict (extra arguments are ignored)."""
        return {
            'n_subs': self.n_subs,
            'pca_frac': self.pca_frac,
            'subalg': self.subalg,
            'mainalg': self.mainalg,
            }

    def _use_pca(self):
        """True when a PCA projection should be applied before the sub-models."""
        # `None < 1.0` raises TypeError, so the default pca_frac=None must be
        # handled explicitly; it is treated as "no PCA".
        return self.pca_frac is not None and self.pca_frac < 1.0

    def fit(self, df, y, validation=None):
        """Fit one sub-model per era window, then the blending model.

        :param df: Training frame with an integer "era" column, a "target" column
            and the ``FEAT_COLS`` feature columns.
        :param y: Targets for the blending model, one per row of ``df``.
        :param validation: Unused; accepted only so existing calls keep working.
        :return: ``self``
        """
        # Start from a clean slate so that calling fit() again does not stack new
        # sub-models on top of the ones left over from a previous fit.
        self.submodels = []
        self.sub_preds = []
        self.transforms = []
        use_pca = self._use_pca()

        # Figure out how to partition eras
        n_eras = df.era.nunique()
        min_era = df.era.min()
        max_era = df.era.max()
        # range() rejects a zero step, which n_subs > n_eras would otherwise produce
        step = max(1, n_eras // self.n_subs)

        # Loop over era ranges
        for start in range(min_era, max_era, step):
            # NOTE(review): Series.between is inclusive on both ends by default, so
            # neighbouring windows share their boundary era -- confirm this is intended.
            window = df[df.era.between(start, start + step)]
            window_target = window['target']
            window_X = window[FEAT_COLS]

            pca = None
            if use_pca:
                pca = PCA(n_components=self.pca_frac).fit(window_X)
                window_X = pca.transform(window_X)
                self.transforms.append(pca)

            if self.subalg is XGBRegressor:
                submodel = self.subalg(verbosity=0).fit(window_X, window_target)
            else:
                submodel = self.subalg().fit(window_X, window_target)
            self.submodels.append(submodel)

            # Every sub-model predicts the *whole* training frame; these columns
            # are the inputs of the blending model.
            full_X = df[FEAT_COLS]
            if use_pca:
                full_X = pca.transform(full_X)
            self.sub_preds.append(submodel.predict(full_X))

        _X = np.array(self.sub_preds).T
        self.mainmodel = self.mainalg().fit(_X, y)

        return self

    def predict(self, df):
        """Predict with every sub-model and blend the results with the main model.

        :param df: Frame containing the ``FEAT_COLS`` feature columns.
        :return: Predictions of the fitted blending model, one per row of ``df``.
        """
        use_pca = self._use_pca()
        features = df[FEAT_COLS]
        columns = []
        for i, sm in enumerate(self.submodels):
            # Each sub-model has its own PCA, stored at the same position
            _data = self.transforms[i].transform(features) if use_pca else features
            columns.append(sm.predict(_data))
        _X = np.array(columns).T

        return self.mainmodel.predict(_X)


In [None]:
import pickle

MODEL_FILE = SRCDIR / 'era_ensemble_v3x.xgb'
RETRAIN_MODEL = True
# Opt-in: set to True to pickle the freshly trained ensemble to MODEL_FILE
SAVE_MODEL = False

model = EraEnsemble(
    mainalg=RidgeCV,
    n_subs=3,
    pca_frac=1.0,
    subalg=XGBRegressor,
)

if MODEL_FILE.is_file() and not RETRAIN_MODEL:
    print(f"Loading pre-trained model from '{MODEL_FILE}' ...")
    # EraEnsemble has no load_model() method, so the whole fitted object is restored
    # with pickle instead. The file must have been written by the save branch below.
    # SECURITY: pickle.load can execute code stored in the file -- only load files
    # you created yourself.
    with open(MODEL_FILE, 'rb') as model_fh:
        model = pickle.load(model_fh)
else:
    print("Training model...")
    model.fit(df_train, df_train['target'])
    if SAVE_MODEL:
        print(f"Saving model to '{MODEL_FILE}' ...")
        # Create the target folder first so a missing directory cannot lose a trained model
        MODEL_FILE.parent.mkdir(parents=True, exist_ok=True)
        with open(MODEL_FILE, 'wb') as model_fh:
            pickle.dump(model, model_fh)

# Score the ensemble on the validation rows
preds = model.predict(df_val)
val_corr, val_sharpe = validate_preds(preds, df_val)

Training model...

Spearman:	 0.0215
Sharpe:		 0.7300
time: 6min 14s (started: 2023-01-07 17:09:45 +00:00)


#### Making Predictions with the last Model

In [None]:
ids = []
preds = []

# Read and predict in chunks to prevent memory issues. The reader is used as a context
# manager (supported by pandas >= 1.2) so the file is closed even if a prediction fails.
with pd.read_csv(tourn_file, iterator=True, chunksize=100_000) as tourn_iter_csv:
    for chunk in tqdm(tourn_iter_csv):
        out = model.predict(chunk[FEAT_COLS])
        ids.extend(chunk["id"])
        preds.extend(out)

df_predictions = pd.DataFrame({
    'id': ids,
    'prediction': preds,
})
print(df_predictions.shape)

22it [02:17,  6.26s/it]


(2175811, 2)
time: 2min 18s (started: 2023-01-07 17:15:59 +00:00)


In [None]:
# Write the predictions to RESULTDIR; the file name encodes the model name and the
# round number reported by numerapi at the time this cell runs.
PRED_FILENAME = f"{RESULTDIR}/predictions_{MODEL_NAME}_{napi.get_current_round()}.csv"
# index=False: only the 'id' and 'prediction' columns are written
df_predictions.to_csv(PRED_FILENAME, index=False)

# Submission

In [None]:
# Upload the predictions CSV written above for the model identified by MODEL_ID.
# NOTE(review): PRED_FILENAME is assigned a string in the previous cell, so this check is
# always true once that cell has run.
if PRED_FILENAME is not None:
    napi.upload_predictions(PRED_FILENAME, model_id=MODEL_ID)