In [71]:
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import numerapi
import os
import warnings
warnings.filterwarnings("ignore")
from utils import (
    validation_metrics,
    EXAMPLE_PREDS_COL
)
import numpy as np
import scipy
import utils
import json
from lightgbm import LGBMRegressor
import gc
import json

In [2]:
# Numerai API credentials are read from environment variables; os.environ.get
# yields None (rather than raising) when a variable is not set.
public_id, secret_key = (
    os.environ.get(env_name)
    for env_name in ("NUMERAI_PUBLIC_KEY", "NUMERAI_SECRET_KEY")
)
napi = numerapi.NumerAPI(public_id, secret_key)

In [22]:
# The current round number is embedded in every file name that changes weekly.
current_round = napi.get_current_round()

# --- input files -----------------------------------------------------------
TRAINING_DATA_FILE = "data/training_data.parquet"
TOURNAMENT_DATA_FILE = "data/tournament_data_{}.parquet".format(current_round)
VALIDATION_DATA_FILE = "data/validation_data.parquet"
EXAMPLE_VALIDATION_PREDICTIONS_FILE = "data/example_validation_predictions.parquet"
FEATURES_FILE = "data/features.json"

# --- output files ----------------------------------------------------------
MODEL_NAME = "target_model"
TARGET_MODEL_FILE = "output/{}".format(MODEL_NAME)
VALIDATION_PREDICTIONS_FILE = "output/validation_predictions_{}.csv".format(current_round)
TOURNAMENT_PREDICTIONS_FILE = "output/tournament_predictions_{}.csv".format(current_round)

# --- column names used throughout the notebook -----------------------------
TARGET_COL = "target"
ERA_COL = "era"
DATA_TYPE_COL = "data_type"

In [4]:
# Tournament data changes every week, so its local file name carries the round number.
# Training and validation data only change periodically, so their local names are fixed.
print('Downloading dataset files...')
dataset_destinations = [
    ("numerai_training_data.parquet", TRAINING_DATA_FILE),
    ("numerai_tournament_data.parquet", TOURNAMENT_DATA_FILE),
    ("numerai_validation_data.parquet", VALIDATION_DATA_FILE),
    ("example_validation_predictions.parquet", EXAMPLE_VALIDATION_PREDICTIONS_FILE),
    ("features.json", FEATURES_FILE),
]
for remote_name, local_path in dataset_destinations:
    napi.download_dataset(remote_name, local_path)

Downloading dataset files...


2022-03-17 11:58:14,365 INFO numerapi.utils: target file already exists
2022-03-17 11:58:14,365 INFO numerapi.utils: download complete
2022-03-17 11:58:15,612 INFO numerapi.utils: target file already exists
2022-03-17 11:58:15,614 INFO numerapi.utils: download complete
2022-03-17 11:58:28,996 INFO numerapi.utils: target file already exists
2022-03-17 11:58:28,997 INFO numerapi.utils: download complete
2022-03-17 11:58:42,312 INFO numerapi.utils: target file already exists
2022-03-17 11:58:42,313 INFO numerapi.utils: download complete
2022-03-17 11:58:46,794 INFO numerapi.utils: target file already exists
2022-03-17 11:58:46,796 INFO numerapi.utils: download complete


In [5]:
# Bucket the training file's column names by prefix and show how many fall in each bucket.
all_columns = utils.get_all_columns(TRAINING_DATA_FILE)
features, targets, other_columns = [], [], []
for column_name in all_columns:
    if column_name.startswith("feature_"):
        features.append(column_name)
    elif column_name.startswith("target_"):
        targets.append(column_name)
    else:
        other_columns.append(column_name)

pd.DataFrame(
    {
        "features": [len(features)],
        "targets": [len(targets)],
        "other": [len(other_columns)],
        "stats": ["nb_cols"],
    }
).set_index("stats")

Unnamed: 0_level_0,features,targets,other
stats,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
nb_cols,1050,21,2


In [7]:
print('Reading minimal training data')
# read the feature metadata and get the "small" feature set
with open(FEATURES_FILE, "r") as f:
    feature_metadata = json.load(f)
# NOTE: this rebinds `features` from "every feature_ column" to the "small" set only
features = feature_metadata["feature_sets"]["small"]
# read in just those features along with the era, data type and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL] + targets
# `targets` only holds names that start with "target_", so the bare TARGET_COL is
# not guaranteed to be among them; later cells index the frames by TARGET_COL, so
# make sure it is actually loaded.
if TARGET_COL not in read_columns:
    read_columns.append(TARGET_COL)

Reading minimal training data


In [20]:
# If this read fails with an error about invalid magic parquet bytes, the downloaded
# file is probably corrupted: delete it and rerun napi.download_dataset to fetch it again.
training_data = pd.read_parquet(
    TRAINING_DATA_FILE,
    columns=read_columns,
)

In [38]:
def _feature_target_corrs(era_frame):
    """Correlate every selected feature with the target within one era's rows."""
    return era_frame[features].corrwith(era_frame[TARGET_COL])


# per-era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(_feature_target_corrs)


In [77]:
# find the riskiest features by comparing their correlation vs
# the target in each half of training data; we'll use these later
def get_biggest_change_features(corrs, n=None):
    """Return the features whose mean correlation moved most between the two halves.

    Args:
        corrs: DataFrame of correlations; its index labels are sorted and split
            into a first and a second half, its columns are the features.
        n: how many features to return; defaults to half the number of columns.

    Returns:
        List of column names, ordered from the largest absolute change in mean
        correlation (second half minus first half) downwards.
    """
    ordered_eras = corrs.index.sort_values()
    midpoint = len(ordered_eras) // 2
    top_k = len(corrs.columns) // 2 if n is None else n

    first_half_mean = corrs.loc[ordered_eras[:midpoint], :].mean()
    second_half_mean = corrs.loc[ordered_eras[midpoint:], :].mean()

    # only the size of the shift matters, not its direction
    shift_size = (second_half_mean - first_half_mean).abs()
    ranked = shift_size.sort_values(ascending=False)
    return ranked.head(top_k).index.tolist()

# ask for up to 50 features whose mean correlation changed most between the two halves
riskiest_features = get_biggest_change_features(corrs=all_feature_corrs, n=50)
# "garbage collection" (gc) gets rid of unused data and frees up memory
gc.collect()

0

In [78]:
# LightGBM hyperparameters for the target model
params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2 ** 5,
    colsample_bytree=0.1,
)
model = LGBMRegressor(**params)

# train on all of train, using every loaded column whose name contains "feature_";
# the fitted model only lives in memory (nothing is written to TARGET_MODEL_FILE here)
model.fit(
    training_data.filter(like='feature_', axis='columns'),
    training_data[TARGET_COL],
)
gc.collect()

48

In [79]:
validation_data = pd.read_parquet(VALIDATION_DATA_FILE, columns=read_columns)
tournament_data = pd.read_parquet(TOURNAMENT_DATA_FILE, columns=read_columns)

# count missing feature values among the live rows only; the mask is built once from
# DATA_TYPE_COL (instead of a hard-coded column name) and reused below
live_mask = tournament_data[DATA_TYPE_COL] == "live"
nans_per_col = tournament_data.loc[live_mask, features].isna().sum()
# check for nans and fill nans
if nans_per_col.any():
    total_rows = int(live_mask.sum())
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print("filling nans with 0.5")
    # the fill is applied to every row of tournament_data, not just the live ones
    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

No nans in the features this week!


In [80]:
# Compare the features the trained model expects with the ones currently selected, so a
# change in the available feature set is reported instead of going unnoticed.
model_expected_features = model.booster_.feature_name()
if set(features) != set(model_expected_features):
    print(f"New features are available! Might want to retrain model {MODEL_NAME}.")

# predict with exactly the columns the model was trained on, for both datasets
prediction_col = f"preds_{MODEL_NAME}"
for dataset in (validation_data, tournament_data):
    dataset.loc[:, prediction_col] = model.predict(
        dataset.loc[:, model_expected_features])

gc.collect()

24

In [72]:

def neutralize(df,
               columns,
               neutralizers=None,
               proportion=1.0,
               normalize=True,
               era_col="era"):
    """Neutralize score columns against exposure columns, one era at a time.

    For every distinct value of ``era_col`` the selected ``columns`` are optionally
    gaussianized (ordinal rank -> normal quantile). Then ``proportion`` times their
    least-squares projection onto ``neutralizers`` is subtracted, and the result is
    divided by its standard deviation (taken over all ``columns`` of that era at once).

    Args:
        df: DataFrame holding ``columns``, ``neutralizers`` and ``era_col``.
        columns: list of score column names to neutralize.
        neutralizers: list of column names to neutralize against (default: none).
        proportion: fraction of the projection to remove (1.0 removes all of it).
        normalize: gaussianize each score column within the era first.
        era_col: name of the column that identifies the era of each row.

    Returns:
        DataFrame with the same index as ``df`` and one column per entry of
        ``columns``. Each row holds the value computed from that same row of ``df``,
        whatever order the eras appear in. Rows whose era matches no era value
        (e.g. NaN) stay NaN. An empty ``df`` yields an empty frame.
    """
    # local import so the submodule is loaded even if only `import scipy` ran before
    from scipy import stats

    if neutralizers is None:
        neutralizers = []

    # Results are written back by row position, so every value lands on the row it was
    # computed from even when the eras are interleaved rather than contiguous.
    neutralized = np.full((len(df), len(columns)), np.nan, dtype=np.float64)
    era_values = df[era_col]
    for u in era_values.unique():
        in_era = (era_values == u).values
        if not in_era.any():
            # e.g. a NaN era label, which never compares equal to itself
            continue
        df_era = df[in_era]
        # float64 working copy: the in-place arithmetic below then works for any
        # numeric input dtype and never touches the caller's data
        scores = df_era[columns].values.astype(np.float64)
        if normalize:
            scores2 = []
            for x in scores.T:
                x = (stats.rankdata(x, method='ordinal') - .5) / len(x)
                x = stats.norm.ppf(x)
                scores2.append(x)
            scores = np.array(scores2).T
        exposures = df_era[neutralizers].values

        # remove the part of the scores that is linearly explained by the exposures
        scores -= proportion * exposures.dot(
            np.linalg.pinv(exposures.astype(np.float32)).dot(scores.astype(np.float32)))

        scores /= scores.std(ddof=0)

        neutralized[in_era] = scores

    return pd.DataFrame(neutralized,
                        columns=columns,
                        index=df.index)



# neutralize our predictions to the riskiest features, once per dataset
neutral_col = f"preds_{MODEL_NAME}_neutral_riskiest_50"
for frame in (validation_data, tournament_data):
    frame[neutral_col] = neutralize(
        df=frame,
        columns=[f"preds_{MODEL_NAME}"],
        neutralizers=riskiest_features,
        proportion=1.0,
        normalize=True,
        era_col=ERA_COL,
    )

In [73]:

model_to_submit = f"preds_{MODEL_NAME}_neutral_riskiest_50"

# expose the chosen model as "prediction", percentile-ranked so the values fall
# between 0 and 1 to meet upload requirements
for frame in (validation_data, tournament_data):
    frame["prediction"] = frame[model_to_submit].rank(pct=True)


In [74]:
# save predictions to csv; the destination folder is created first because to_csv
# fails when the parent directory does not exist
for predictions, csv_path in (
    (validation_data["prediction"], VALIDATION_PREDICTIONS_FILE),
    (tournament_data["prediction"], TOURNAMENT_PREDICTIONS_FILE),
):
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    predictions.to_csv(csv_path)

In [75]:
# attach the example predictions to the validation frame as an extra column
validation_preds = pd.read_parquet(EXAMPLE_VALIDATION_PREDICTIONS_FILE)
example_predictions = validation_preds["prediction"]
validation_data[EXAMPLE_PREDS_COL] = example_predictions

In [76]:
# get some stats about the model we submit, with the example predictions as reference...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(
    validation_data,
    [model_to_submit],
    example_col=EXAMPLE_PREDS_COL,
    fast_mode=True,
)
headline_stats = validation_stats[["mean", "sharpe"]]
print(headline_stats.to_markdown())

|                                        |      mean |   sharpe |
|:---------------------------------------|----------:|---------:|
| preds_target_model_neutral_riskiest_50 | 0.0205944 | 0.742502 |
