In [2]:
from dotenv import load_dotenv
load_dotenv()
import pandas as pd
import numerapi
import os
import warnings
warnings.filterwarnings("ignore")
from utils import (
    save_model,
    load_model,
    neutralize,
    get_biggest_change_features,
    validation_metrics,
    ERA_COL,
    DATA_TYPE_COL,
    TARGET_COL,
    EXAMPLE_PREDS_COL
)
import json
from lightgbm import LGBMRegressor
import gc
import json
from halo import Halo



In [3]:
public_id = os.environ.get("NUMERAI_PUBLIC_KEY")
secret_key = os.environ.get("NUMERAI_SECRET_KEY")
napi = numerapi.NumerAPI(public_id, secret_key)



In [29]:
current_round = napi.get_current_round()
TRAINING_DATA_FILE = "data/training_data.parquet"
TOURNAMENT_DATA_FILE = f"data/tournament_data_{current_round}.parquet"
VALIDATION_DATA_FILE = "data/validation_data.parquet"
EXAMPLE_VALIDATION_PREDICTIONS_FILE = "data/example_validation_predictions.parquet"
FEATURES_FILE = "data/features.json"

MODEL_NAME = "target_model"
TARGET_MODEL_FILE = f"output/{MODEL_NAME}"
VALIDATION_PREDICTIONS_FILE = f"output/validation_predictions_{current_round}.csv"
TOURNAMENT_PREDICTIONS_FILE = f"output/tournament_predictions_{current_round}.csv"



In [5]:
# Tournament data changes every week so we specify the round in their name. Training
# and validation data only change periodically, so no need to download them every time.
print('Downloading dataset files...')
napi.download_dataset("numerai_training_data.parquet", TRAINING_DATA_FILE)
napi.download_dataset("numerai_tournament_data.parquet", TOURNAMENT_DATA_FILE)
napi.download_dataset("numerai_validation_data.parquet", VALIDATION_DATA_FILE)
napi.download_dataset("example_validation_predictions.parquet", EXAMPLE_VALIDATION_PREDICTIONS_FILE)
napi.download_dataset("features.json", FEATURES_FILE)

Downloading dataset files...


2022-03-14 13:57:45,180 INFO numerapi.utils: target file already exists
2022-03-14 13:57:45,181 INFO numerapi.utils: download complete
2022-03-14 13:57:46,378 INFO numerapi.utils: target file already exists
2022-03-14 13:57:46,379 INFO numerapi.utils: download complete
2022-03-14 13:57:59,777 INFO numerapi.utils: target file already exists
2022-03-14 13:57:59,779 INFO numerapi.utils: download complete
2022-03-14 13:58:13,209 INFO numerapi.utils: target file already exists
2022-03-14 13:58:13,211 INFO numerapi.utils: download complete
2022-03-14 13:58:18,343 INFO numerapi.utils: target file already exists
2022-03-14 13:58:18,345 INFO numerapi.utils: download complete




In [6]:
print('Reading minimal training data')
# read the feature metadata and get the "small" feature set
with open("features.json", "r") as f:
    feature_metadata = json.load(f)
features = feature_metadata["feature_sets"]["small"]
# read in just those features along with era and target columns
read_columns = features + [ERA_COL, DATA_TYPE_COL, TARGET_COL]

Reading minimal training data


In [7]:
# note: sometimes when trying to read the downloaded data you get an error about invalid magic parquet bytes...
# if so, delete the file and rerun the napi.download_dataset to fix the corrupted file
training_data = pd.read_parquet(TRAINING_DATA_FILE, columns=read_columns)

Unnamed: 0_level_0,feature_agile_unrespited_gaucho,feature_antichristian_slangiest_idyllist,feature_apomictical_motorized_vaporisation,feature_assenting_darn_arthropod,feature_beery_somatologic_elimination,feature_bhutan_imagism_dolerite,feature_branched_dilatory_sunbelt,feature_buxom_curtained_sienna,feature_cambial_bigoted_bacterioid,feature_canalicular_peeling_lilienthal,...,feature_undivorced_unsatisfying_praetorium,feature_unforbidden_highbrow_kafir,feature_univalve_abdicant_distrail,feature_unsealed_suffixal_babar,feature_unvaried_social_bangkok,feature_unwonted_trusted_fixative,feature_winsome_irreproachable_milkfish,era,data_type,target_nomi_20
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n003bba8a98662e4,0.75,1.00,0.50,0.25,1.00,0.00,0.50,0.50,0.00,0.75,...,0.50,0.00,0.00,0.50,0.25,1.00,0.75,0001,train,0.25
n003bee128c2fcfc,0.50,0.25,0.50,0.75,0.50,0.25,0.75,0.75,0.25,0.25,...,0.75,0.25,0.75,0.50,1.00,0.25,0.25,0001,train,0.75
n0048ac83aff7194,0.50,0.75,1.00,0.75,0.75,1.00,0.75,0.75,0.75,0.75,...,1.00,1.00,0.00,0.75,1.00,0.75,0.25,0001,train,0.50
n00691bec80d3e02,0.50,0.25,0.50,0.50,0.00,0.75,0.00,0.00,0.50,0.50,...,0.00,0.75,0.00,0.25,0.00,0.00,0.00,0001,train,0.75
n00b8720a2fdc4f2,0.00,1.00,0.25,1.00,0.00,0.00,0.25,0.25,0.75,0.25,...,0.50,0.00,0.00,0.00,0.50,0.50,0.00,0001,train,0.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nffcc1dbdf2212e6,1.00,1.00,1.00,0.50,1.00,1.00,1.00,1.00,0.50,1.00,...,0.75,1.00,0.50,0.75,0.75,0.75,0.25,0574,train,0.75
nffd71b7f6a128df,0.00,0.50,0.25,0.25,1.00,0.50,0.00,0.25,0.75,0.00,...,0.00,1.00,0.00,0.00,0.00,0.00,1.00,0574,train,0.00
nffde3b371d67394,0.75,1.00,1.00,1.00,0.00,1.00,1.00,1.00,0.50,1.00,...,0.00,0.25,0.50,0.75,0.00,0.75,0.50,0574,train,0.25
nfff1a1111b35e84,0.50,0.00,0.00,1.00,0.50,0.25,1.00,1.00,0.25,0.00,...,0.25,0.00,0.25,0.50,0.25,1.00,0.25,0574,train,0.50




In [10]:
# getting the per era correlation of each feature vs the target
all_feature_corrs = training_data.groupby(ERA_COL).apply(
    lambda era: era[features].corrwith(era[TARGET_COL])
)




In [14]:
# find the riskiest features by comparing their correlation vs
# the target in each half of training data; we'll use these later
riskiest_features = get_biggest_change_features(all_feature_corrs, 50)
# "garbage collection" (gc) gets rid of unused data and frees up memory
gc.collect()



In [19]:
params = {"n_estimators": 2000,
            "learning_rate": 0.01,
            "max_depth": 5,
            "num_leaves": 2 ** 5,
            "colsample_bytree": 0.1}

# spinner = Halo(text='', spinner='dots')
model = LGBMRegressor(**params)

# train on all of train and save the model so we don't have to train next time
# spinner.start('Training model')
model.fit(training_data.filter(like='feature_', axis='columns'),
            training_data[TARGET_COL])
# print(f"saving new model: {TARGET_MODEL_FILE}")
# save_model(model, TARGET_MODEL_FILE)
# spinner.succeed()
gc.collect()

LGBMRegressor(colsample_bytree=0.1, learning_rate=0.01, max_depth=5,
              n_estimators=2000, num_leaves=32)



In [21]:
validation_data = pd.read_parquet(VALIDATION_DATA_FILE, columns=read_columns)
tournament_data = pd.read_parquet(TOURNAMENT_DATA_FILE, columns=read_columns)
nans_per_col = tournament_data[tournament_data["data_type"] == "live"].isna().sum()

# check for nans and fill nans
if nans_per_col.any():
    total_rows = len(tournament_data[tournament_data["data_type"] == "live"])
    print(f"Number of nans per column this week: {nans_per_col[nans_per_col > 0]}")
    print(f"out of {total_rows} total rows")
    print(f"filling nans with 0.5")
    tournament_data.loc[:, features] = tournament_data.loc[:, features].fillna(0.5)
else:
    print("No nans in the features this week!")

Number of nans per column this week: target_nomi_20    5307
dtype: int64
out of 5307 total rows
filling nans with 0.5


In [23]:
# double check the feature that the model expects vs what is available to prevent our
# pipeline from failing if Numerai adds more data and we don't have time to retrain!
model_expected_features = model.booster_.feature_name()
if set(model_expected_features) != set(features):
    print(f"New features are available! Might want to retrain model {MODEL_NAME}.")
validation_data.loc[:, f"preds_{MODEL_NAME}"] = model.predict(
    validation_data.loc[:, model_expected_features])
tournament_data.loc[:, f"preds_{MODEL_NAME}"] = model.predict(
    tournament_data.loc[:, model_expected_features])

gc.collect()

31



In [24]:
# neutralize our predictions to the riskiest features
validation_data[f"preds_{MODEL_NAME}_neutral_riskiest_50"] = neutralize(
    df=validation_data,
    columns=[f"preds_{MODEL_NAME}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)

tournament_data[f"preds_{MODEL_NAME}_neutral_riskiest_50"] = neutralize(
    df=tournament_data,
    columns=[f"preds_{MODEL_NAME}"],
    neutralizers=riskiest_features,
    proportion=1.0,
    normalize=True,
    era_col=ERA_COL
)



In [26]:

model_to_submit = f"preds_{MODEL_NAME}_neutral_riskiest_50"

# rename best model to "prediction" and rank from 0 to 1 to meet upload requirements
validation_data["prediction"] = validation_data[model_to_submit].rank(pct=True)
tournament_data["prediction"] = tournament_data[model_to_submit].rank(pct=True)




In [35]:
# save predictions to csv
validation_data["prediction"].to_csv(VALIDATION_PREDICTIONS_FILE)
tournament_data["prediction"].to_csv(TOURNAMENT_PREDICTIONS_FILE)



In [31]:
validation_preds = pd.read_parquet(EXAMPLE_VALIDATION_PREDICTIONS_FILE)
validation_data[EXAMPLE_PREDS_COL] = validation_preds["prediction"]



In [32]:
# get some stats about each of our models to compare...
# fast_mode=True so that we skip some of the stats that are slower to calculate
validation_stats = validation_metrics(validation_data, [model_to_submit], example_col=EXAMPLE_PREDS_COL, fast_mode=True)
print(validation_stats[["mean", "sharpe"]].to_markdown())

|                                             |     mean |   sharpe |
|:--------------------------------------------|---------:|---------:|
| preds_data/target_model_neutral_riskiest_50 | 0.021721 |  1.16183 |


In [33]:
EXAMPLE_PREDS_COL

'example_preds'

