In [1]:
from numerapi import NumerAPI
import dotenv, os, json, pandas as pd, lightgbm as lgb

# Pull the variables defined in the local .env file into the process environment
dotenv.load_dotenv("./.env")

# Numerai API key pair; each value is None when its variable is not set
P_ID = os.environ.get("NUMERAI_PUBLIC_ID")
S_ID = os.environ.get("NUMERAI_SECRET_ID")

In [2]:
# Initialize numerapi with the key pair read from the environment
napi = NumerAPI(public_id = P_ID, secret_key = S_ID)

all_datasets = napi.list_datasets()

# Set data version to one of the latest datasets
DATA_VERSION = "v5.0"

# Download the feature metadata file (skipped when a local copy already exists)
if not os.path.exists(f"{DATA_VERSION}/features.json"):
    napi.download_dataset(f"{DATA_VERSION}/features.json")

# Read the metadata (names of all the feature sets and targets).
# The context manager closes the file handle as soon as parsing is done.
with open(f"{DATA_VERSION}/features.json") as metadata_file:
    feature_metadata = json.load(metadata_file)

# Show how many entries each top-level metadata section holds
for metadata in feature_metadata:
  print(metadata, len(feature_metadata[metadata]))

# Show the size of the feature sets of interest
feature_sets = feature_metadata["feature_sets"]
for feature_set in ["small", "medium", "all"]:
  print(feature_set, len(feature_sets[feature_set]))

# Only work with the medium feature set
medium_feature_set = feature_sets["medium"]

feature_sets 17
targets 37
small 42
medium 705
all 2376


In [3]:
# Fetch the training parquet unless a local copy is already present
train_path = f"{DATA_VERSION}/train.parquet"
if not os.path.exists(train_path):
    napi.download_dataset(train_path)

# Read just the era, the target and the "medium" feature columns
train_columns = ["era", "target"] + medium_feature_set
train_feature_set = pd.read_parquet(train_path, columns=train_columns)

# Keep every 4th era only, to reduce memory usage and speed up model training
sampled_train_eras = train_feature_set["era"].unique()[::4]
train_feature_set_reduced = train_feature_set[
    train_feature_set["era"].isin(sampled_train_eras)
]

In [None]:
# Hyper-parameters for the gradient-boosted regressor; tuning guide:
# https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html
lgbm_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2**5 - 1,  # 31 leaves
    "colsample_bytree": 0.1,
}
model = lgb.LGBMRegressor(**lgbm_params)

# Fit on the downsampled training eras: medium feature columns -> target
model.fit(
    train_feature_set_reduced[medium_feature_set],
    train_feature_set_reduced["target"],
)

In [None]:
# Download validation data - this will take a few minutes (skipped when already on disk)
if not os.path.exists(f"{DATA_VERSION}/validation.parquet"):
    napi.download_dataset(f"{DATA_VERSION}/validation.parquet")

# Load the validation data with the same "medium" feature columns the model was fitted on
validation = pd.read_parquet(
    f"{DATA_VERSION}/validation.parquet",
    columns=["era", "data_type", "target"] + medium_feature_set
)
# Keep only rows with data_type == "validation"; the column is not needed afterwards
validation = validation[validation["data_type"] == "validation"].drop(columns="data_type")

# Downsample to every 4th era to reduce memory usage and speedup evaluation (suggested for Colab free tier)
# Comment out the line below to use all the data (slower and higher memory usage, but more accurate evaluation)
validation = validation[validation["era"].isin(validation["era"].unique()[::4])]

# Eras are 1 week apart, but targets look 20 days (or 4 weeks/eras) into the future,
# so we "embargo" the eras around the end of training to avoid "data leakage".
# The list below holds the last era the model was trained on plus the next three.
last_train_era = int(train_feature_set_reduced["era"].unique()[-1])
eras_to_embargo = [str(last_train_era + offset).zfill(4) for offset in range(4)]
# .copy() gives an independent frame, so adding the prediction column below
# does not write into a slice of the filtered data (SettingWithCopyWarning)
validation = validation[~validation["era"].isin(eras_to_embargo)].copy()

# Generate predictions against the out-of-sample validation features
# This will take a few minutes 🍵
validation["prediction"] = model.predict(validation[medium_feature_set])
validation[["era", "prediction", "target"]]