In [1]:
from lightgbm import LGBMRegressor
from numerapi import NumerAPI
import numpy as np
import pandas as pd
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment.
load_dotenv()

# Root directory that every data path in this notebook is joined onto.
link_to_window_path = "/mnt/f/Python_Projects/Numerai/Training/"

# Show how a relative name resolves against that root.
os.path.join(link_to_window_path, 'XXXX')

'/mnt/f/Python_Projects/Numerai/Training/XXXX'

In [2]:
# Build the Numerai API client with credentials read from environment variables.
napi = NumerAPI(
    public_id=os.environ["NUMERAI_PUBLIC_ID"],
    secret_key=os.environ["NUMERAI_SECRET_KEY"],
)


In [2]:
# Raise pandas' display limits so wide/long frames are shown with less truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [4]:
import json
# Load the v4.1 feature metadata. The context manager closes the file handle
# as soon as the JSON has been parsed (the previous bare open() never closed it).
feature_location = os.path.join(link_to_window_path, "Data/v4.1/features.json")
with open(feature_location, encoding="utf-8") as features_json:
    features_dict = json.load(features_json)

# Named feature subsets published in the metadata file.
feature_sets = features_dict['feature_sets']
small_features = feature_sets['small']
medium_features = feature_sets['medium']
v2_features = feature_sets['v2_equivalent_features']
v3_features = feature_sets['v3_equivalent_features']
fncv3_features = feature_sets['fncv3_features']

# Every feature that has an entry under "feature_stats", plus the target names.
v4_features = list(features_dict['feature_stats'])
target_list = features_dict['targets']

# Columns to read from the parquet files: identifiers, all targets and the
# medium feature set. dict.fromkeys() removes duplicates while keeping the
# first-seen order, so the column order is identical on every run
# (list(set(...)) ordering changes with string hash randomisation).
core_columns = ["id", "era", "data_type"] + target_list
cols_to_open = list(dict.fromkeys(core_columns + medium_features))
len(cols_to_open)

681

In [5]:
# Read only the selected columns of the int8 training parquet file.
training_path = os.path.join(
    link_to_window_path,
    "Data/v4.1/numerai_training_data_Apr2023_int8.parquet",
)
training_data = pd.read_parquet(training_path, columns=cols_to_open)

In [6]:
# Display the (rows, columns) shape of the loaded training frame.
training_data.shape

(2420521, 680)

In [7]:
# Keep every column whose name carries the "feature_" prefix, then show the count.
feature_cols = [
    column_name
    for column_name in training_data.columns
    if column_name.startswith("feature_")
]
len(feature_cols)

641

In [8]:
# Speed-oriented settings. A better-but-slower configuration would be closer to
# n_estimators=20000, learning_rate=0.001, max_depth=6, num_leaves=2**6,
# colsample_bytree=0.1, but training is slow enough as it is.
model_params = dict(
    n_estimators=2000,
    learning_rate=0.01,
    max_depth=5,
    num_leaves=2 ** 5,
    colsample_bytree=0.1,
)

In [9]:
# Instantiate the regressor from model_params; despite its name it is not fitted until .fit() runs in a later cell.
trained_model=LGBMRegressor(**model_params)

In [10]:
# Replace missing feature values with that feature's median (computed while
# skipping NaNs), then cast the feature columns to int8.
training_data[feature_cols] = (
    training_data[feature_cols]
    .fillna(training_data[feature_cols].median(skipna=True))
    .astype("int8")
)

In [11]:
# Fit the regressor on the feature columns against the target_cyrus_v4_20 target.
trained_model.fit(
    X=training_data[feature_cols],
    y=training_data['target_cyrus_v4_20'],
)

In [12]:
import joblib
# Persist the fitted estimator to disk ...
joblib.dump(value=trained_model, filename='example_lgbm_target_cyrus.pkl')
# ... and read it straight back into a second variable.
gbm_pickle = joblib.load(filename='example_lgbm_target_cyrus.pkl')

In [28]:
# Display the estimator's repr as the cell output.
trained_model

In [15]:
# Predict with the model reloaded from disk on the same rows it was fitted on, so these are in-sample predictions.
preds=gbm_pickle.predict(training_data[feature_cols])

In [16]:
def neutralize(predictions, features, proportion=1.0):
    """Remove the part of ``predictions`` that is linearly explained by ``features``.

    With feature matrix ``F`` and predictions ``p`` the result is
    ``p - proportion * F @ (pinv(F) @ p)``, i.e. ``p`` minus (a fraction of)
    its least-squares projection onto the column space of ``F``.

    Parameters
    ----------
    predictions : array-like or pandas.Series
        One value per row of ``features``.
    features : pandas.DataFrame or array-like
        2-D feature matrix of shape (n_rows, n_features). A DataFrame is
        read through ``.values``; anything else is converted with
        ``np.asarray``, so plain NumPy arrays and nested lists work too.
    proportion : float, default 1.0
        Fraction of the feature exposure to subtract: 1.0 removes it fully,
        0.0 returns the predictions unchanged.

    Returns
    -------
    The result of ``predictions - exposure``; its type follows from that
    subtraction (a pandas Series stays a Series with its index).
    """
    # Extract the raw matrix once; it is needed for both the pseudo-inverse
    # and the projection.
    if isinstance(features, pd.DataFrame):
        feature_matrix = features.values
    else:
        feature_matrix = np.asarray(features)

    # Moore-Penrose pseudo-inverse; rcond cuts off tiny singular values.
    inverse_features = np.linalg.pinv(feature_matrix, rcond=1e-6)

    # pinv(F) @ p are the least-squares coefficients; F @ coefficients is the
    # component of the predictions explained by the features.
    exposure = proportion * feature_matrix.dot(inverse_features.dot(predictions))
    return predictions - exposure