## Model tuning

In [None]:
# Step 1: Initialize NumerAPI and Set Data Version
from numerapi import NumerAPI
import pandas as pd
import numpy as np
import json
import lightgbm as lgb
import cloudpickle
import os
import shutil
from scipy.stats import spearmanr

# Client is created without any API keys; in the cells shown it is only
# used to download datasets.
napi = NumerAPI()
# Dataset release prefix: every download/read path below is built as
# f"{DATA_VERSION}/<file>".
DATA_VERSION = "v5.0"

In [None]:
# Step 2: Download Data
# Fetches the training set, the feature metadata and the validation set for
# DATA_VERSION. Later cells read these files back using the same relative
# paths (e.g. f"{DATA_VERSION}/train.parquet").
print("Downloading datasets...")
napi.download_dataset(f"{DATA_VERSION}/train.parquet")
napi.download_dataset(f"{DATA_VERSION}/features.json")
napi.download_dataset(f"{DATA_VERSION}/validation.parquet")

In [None]:
# Step 3: Load Data
print("Loading data...")
# Note: Requires 'pyarrow' installed in the .venv. Run `pip install pyarrow` if needed.
# Read the feature metadata through a context manager so the file handle is
# closed deterministically instead of being left to the garbage collector.
with open(f"{DATA_VERSION}/features.json", encoding="utf-8") as metadata_file:
    feature_metadata = json.load(metadata_file)
# Only the "medium" feature set is used for training and prediction.
features = feature_metadata["feature_sets"]["medium"]
# Load just the columns that are needed: era label, selected features, target.
train = pd.read_parquet(f"{DATA_VERSION}/train.parquet", columns=["era"] + features + ["target"])

validation = pd.read_parquet(f"{DATA_VERSION}/validation.parquet", columns=["era"] + features + ["target"])
validation = validation[validation["target"].notnull()]  # Filter rows with non-null targets

In [None]:
# Step 4: Downsample for Speed
print("Downsampling training data...")
# unique() lists the eras in order of first appearance and [::4] selects
# every 4th one of them (the 1st, 5th, 9th, ...); rows from all other eras
# are dropped, leaving roughly a quarter of the eras.
train = train[train["era"].isin(train["era"].unique()[::4])]  # Keep every 4th era

In [None]:
# Step 5: Train Model
# Fits a LightGBM regressor on the selected feature columns against "target".
# Many trees with a very small learning rate; each tree samples 10% of the
# feature columns (colsample_bytree=0.1).
# NOTE(review): a tree limited to max_depth=8 can have at most 2**8 = 256
# leaves, so num_leaves=2**10-1 (1023) is presumably never the binding
# limit here -- confirm whether max_depth=10 or num_leaves=2**8-1 was intended.
# NOTE(review): min_data_in_leaf is passed through as an extra keyword;
# it looks like LightGBM's alias for min_child_samples -- confirm.
print("Training model...")
model = lgb.LGBMRegressor(
    n_estimators=20000,
    learning_rate=0.001,
    max_depth=8,
    num_leaves=2**10-1,
    colsample_bytree=0.1,
    min_data_in_leaf=10000,
)
model.fit(
    train[features],
    train["target"]
)

In [None]:
# Step 6: Define reduce_mem_usage Function
def reduce_mem_usage(df):
    """Shrink 64-bit numeric columns of ``df`` to 32-bit where that is safe.

    ``float64`` columns are cast to ``float32`` (this reduces precision).
    ``int64`` columns are cast to ``int32`` only when every value fits in
    the int32 range; a column holding larger values is left as ``int64``
    rather than being silently wrapped around by the cast. Columns of any
    other dtype are left untouched.

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    int32_min = -(2**31)
    int32_max = 2**31 - 1
    for col in df.columns:
        dtype = df[col].dtype
        if dtype == "float64":
            df[col] = df[col].astype("float32")
        elif dtype == "int64":
            values = df[col]
            # min()/max() of an empty column are NaN, so treat "empty" as
            # trivially safe to narrow.
            if values.empty or (int32_min <= values.min() and values.max() <= int32_max):
                df[col] = values.astype("int32")
    return df

In [None]:
# Step 7: Load Live Data and Generate Predictions
try:
    print("Loading live data...")
    # The download cell only fetches train/features/validation, so the live
    # file has to be downloaded here before it can be read; without this the
    # read below fails with a missing-file error on a fresh checkout.
    napi.download_dataset(f"{DATA_VERSION}/live.parquet")
    # Load live data, selecting only feature columns
    live_data = pd.read_parquet(f"{DATA_VERSION}/live.parquet", columns=features)
    print(f"Live data columns: {list(live_data.columns)}")

    # Optimize memory usage
    live_data = reduce_mem_usage(live_data)

    # Generate predictions
    live_predictions = model.predict(live_data[features])
    # Create submission DataFrame with index as identifier
    submission = pd.Series(live_predictions, index=live_data.index).to_frame("prediction")
    print("Predictions generated.")
except Exception as e:
    # Surface a short message in the notebook output, then re-raise so the
    # cell still fails with the full traceback.
    print(f"Error loading live data or predicting: {e}")
    raise

In [None]:
# Step 8: Save Predict Function and Verify Model File
print("Saving predict function...")
def predict(live_features: pd.DataFrame, _live_benchmark_models: pd.DataFrame) -> pd.DataFrame:
    """Score ``live_features`` with the trained model.

    Only the columns listed in ``features`` are passed to the model; the
    second argument is accepted but not used. The result is a one-column
    frame named "prediction" that shares the index of ``live_features``.
    """
    scores = model.predict(live_features[features])
    return pd.Series(scores, index=live_features.index).to_frame("prediction")

# Serialize the predict function to disk with cloudpickle
model_file = "lgbm_model.pkl"
with open(model_file, "wb") as f:
    cloudpickle.dump(predict, f)

# Fail loudly if the pickle did not land on disk; otherwise report where it is
if not os.path.exists(model_file):
    raise FileNotFoundError(f"Failed to save '{model_file}' in {os.getcwd()}.")
print(f"Model file '{model_file}' saved successfully in {os.getcwd()}.")

In [None]:
# Step 9: Evaluate Model Locally
print("Evaluating model on validation data...")
def compute_metrics(validation_data, features, model):
    """Compute simplified local metrics: CORR, FNC, Sharpe, Feature Exposure.

    Args:
        validation_data: DataFrame with an "era" column, a "target" column
            and every column named in ``features``. A "prediction" column
            is added to (or overwritten on) this frame in place.
        features: List of feature column names fed to the model.
        model: Fitted estimator exposing ``predict``.

    Returns:
        dict with four float entries:
          * "CORR": mean of the per-era Spearman correlations between
            prediction and target.
          * "FNC": the same mean, computed on predictions neutralized
            against the features (one linear fit over all rows, not one
            per era -- a simplification).
          * "Sharpe": mean per-era CORR divided by its (population)
            standard deviation, or NaN when that deviation is zero.
          * "Feature Exposure": standard deviation of the Spearman
            correlations between the predictions and each feature,
            computed over all rows.
    """
    from sklearn.linear_model import LinearRegression

    # Generate predictions and store them on the caller's frame.
    validation_data["prediction"] = model.predict(validation_data[features])

    # Simplified FNC: Neutralize predictions against features
    def neutralize_series(series, features_data):
        """Return the residual of ``series`` after a linear fit on the features, scaled to unit std."""
        X = features_data[features].fillna(0)
        y = series.fillna(0)
        reg = LinearRegression().fit(X, y)
        residual = y - reg.predict(X)
        spread = residual.std()
        return residual / spread if spread > 0 else residual

    neutralized_preds = neutralize_series(validation_data["prediction"], validation_data)

    # Split the frame into eras once (in order of first appearance) rather
    # than re-scanning every row with a boolean mask for each era, twice.
    corrs = []
    fnc_corrs = []
    for _era, era_data in validation_data.groupby("era", sort=False, observed=True):
        corr, _ = spearmanr(era_data["prediction"], era_data["target"])
        corrs.append(corr)
        fnc_corr, _ = spearmanr(neutralized_preds[era_data.index], era_data["target"])
        fnc_corrs.append(fnc_corr)

    # CORR: Mean Spearman correlation per era
    mean_corr = np.mean(corrs)
    corr_std = np.std(corrs)
    sharpe = mean_corr / corr_std if corr_std > 0 else np.nan
    mean_fnc = np.mean(fnc_corrs)

    # Feature Exposure: Std of correlations between predictions and features
    feature_corrs = [spearmanr(validation_data["prediction"], validation_data[f])[0] for f in features]
    feature_exposure = np.std(feature_corrs)

    return {
        "CORR": mean_corr,
        "FNC": mean_fnc,
        "Sharpe": sharpe,
        "Feature Exposure": feature_exposure
    }

# Compute and display metrics
# compute_metrics also writes a "prediction" column onto `validation`;
# the CSV export below depends on that side effect.
metrics = compute_metrics(validation, features, model)
print("Local Validation Metrics:")
for metric, value in metrics.items():
    # Each metric is printed with four decimal places.
    print(f"{metric}: {value:.4f}")

# Optional: Save predictions for further analysis
# Writes the "prediction" column (with the frame's index) to a CSV file in
# the current working directory.
validation["prediction"].to_csv("validation_predictions.csv")
print(f"Validation predictions saved to 'validation_predictions.csv' in {os.getcwd()}.")