## Model tuning

In [9]:
# Step 1: Initialize NumerAPI and Set Data Version
from numerapi import NumerAPI
import pandas as pd
import json
import lightgbm as lgb
import cloudpickle
import pyarrow
import os
import shutil

# Dataset release to use; the file paths in this notebook are built from this prefix.
DATA_VERSION = "v5.0"
# API client used for the dataset downloads (constructed with no arguments).
napi = NumerAPI()

In [10]:
# Step 2: Download Data
print("Downloading datasets...")
# One loop instead of three copy-pasted calls. Using a `for` statement also
# stops the cell from echoing the last returned path as its output.
for dataset_name in ("train.parquet", "features.json", "validation.parquet"):
    napi.download_dataset(f"{DATA_VERSION}/{dataset_name}")

Downloading datasets...


2025-09-12 17:05:34,872 INFO numerapi.utils: target file already exists
2025-09-12 17:05:34,873 INFO numerapi.utils: download complete
2025-09-12 17:05:36,263 INFO numerapi.utils: target file already exists
2025-09-12 17:05:36,264 INFO numerapi.utils: download complete
2025-09-12 17:05:37,697 INFO numerapi.utils: target file already exists
2025-09-12 17:05:37,698 INFO numerapi.utils: download complete


'v5.0/validation.parquet'

In [None]:
# Step 3: Load Data
print("Loading data...")
# Read the feature metadata inside a context manager so the file handle is
# closed immediately instead of being left open for the garbage collector.
# The encoding is pinned so the read does not depend on the platform default.
with open(f"{DATA_VERSION}/features.json", encoding="utf-8") as metadata_file:
    feature_metadata = json.load(metadata_file)
features = feature_metadata["feature_sets"]["small"]  # Use "small" feature set
# Load only the columns that are needed: era, the selected features and the target.
train = pd.read_parquet(f"{DATA_VERSION}/train.parquet", columns=["era"] + features + ["target"])


Loading data...


KeyError: 'data_type'

In [None]:
# Step 4: Downsample for Speed
print("Downsampling training data...")
# Keep every 4th distinct era (positions 0, 4, 8, ... in order of first
# appearance) and drop the rows of all other eras.
# `train` is overwritten, so re-running this cell thins the data again.
eras_to_keep = train["era"].unique()[::4]
train = train[train["era"].isin(eras_to_keep)]

Downsampling training data...


In [None]:
# Step 5: Train Model
print("Training model...")
# Hyperparameters gathered in one mapping so they can be read at a glance.
lgbm_params = {
    "n_estimators": 2000,
    "learning_rate": 0.01,
    "max_depth": 5,
    "num_leaves": 2**5 - 1,
    "colsample_bytree": 0.1,
}
model = lgb.LGBMRegressor(**lgbm_params)
# Fit on the selected feature columns against the "target" column. The call is
# kept as the cell's last expression so the fitted estimator is displayed.
model.fit(train[features], train["target"])

Training model...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 210
[LightGBM] [Info] Number of data points in the train set: 688184, number of used features: 42
[LightGBM] [Info] Start training from score 0.500008


0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,5
,learning_rate,0.01
,n_estimators,2000
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Step 6: Define reduce_mem_usage Function
def reduce_mem_usage(df):
    """Shrink a DataFrame's memory footprint by narrowing 64-bit numeric columns.

    float64 columns are converted to float32. int64 columns are converted to
    int32 only when every value fits in the int32 range; otherwise the column
    is left as int64, because a blind ``astype("int32")`` would silently wrap
    out-of-range values. Columns of any other dtype are not touched.

    Args:
        df: DataFrame to optimize. It is modified in place.

    Returns:
        The same DataFrame object that was passed in, for call chaining.
    """
    int32_min, int32_max = -(2**31), 2**31 - 1
    for col in df.columns:
        col_dtype = df[col].dtype
        if col_dtype == "float64":
            df[col] = df[col].astype("float32")
        elif col_dtype == "int64":
            values = df[col]
            # An empty column has no values that could overflow, so it is safe
            # to narrow; otherwise both extremes must fit in int32.
            if values.empty or (values.min() >= int32_min and values.max() <= int32_max):
                df[col] = values.astype("int32")
    return df

In [None]:
# Step 7: Load Live Data and Generate Predictions
try:
    print("Loading live data...")
    # Download the live file before reading it, so this cell does not depend
    # on a copy left on disk by an earlier session.
    # NOTE(review): the download logs in this notebook show numerapi reusing a
    # target file that already exists, so a live.parquet from an earlier round
    # may need to be deleted first to get current data - confirm.
    napi.download_dataset(f"{DATA_VERSION}/live.parquet")

    # Load live data, selecting only feature columns
    live_data = pd.read_parquet(f"{DATA_VERSION}/live.parquet", columns=features)
    print(f"Live data columns: {list(live_data.columns)}")

    # Optimize memory usage
    live_data = reduce_mem_usage(live_data)

    # Generate predictions
    live_predictions = model.predict(live_data[features])
    # Create submission DataFrame with index as identifier
    submission = pd.Series(live_predictions, index=live_data.index).to_frame("prediction")
    print("Predictions generated.")
except Exception as e:
    # Report the failure in the cell output, then re-raise so the notebook
    # stops here instead of continuing without predictions.
    print(f"Error loading live data or predicting: {e}")
    raise

Loading live data...
Live data columns: ['feature_antistrophic_striate_conscriptionist', 'feature_bicameral_showery_wallaba', 'feature_bridal_fingered_pensioner', 'feature_collectivist_flaxen_gueux', 'feature_concurring_fabled_adapter', 'feature_crosscut_whilom_ataxy', 'feature_departmental_inimitable_sentencer', 'feature_dialectal_homely_cambodia', 'feature_donnard_groutier_twinkle', 'feature_elusive_vapoury_accomplice', 'feature_geminate_crummiest_scourer', 'feature_glandered_unimproved_peafowl', 'feature_hempen_unionist_cone', 'feature_illuminated_gambrel_noria', 'feature_jacobinical_symmetric_roll', 'feature_jewish_stained_disembowelment', 'feature_lacklustre_centroidal_schweitzer', 'feature_limiest_heliolithic_york', 'feature_maledictive_latter_psellism', 'feature_mendelian_undiscording_avion', 'feature_musicianly_aspirate_creativity', 'feature_petty_upraised_caddice', 'feature_pottier_unmanly_collyrium', 'feature_reclaimed_insurrectional_moneyer', 'feature_saddening_unsound_rustl

In [None]:
# Step 8: Save Predict Function and Verify Model File
print("Saving predict function...")
def predict(live_features: pd.DataFrame, _live_benchmark_models: pd.DataFrame) -> pd.DataFrame:
    """Score live rows with the trained model.

    Args:
        live_features: Frame containing at least the columns listed in the
            module-level `features`; its index is reused for the result.
        _live_benchmark_models: Accepted but not used by this function.

    Returns:
        A single-column DataFrame named "prediction", indexed like
        `live_features`.
    """
    scores = model.predict(live_features[features])
    prediction_series = pd.Series(scores, index=live_features.index, name="prediction")
    return prediction_series.to_frame()

# Pickle predict function
model_file = "first_tune.pkl"
with open(model_file, "wb") as pickle_out:
    cloudpickle.dump(predict, pickle_out)

# Verify the model file exists: fail loudly if it is missing, otherwise report
# where it was written.
if not os.path.exists(model_file):
    raise FileNotFoundError(f"Failed to save '{model_file}' in {os.getcwd()}.")
print(f"Model file '{model_file}' saved successfully in {os.getcwd()}.")

# Optional: Copy the model file to a specific directory (uncomment and modify as needed)
# target_dir = "/path/to/your/directory"  # Specify your desired directory
# os.makedirs(target_dir, exist_ok=True)
# shutil.copy(model_file, os.path.join(target_dir, model_file))
# print(f"Model file copied to {target_dir}/{model_file}.")

Saving predict function...
Model file 'first_tune.pkl' saved successfully in e:\CODES\Numerai.
