This notebook trains an ensemble model for use in the Jane Street Kaggle competition.

In [None]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor, RidgeCV

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import LinearSVR, SVR

from sklearn.multioutput import MultiOutputRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

import pickle

import os

import gc

import kaggle_evaluation.jane_street_inference_server

# Build two independent 10% random samples of training partitions 0-9:
# `training_data` and `training_data_2`.
#
# Each partition is read from disk and converted to pandas once, and both
# samples are drawn from that single in-memory frame, instead of reading and
# converting every partition a second time for the second sample. The draws
# are unseeded, so the sampled rows differ from run to run.
first_sample_frames = []
second_sample_frames = []

for filenumber in range(0, 10):
    partition_frame = pl.scan_parquet(f"/kaggle/input/jane-street-real-time-market-data-forecasting/train.parquet/partition_id={filenumber}/part-0.parquet").collect().to_pandas()
    first_sample_frames.append(partition_frame.sample(frac=0.1))
    second_sample_frames.append(partition_frame.sample(frac=0.1))
    # Release the full partition before the next one is loaded.
    del partition_frame

gc.collect()

training_data = pd.concat(first_sample_frames, ignore_index=True)
# The per-partition pieces are no longer needed once they have been merged.
del first_sample_frames

training_data_2 = pd.concat(second_sample_frames, ignore_index=True)
del second_sample_frames

gc.collect()

# Left bound to an empty list in case later code still refers to this name.
pandas_dataframes = []

# Names of the input columns: feature_00 through feature_78.
_feature_columns = [(f"feature_{rangeval:02}") for rangeval in range(0, 79)]


def _features_target_weight(frame):
    """Return the (features, target, weight) triple taken from *frame*.

    Features are the ``feature_00``..``feature_78`` columns with missing
    values replaced by 0, the target is the ``responder_6`` column and the
    weight is the ``weight`` column.
    """
    return frame[_feature_columns].fillna(0), frame["responder_6"], frame["weight"]


# 40,000-row subsample of the first data set.
training_data_small = training_data.sample(n=40000)

# Full first data set.
training_features, training_result, training_weight = _features_target_weight(training_data)

(
    training_features_small,
    training_result_small,
    training_weight_small,
) = _features_target_weight(training_data_small)

# 800,000-row subsample of the first data set.
training_data_medium = training_data.sample(n=800000)

(
    training_features_medium,
    training_result_medium,
    training_weight_medium,
) = _features_target_weight(training_data_medium)

# 2,000,000-row subsample of the second, independently drawn data set.
training_data_medlarge = training_data_2.sample(n=2000000)

(
    training_features_medlarge,
    training_result_medlarge,
    training_weight_medlarge,
) = _features_target_weight(training_data_medlarge)

gc.collect()

# Gradient boosting, fitted on the 800,000-row sample with per-row weights.
gb_model = GradientBoostingRegressor(verbose=2)
gb_model.fit(
    X=training_features_medium,
    y=training_result_medium,
    sample_weight=training_weight_medium,
)

gc.collect()

# Histogram-based gradient boosting, fitted on the full first data set.
hgb_model = HistGradientBoostingRegressor(verbose=1000)
hgb_model.fit(
    X=training_features,
    y=training_result,
    sample_weight=training_weight,
)

gc.collect()

# Random forest of 200 trees, fitted on the 40,000-row sample.
rf_model = RandomForestRegressor(n_estimators=200, verbose=100)
rf_model.fit(
    X=training_features_small,
    y=training_result_small,
    sample_weight=training_weight_small,
)

# Two multi-layer perceptrons, both fitted on the 800,000-row sample.
# No sample weights are passed to either fit.

# Single hidden layer of 54 units.
nn_model = MLPRegressor(
    hidden_layer_sizes=(54,),
    max_iter=350,
    verbose=True,
)
nn_model.fit(X=training_features_medium, y=training_result_medium)

# Two hidden layers (200 then 100 units) with an initial learning rate of 0.0005.
nn2_model = MLPRegressor(
    hidden_layer_sizes=(200, 100),
    learning_rate_init=0.0005,
    max_iter=500,
    verbose=True,
)
nn2_model.fit(X=training_features_medium, y=training_result_medium)

# Three weighted linear regressions on the full first data set. They differ
# only in the element-wise transform applied to the features beforehand.

# Raw features.
lr_model = LinearRegression()
lr_model.fit(
    X=training_features,
    y=training_result,
    sample_weight=training_weight,
)

# Signed square: sign(x) * x**2, so the sign of each value is kept.
training_features_squared = np.sign(training_features) * np.square(training_features)
qr_model = LinearRegression()
qr_model.fit(
    X=training_features_squared,
    y=training_result,
    sample_weight=training_weight,
)

# Signed square root: sign(x) * sqrt(|x|).
training_features_sqrt = np.sign(training_features) * np.sqrt(np.abs(training_features))
sqrtr_model = LinearRegression()
sqrtr_model.fit(
    X=training_features_sqrt,
    y=training_result,
    sample_weight=training_weight,
)

# Base models, listed in the same order as the exponents below.
models = [
    gb_model,
    hgb_model,
    rf_model,
    nn_model,
    nn2_model,
    lr_model,
    qr_model,
    sqrtr_model,
]
# Exponent of the signed power transform, sign(x) * |x|**p, applied to the
# features before each model predicts. 1.0 leaves the values unchanged.
powers_by_model = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 0.5]

# One single-column frame of predictions per base model, all made on the
# 2,000,000-row sample.
model_ensemble_outputs = []
for model, power in zip(models, powers_by_model):
    current_training_features = np.power(np.abs(training_features_medlarge), power) * np.sign(training_features_medlarge)
    predictions = model.predict(current_training_features)
    model_ensemble_outputs.append(pd.DataFrame(predictions))

# Place the per-model prediction columns side by side.
model_ensemble_output_merged = pd.concat(model_ensemble_outputs, axis=1)

# Meta-model: a RidgeCV regression fitted on the base-model predictions.
final_model = RidgeCV()
final_model.fit(
    X=model_ensemble_output_merged,
    y=training_result_medlarge,
    sample_weight=training_weight_medlarge,
)

# Weighted score on the same rows the meta-model was fitted on.
final_model_score = final_model.score(
    X=model_ensemble_output_merged,
    y=training_result_medlarge,
    sample_weight=training_weight_medlarge,
)
print(final_model_score)

# Save the base models, their feature exponents and the meta-model together.
ensemble_payload = [models, powers_by_model, final_model]
with open("/kaggle/working/finalmodelpickle4.pkl", "wb") as pklfile:
    pickle.dump(ensemble_payload, pklfile)