This notebook runs the model that was trained in Training.ipynb, allowing it to be submitted to the Kaggle competition.

In [None]:
import numpy as np
import pandas as pd
import polars as pl

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor, RidgeCV

from sklearn.tree import DecisionTreeRegressor

from sklearn.svm import LinearSVR, SVR

from sklearn.multioutput import MultiOutputRegressor

from sklearn.neighbors import KNeighborsRegressor

from sklearn.ensemble import RandomForestRegressor, StackingRegressor, GradientBoostingRegressor

from sklearn.neural_network import MLPRegressor

import pickle

import time

import os

import kaggle_evaluation.jane_street_inference_server

# Load the serialized model bundle from the attached Kaggle dataset.
# NOTE(review): unpickling runs arbitrary code from the file, so this path
# must only ever point at a trusted artifact.
model_data = []

with open(
    '/kaggle/input/contest-model-2/scikitlearn/default/1/finalmodelpickle4.pkl',
    'rb',
) as pickle_file:
    model_data = pickle.load(pickle_file)

# The bundle is indexed positionally:
#   [0] the base models, [1] the power paired with each base model,
#   [2] the model that combines the base models' outputs.
models, powers_by_model, final_model = model_data[0], model_data[1], model_data[2]

# Turn off the `verbose` attribute on the base models that carry one
# (presumably to keep per-batch inference output quiet -- confirm against
# the training notebook which estimators sit at these positions).
models[0].verbose = 0
models[2].verbose = 0
models[3].verbose = False

def predict(test: pl.DataFrame, lags: pl.DataFrame | None) -> pl.DataFrame | pd.DataFrame:
    """Predict ``responder_6`` for one batch of test rows with the stacked ensemble.

    Each base model in the module-level ``models`` list is evaluated on the
    feature columns ``feature_00`` .. ``feature_78`` (missing values replaced
    by 0).  When the matching entry of ``powers_by_model`` is not 1.0, the
    features are first passed through a sign-preserving power transform,
    ``sign(x) * |x| ** power``.  The base-model outputs are concatenated
    column-wise and fed to ``final_model``, whose output becomes
    ``responder_6``.

    Args:
        test: Batch of rows to score; must contain ``row_id`` and the
            ``feature_00`` .. ``feature_78`` columns.
        lags: Accepted because the inference server passes it; not used.

    Returns:
        A polars DataFrame with exactly the columns ``row_id`` and
        ``responder_6`` and one row per row of ``test``.

    Raises:
        AssertionError: If the assembled output does not have the expected
            columns or row count.
    """
    test_pandas = test.to_pandas()

    feature_columns = [f"feature_{index:02}" for index in range(79)]
    features = test_pandas[feature_columns].fillna(0)

    # Computed once and shared by every model that needs a non-unit power.
    features_abs = np.abs(features)
    features_sign = np.sign(features)

    row_ids = test_pandas[["row_id"]]

    base_outputs = []
    for i, model in enumerate(models):
        power = powers_by_model[i]
        if power == 1.0:
            # Power 1.0 leaves the features unchanged, so skip the transform.
            model_input = features
        else:
            model_input = np.power(features_abs, power) * features_sign
        base_outputs.append(pd.DataFrame(model.predict(model_input)))

    # One column block per base model, in model order, as input to the final model.
    stacked_inputs = pd.concat(base_outputs, axis=1)

    final_output = pd.DataFrame(final_model.predict(stacked_inputs))
    final_output.columns = ["responder_6"]

    predictions = pl.from_pandas(pd.concat([row_ids, final_output], axis=1))

    # Sanity checks on the frame handed back to the inference server.
    assert predictions.columns == ['row_id', 'responder_6']
    assert len(predictions) == len(test)
    return predictions

# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.jane_street_inference_server.JSInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # KAGGLE_IS_COMPETITION_RERUN is unset or empty: run against the local
    # test and lags parquet files through the local gateway.
    inference_server.run_local_gateway(
        (
            '/kaggle/input/jane-street-real-time-market-data-forecasting/test.parquet',
            '/kaggle/input/jane-street-real-time-market-data-forecasting/lags.parquet',
        )
    )
else:
    # KAGGLE_IS_COMPETITION_RERUN is set: serve predictions instead.
    inference_server.serve()