In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn import set_config, get_config
from sklearn.ensemble import StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline

from model.core import MODEL_DIR, load_Xy
from model.elasticnet import similar_transformer
from model.xgb import categorical_features

In [15]:
# Global scikit-learn setting: transformers return pandas DataFrames from
# transform / fit_transform instead of plain arrays.
set_config(transform_output='pandas')

In [8]:
# Feature frame and regression target supplied by the project loader.
X, y = load_Xy()

# Deserialize the saved estimators in one pass; the order of the names on the
# left matches the order of the files on the right.
# (Two further XGBoost variants, xgb_model2 / xgb_model3, are currently disabled.)
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
xgb_model, svr_model, rbf_model, ridge_model, elasticnet_model = (
    joblib.load(MODEL_DIR / relative_path)
    for relative_path in (
        './models/xgb_model.joblib',
        './models/linear_svr_model.joblib',
        './models/rbf_model.joblib',
        './models/ridge_model.joblib',
        './models/elasticnet_model.joblib',
    )
)

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...), ...]"
,remainder,'drop'
,sparse_threshold,0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'MISSING_VALUE'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [9]:
# Stack the five loaded base learners. No final_estimator is passed, so
# scikit-learn's default meta-learner is used; cv=5 controls the internal
# folds used to produce the base learners' predictions for the meta-learner.
# (The xgb2 / xgb3 variants are currently left out of the stack.)
model = StackingRegressor(
    cv=5,
    verbose=1,
    estimators=[
        ('ridge', ridge_model),
        ('elasticnet', elasticnet_model),
        ('xgb', xgb_model),
        ('svr', svr_model),
        ('rbf', rbf_model),
    ],
)

# Score every base learner and then the stack with the same protocol:
# 5-fold cross-validation, negated mean squared error (closer to 0 is better).
# The generator is consumed left to right, so the estimators are evaluated in
# the order listed.
ridge_cv, elasticnet_cv, xgb_cv, svr_cv, rbf_cv, stack_cv = (
    cross_validate(estimator, X, y, scoring='neg_mean_squared_error', cv=5)
    for estimator in (
        ridge_model,
        elasticnet_model,
        xgb_model,
        svr_model,
        rbf_model,
        model,
    )
)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   25.8s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   25.7s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s finished
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed

In [10]:
# Display the ridge model's cross_validate result: per-fold fit/score times
# and negated-MSE test scores.
ridge_cv

{'fit_time': array([0.04700041, 0.05851293, 0.0470047 , 0.04668069, 0.04856992]),
 'score_time': array([0.0130024 , 0.01299739, 0.01351047, 0.01400113, 0.01399922]),
 'test_score': array([-25567.28133596, -34916.24473728, -25462.45550807, -42515.00537346,
        -40826.99614764])}

In [11]:
# Display the elastic-net model's cross_validate result: per-fold fit/score
# times and negated-MSE test scores.
elasticnet_cv

{'fit_time': array([0.03700018, 0.03752112, 0.03899884, 0.03851104, 0.03699756]),
 'score_time': array([0.0130024 , 0.01500106, 0.01299763, 0.01299977, 0.01300645]),
 'test_score': array([-24989.48268867, -35005.38675827, -32409.13641316, -39937.76270537,
        -41830.3930961 ])}

In [24]:
# Ad-hoc inspection: replace missing entries with 0.0 and show the frame as a
# NumPy array (to_numpy() is the accessor pandas recommends over .values).
X.fillna(0.).to_numpy()

array([[0.0, 0.0, 0.0, ...,
        '[358, 843, 1068, 1078, 1079, 1103, 1185, 2180, 3349, 9061]',
        1.0, 968.9387124290148],
       ['2D Platformer', 'M2', 0.0, ...,
        '[43367, 51485, 55190, 56033, 57187, 81183, 87728, 110503, 117311, 200989]',
        1.0, 1432.7693318282927],
       ['Metroidvania', 'Nintendo', 0.0, ...,
        '[1025, 1026, 1070, 1101, 1741, 3222, 18182, 26226, 26820, 103329]',
        1.0, 1803.6924564563933],
       ...,
       [0.0, 0.0, 96.94656488549616, ...,
        '[96217, 106987, 22387, 115653, 81249, 103303, 115280, 55038, 113360, 27092]',
        1.0, 1771.662373388982],
       ['Rhythm', 'Drool', 88.46153846153845, ...,
        '[56033, 36198, 31194, 3222, 114455, 18869, 55173, 111130, 19301, 19150]',
        1.0, 1333.6010630510505],
       ['Action RPG', 'Round 8 Studio of NEOWIZ', 100.0, ...,
        '[81249, 19164, 103303, 102584, 111130, 18225, 26574, 96217, 116530, 106987]',
        1.0, 1606.7849028955127]], shape=(322, 40), dtype=obj

In [12]:
# Display the XGBoost model's cross_validate result: per-fold fit/score times
# and negated-MSE test scores.
xgb_cv

{'fit_time': array([6.07340574, 5.90862703, 5.93781948, 5.80294514, 5.89161968]),
 'score_time': array([0.03900242, 0.03600097, 0.03610611, 0.03699946, 0.03499985]),
 'test_score': array([-24004.42171164, -39197.51186638, -31425.8145868 , -47013.72935369,
        -62315.04163012])}

In [13]:
# Display the stacked ensemble's cross_validate result: per-fold fit/score
# times and negated-MSE test scores.
stack_cv

{'fit_time': array([32.99667525, 33.14212537, 33.04780626, 32.7185595 , 32.79580045]),
 'score_time': array([0.09951186, 0.09849286, 0.10150719, 0.09851217, 0.10051489]),
 'test_score': array([-22215.51419516, -32231.9103747 , -23678.90263743, -40579.18441449,
        -38031.27550616])}

In [10]:
# Global scikit-learn setting: switch transformer output back to the library
# default container (undoes an earlier transform_output='pandas').
set_config(transform_output='default')

In [15]:
from pytabkit import Ensemble_HPO_Regressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from model.core import (
    PandasDictVectorizer,
    PandasCountVectorizer,
    CategoricalEncoder,
    SummarizeSimilar,
    LIST_FEATURES,
    NUMERICAL_FEATURES,
    CATEGORICAL_FEATURES,
    DICT_FEATURES,
    MODEL_DIR,
    load_Xy,
    SIMILAR_FEATURES,
)

# Group the feature names by the preprocessing they need. Each group is a
# copy of the project-level constant, so edits made in this notebook do not
# touch the originals.
# NOTE: `categorical_features` rebinds the name imported from model.xgb in the
# notebook's first import cell.
(
    numerical_features,
    categorical_features,
    dict_features,
    list_features,
    similar_features,
) = (
    NUMERICAL_FEATURES.copy(),
    CATEGORICAL_FEATURES.copy(),
    DICT_FEATURES.copy(),
    LIST_FEATURES.copy(),
    SIMILAR_FEATURES.copy(),
)


In [3]:
# Preprocessing definitions.

# One count vectorizer per list-valued column; the step is named after the
# column it consumes.
list_transformers = [
    (column, PandasCountVectorizer(input='content'), column)
    for column in list_features
]

# One dense dict vectorizer per dict-valued column, named the same way.
dict_transformers = [
    (column, PandasDictVectorizer(sparse=False), column)
    for column in dict_features
]

# Categories are encoded; unseen categories are mapped to NaN. No imputation
# step follows, so those NaNs are passed on as they are.
categorical_transformer = Pipeline(steps=[
    ('encode', CategoricalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan)),
])

# Summarize the "similar" columns, then impute with SimpleImputer's defaults.
# NOTE: this rebinds the `similar_transformer` name imported from
# model.elasticnet in the notebook's first import cell.
similar_transformer = Pipeline(steps=[
    ('summarize', SummarizeSimilar()),
    ('impute', SimpleImputer()),
])

nonlist_transformers = [
    ('categorical', categorical_transformer, categorical_features),
    ('numerical', SimpleImputer(), numerical_features),
    ('similar', similar_transformer, similar_features),
]


# Combine every branch; sparse_threshold=0 forces a dense result.
column_transformer = ColumnTransformer(
    transformers=[*nonlist_transformers, *list_transformers, *dict_transformers],
    sparse_threshold=0,
)

In [4]:
# `preprocessor` is the same ColumnTransformer object (set_output returns the
# estimator it was called on), now configured to emit pandas DataFrames.
preprocessor = column_transformer.set_output(transform='pandas')
preprocessor

0,1,2
,transformers,"[('categorical', ...), ('numerical', ...), ...]"
,remainder,'drop'
,sparse_threshold,0
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'
,ngram_range,"(1, ...)"

0,1,2
,dtype,<class 'numpy.float64'>
,separator,'='
,sparse,False
,sort,True


In [5]:
import warnings

# Suppress every FutureWarning for the rest of the session to keep the
# training logs readable.
warnings.simplefilter('ignore', FutureWarning)

In [20]:
# pytabkit HPO ensemble running on the first CUDA device.
# val_metric_name is not passed, so the library default applies.
# NOTE: this rebinds `model`, which held the StackingRegressor earlier in the
# notebook.
model = Ensemble_HPO_Regressor(
    device='cuda:0',
    n_cv=5,
    n_hpo_steps=50,
    use_tabarena_spaces=True,
    use_full_caruana_ensembling=True,
)

# Preprocess with the ColumnTransformer, then fit the ensemble on its output.
pipe = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', model),
])

In [21]:
# 5-fold cross-validation of the pytabkit pipeline, scored with negated MSE.
# error_score='raise' makes a failing fold raise instead of being scored.
# `params` routes cat_col_names to the 'model' step; the names carry the
# 'categorical__' prefix of the ColumnTransformer branch that emits them
# (presumably because of verbose feature names; confirm against the
# preprocessor's output columns).
ptk_cv = cross_validate(
    pipe,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
    error_score='raise',
    params={
        'model__cat_col_names': [
            'categorical__genre_metacritic',
            'categorical__developer_metacritic',
        ],
    },
)

self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.006259412766996183), 'feature_fraction': 0.6878428925584417, 'bagging_fraction': 0.7441573724081981, 'bagging_freq': 1, 'num_leaves': np.float64(3.0), 'min_data_in_leaf': np.float64(5.0), 'extra_trees': np.True_, 'min_data_per_group': np.float64(93.0), 'cat_l2': np.float64(0.01787306292976289), 'cat_smooth': np.float64(0.05146331293850836), 'max_cat_to_onehot': np.float64(9.0), 'lambda_l1': np.float64(0.0029349163644412707), 'lambda_l2': np.float64(0.0012074471066395999)}
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.011183442040921405), 'feature_fraction': 0.9844105840958441, 'bagging_fraction': 0.7455927191168633, 'bagging_freq': 1, 'num_leaves': np.float64(48.0), 'min_data_in_leaf': 

  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_epochs=256` reached.
  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_epochs=256` reached.
  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)

self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.03853163688510364), 'feature_fraction': 0.8442636606476621, 'bagging_fraction': 0.8588386477127903, 'bagging_freq': 1, 'num_leaves': np.float64(25.0), 'min_data_in_leaf': np.float64(7.0), 'extra_trees': np.True_, 'min_data_per_group': np.float64(92.0), 'cat_l2': np.float64(0.010660512743739792), 'cat_smooth': np.float64(6.639928021347322), 'max_cat_to_onehot': np.float64(13.0), 'lambda_l1': np.float64(0.0003158051146006117), 'lambda_l2': np.float64(0.00031256557847496747)}
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.0437748117509887), 'feature_fraction': 0.8791186111557266, 'bagging_fraction': 0.7597890902354373, 'bagging_freq': 1, 'num_leaves': np.float64(5.0), 'min_data_in_leaf': np

  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_epochs=256` reached.
  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_epochs=256` reached.
  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
`Trainer.fit` stopped: `max_epochs=512` reached.
  sub_x_cat = x_cat[[slice(None)] * (x_cat.dim() - 1) + [slice(i, i + 1)]]
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores

self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.009329162239751363), 'feature_fraction': 0.9623533624888353, 'bagging_fraction': 0.7545425002434324, 'bagging_freq': 1, 'num_leaves': np.float64(8.0), 'min_data_in_leaf': np.float64(11.0), 'extra_trees': np.False_, 'min_data_per_group': np.float64(2.0), 'cat_l2': np.float64(1.4130118687566935), 'cat_smooth': np.float64(0.0041451506238024795), 'max_cat_to_onehot': np.float64(9.0), 'lambda_l1': np.float64(2.1050973776048983e-05), 'lambda_l2': np.float64(0.0004770588844473105)}
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
params["n_estimators"]=10000
self.fit_params[0]={'early_stopping_rounds': 300, 'n_estimators': 10000, 'learning_rate': np.float64(0.007052766285036304), 'feature_fraction': 0.5630757826089058, 'bagging_fraction': 0.9610168331018621, 'bagging_freq': 1, 'num_leaves': np.float64(84.0), 'min_data_in_leaf

KeyboardInterrupt: 

In [35]:
# Display the pytabkit pipeline's cross_validate result: per-fold fit/score
# times and negated-MSE test scores.
ptk_cv

{'fit_time': array([164.49928069, 171.4677515 , 173.02885723, 162.18768382,
        166.05649424]),
 'score_time': array([0.02450991, 0.02599907, 0.03000116, 0.03100276, 0.03651214]),
 'test_score': array([-24110.61659285, -38046.71304304, -27556.09545123, -58761.20199099,
        -57711.39999812])}