In [1]:
import copy

import numpy as np
import polars as pl
import xgboost
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler

# Load the breast-cancer dataset as pandas objects, then convert both the
# feature frame and the target series to polars.
X_pd, y_pd = load_breast_cancer(return_X_y=True, as_frame=True)
X = pl.from_pandas(X_pd)
y = pl.from_pandas(y_pd)
y

target
i64
0
0
0
0
0
0
0
0
0
0


In [2]:
# Standardise the features; the scaler is fitted on a plain numpy matrix, so
# the column names are attached to it by hand afterwards.
X_np = X.to_numpy()
prep = StandardScaler().fit(X_np)
_X = prep.transform(X_np)
prep.feature_names_in_ = np.array(X.columns, dtype=object)
_X

array([[ 1.09706398, -2.07333501,  1.26993369, ...,  2.29607613,
         2.75062224,  1.93701461],
       [ 1.82982061, -0.35363241,  1.68595471, ...,  1.0870843 ,
        -0.24388967,  0.28118999],
       [ 1.57988811,  0.45618695,  1.56650313, ...,  1.95500035,
         1.152255  ,  0.20139121],
       ...,
       [ 0.70228425,  2.0455738 ,  0.67267578, ...,  0.41406869,
        -1.10454895, -0.31840916],
       [ 1.83834103,  2.33645719,  1.98252415, ...,  2.28998549,
         1.91908301,  2.21963528],
       [-1.80840125,  1.22179204, -1.81438851, ..., -1.74506282,
        -0.04813821, -0.75120669]])

In [3]:
# Small boosted-tree classifier trained on the scaled matrix.
model = xgboost.XGBClassifier(
    n_estimators=10,
    max_depth=4,
    scale_pos_weight=10.,
)
model.fit(_X, y.to_numpy())
# The model was fitted on a bare numpy array, so copy the column names from
# the scaler onto the underlying booster.
model.get_booster().feature_names = prep.feature_names_in_.tolist()

In [4]:
# Class probabilities from the in-memory model on the scaled matrix; later
# cells compare the reloaded and compacted models against this reference.
y_proba_orig = model.predict_proba(_X)
y_proba_orig

array([[0.9639573 , 0.03604269],
       [0.96466434, 0.03533568],
       [0.9794746 , 0.02052541],
       ...,
       [0.976814  , 0.02318602],
       [0.9794746 , 0.02052541],
       [0.02328539, 0.9767146 ]], dtype=float32)

In [5]:
# Persist the reference probabilities as a plain .npy file (pickling disabled).
np.save("y_proba_orig.npy", y_proba_orig, allow_pickle=False)

In [6]:
import json

# Write the fitted model before reading it back: with the save step disabled
# this cell fails on a fresh working directory, or silently loads a stale
# xgb.json produced by a different training run.
model.save_model("xgb.json")
with open("xgb.json", "r", encoding="utf-8") as fp:
    model_json = json.load(fp)
model_json

{'learner': {'attributes': {'best_iteration': '9',
   'best_ntree_limit': '10',
   'scikit_learn': '{"use_label_encoder": null, "n_estimators": 10, "objective": "binary:logistic", "max_depth": 4, "max_leaves": null, "max_bin": null, "grow_policy": null, "learning_rate": null, "verbosity": null, "booster": null, "tree_method": null, "gamma": null, "min_child_weight": null, "max_delta_step": null, "subsample": null, "sampling_method": null, "colsample_bytree": null, "colsample_bylevel": null, "colsample_bynode": null, "reg_alpha": null, "reg_lambda": null, "scale_pos_weight": 10.0, "base_score": null, "missing": NaN, "num_parallel_tree": null, "random_state": null, "n_jobs": null, "monotone_constraints": null, "interaction_constraints": null, "importance_type": null, "gpu_id": null, "validate_parameters": null, "predictor": null, "enable_categorical": false, "feature_types": null, "max_cat_to_onehot": null, "max_cat_threshold": null, "eval_metric": null, "early_stopping_rounds": null, "c

In [24]:
# Sanity check: every feature with non-zero importance must be used by at
# least one split. The split features are derived here directly from the model
# JSON so the cell does not depend on a variable defined in a later cell.
imp_features = set(
    pl.DataFrame({"feature": model.feature_names_in_, "imp": model.feature_importances_})
    .filter(pl.col("imp") > 0.)["feature"]
    .to_list()
)
split_features = {
    X.columns[feat]
    for tree in model_json["learner"]["gradient_booster"]["model"]["trees"]
    for feat, left in zip(tree["split_indices"], tree["left_children"])
    # Leaf nodes (left child == -1) store a placeholder split index of 0.
    if left != -1
}
imp_features.difference(split_features)

set()

In [7]:
# Round-trip check: rebuild a classifier from the saved JSON file and predict
# on the same scaled matrix as the original model.
model_from_json = xgboost.XGBClassifier()
model_from_json.load_model("xgb.json")
y_proba_from_json = model_from_json.predict_proba(_X)
y_proba_from_json

array([[0.9639573 , 0.03604269],
       [0.96466434, 0.03533568],
       [0.9794746 , 0.02052541],
       ...,
       [0.976814  , 0.02318602],
       [0.9794746 , 0.02052541],
       [0.02328539, 0.9767146 ]], dtype=float32)

In [12]:
# Number of probability entries that differ between the original and the
# reloaded model (0 means an exact round trip).
np.not_equal(y_proba_orig, y_proba_from_json).sum()

0

In [22]:
# List the features actually used by a split. The indices are collected here
# from the model JSON so the cell does not rely on a variable that is only
# defined in a later cell.
split_feature_ids = sorted({
    feat
    for tree in model_json["learner"]["gradient_booster"]["model"]["trees"]
    for feat, left in zip(tree["split_indices"], tree["left_children"])
    # Leaf nodes (left child == -1) store a placeholder split index of 0.
    if left != -1
})
print(*(X.columns[i] for i in split_feature_ids), sep="\n")

mean radius
mean texture
mean compactness
mean concavity
mean concave points
mean symmetry
radius error
perimeter error
area error
smoothness error
fractal dimension error
worst radius
worst texture
worst perimeter
worst area
worst smoothness
worst compactness
worst concave points
worst symmetry
worst fractal dimension


In [20]:
j_trees = model_json["learner"]["gradient_booster"]["model"]["trees"]

# Per-node arrays of a tree in the XGBoost JSON model, in display order.
NODE_COLUMNS = (
    "left_children",
    "right_children",
    "parents",
    "split_indices",
    "split_conditions",
    "default_left",
    "split_type",
    "base_weights",
    "loss_changes",
    "sum_hessian",
)
# Leaf nodes have no children; their left-child entry is -1.
LEAF_CHILD = -1

# Indices of every feature used by at least one split across all trees.
df_split_indices = set()
for i, tree in enumerate(j_trees):
    print(f"{'=' * 8}{i}{'=' * 8}")
    nodes = pl.DataFrame({key: tree[key] for key in NODE_COLUMNS})
    with pl.Config(tbl_rows=30, tbl_cols=10):
        print(nodes)
    # Only internal nodes reference a real feature. Leaves carry a placeholder
    # split index of 0, which would otherwise always mark feature 0 as used.
    df_split_indices.update(
        feat
        for feat, left in zip(tree["split_indices"], tree["left_children"])
        if left != LEAF_CHILD
    )
len(df_split_indices)

shape: (13, 10)
┌─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬─────────┬────────┐
│ left_ch ┆ right_c ┆ parents ┆ split_i ┆ split_c ┆ default ┆ split_t ┆ base_we ┆ loss_ch ┆ sum_he │
│ ildren  ┆ hildren ┆ ---     ┆ ndices  ┆ onditio ┆ _left   ┆ ype     ┆ ights   ┆ anges   ┆ ssian  │
│ ---     ┆ ---     ┆ i64     ┆ ---     ┆ ns      ┆ ---     ┆ ---     ┆ ---     ┆ ---     ┆ ---    │
│ i64     ┆ i64     ┆         ┆ i64     ┆ ---     ┆ i64     ┆ i64     ┆ f64     ┆ f64     ┆ f64    │
│         ┆         ┆         ┆         ┆ f64     ┆         ┆         ┆         ┆         ┆        │
╞═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪═════════╪════════╡
│ 1       ┆ 2       ┆ 2147483 ┆ 22      ┆ 0.30348 ┆ 1       ┆ 0       ┆ 1.77390 ┆ 540.525 ┆ 945.5  │
│         ┆         ┆ 647     ┆         ┆ 2       ┆         ┆         ┆ 38      ┆ 9       ┆        │
│ 3       ┆ 4       ┆ 0       ┆ 27      ┆ 0.93634 ┆ 1       ┆ 0       ┆ 1.9

20

In [8]:
def _compact_booster_json(booster_json: dict, keep_names: set) -> tuple[dict, dict[int, int]]:
    """Return a copy of an XGBoost JSON model restricted to ``keep_names``.

    Parameters
    ----------
    booster_json
        Parsed XGBoost model JSON. It is not modified.
    keep_names
        Names of the features to keep; the original relative order is preserved.

    Returns
    -------
    (compact_json, old_to_new)
        The rewritten model and the mapping from original feature index to the
        index in the compact model.

    Raises
    ------
    ValueError
        If a split node references a feature that is not in ``keep_names``.
    """
    # Work on a copy so re-running the cell always starts from the full model.
    compact = copy.deepcopy(booster_json)
    learner = compact["learner"]

    names = learner["feature_names"]
    kept = [i for i, name in enumerate(names) if name in keep_names]
    old_to_new = {old: new for new, old in enumerate(kept)}

    learner["feature_names"] = [names[i] for i in kept]
    # feature_types, when present, must stay aligned with feature_names.
    feature_types = learner.get("feature_types")
    if feature_types:
        learner["feature_types"] = [feature_types[i] for i in kept]

    n_kept = f"{len(kept)}"
    for tree in learner["gradient_booster"]["model"]["trees"]:
        remapped = []
        for feat, left in zip(tree["split_indices"], tree["left_children"]):
            if left == -1:
                # Leaf node: the stored split index is a placeholder (0), not a
                # real feature, so it must not be looked up in the mapping.
                remapped.append(0)
            elif feat in old_to_new:
                remapped.append(old_to_new[feat])
            else:
                raise ValueError(
                    f"split node uses feature {names[feat]!r} which is not in the kept set"
                )
        tree["split_indices"] = remapped
        tree["tree_param"]["num_feature"] = n_kept
    learner["learner_model_param"]["num_feature"] = n_kept
    return compact, old_to_new


# Keep only the features that contribute to the model (non-zero importance).
cols_imp = set(
    pl.DataFrame({"feature": model.feature_names_in_, "imp": model.feature_importances_})
    .filter(pl.col("imp") > 0.)["feature"]
    .to_list()
)
# map_reindex: original feature index -> index in the compact model.
model_compact_json, map_reindex = _compact_booster_json(model_json, cols_imp)
with open("xgb_compact.json", "w") as fp:
    json.dump(model_compact_json, fp, separators=(",", ":"))

In [9]:
# Mapping built by the compaction cell: original feature index -> index of
# that feature in the compact model.
map_reindex

{0: 0,
 1: 1,
 5: 2,
 6: 3,
 7: 4,
 8: 5,
 10: 6,
 12: 7,
 13: 8,
 14: 9,
 19: 10,
 20: 11,
 21: 12,
 22: 13,
 23: 14,
 24: 15,
 25: 16,
 27: 17,
 28: 18,
 29: 19}

In [40]:
# Feature names stored in the compact model.
# NOTE(review): `model_compact` is only created by a cell further down in the
# notebook, so this cell fails on Restart & Run All unless it is moved below it.
model_compact.feature_names_in_

array(['mean radius', 'mean texture', 'mean compactness',
       'mean concavity', 'mean concave points', 'mean symmetry',
       'radius error', 'perimeter error', 'area error',
       'smoothness error', 'fractal dimension error', 'worst radius',
       'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [10]:
# Load the compacted model and predict using only the columns it expects.
model_compact = xgboost.XGBClassifier()
model_compact.load_model("xgb_compact.json")

# Select the columns in the order the compact model lists its features, rather
# than in the order they appear in X. A missing column raises KeyError here
# instead of being silently dropped.
column_position = {name: pos for pos, name in enumerate(X.columns)}
index_ = [column_position[name] for name in model_compact.feature_names_in_.tolist()]
y_compact = model_compact.predict_proba(_X[:, index_])
y_compact

array([[0.9639573 , 0.03604269],
       [0.96466434, 0.03533568],
       [0.9794746 , 0.02052541],
       ...,
       [0.976814  , 0.02318602],
       [0.9794746 , 0.02052541],
       [0.02328539, 0.9767146 ]], dtype=float32)

In [43]:
# Number of probability entries that differ between the original and the
# compact model (0 means the compaction preserved every prediction).
np.not_equal(y_proba_orig, y_compact).sum()

0