# XGBoost Leaf Weights

Author: https://www.github.com/deburky

In [None]:
import re

import numpy as np
import pandas as pd
import xgboost as xgb

In [None]:
# -----------------------------
# 1. Toy dataset
# -----------------------------
# A single integer feature taking the values 0..5 (one column, six rows),
# paired with a binary target per row.
y = np.array([0, 0, 1, 1, 0, 1])
X = np.arange(6).reshape(-1, 1)
dtrain = xgb.DMatrix(data=X, label=y)

# -----------------------------
# 2. Train one round of XGBoost
# -----------------------------
# Regularisation is switched off (lambda=0, gamma=0, min_child_weight=0) and
# eta=1, so the single depth-1 tree is meant to expose the plain Newton-step
# leaf weights that the manual computation further down reproduces.
params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 1,
    "eta": 1.0,
    "lambda": 0.0,
    "min_child_weight": 0,
    "gamma": 0.0,
    # Pin the initial prediction to p = 0.5, i.e. raw score 0. The manual
    # gradient/hessian section assumes exactly this starting point; leaving
    # base_score unset lets XGBoost pick it (newer releases estimate it from
    # the labels), which would only match by coincidence of this dataset.
    "base_score": 0.5,
}

bst = xgb.train(params, dtrain, num_boost_round=1)

# -----------------------------
# 3. Inspect the tree dump
# -----------------------------
dump = bst.get_dump(dump_format="text")
print("=== XGBoost Tree Dump ===")
print(dump[0])

# Extract the first split rule in the dump (the root), e.g. "0:[f0<1.5]".
# The threshold pattern accepts an optional sign and exponent as well
# ("f0<-0.5", "f0<1.5e-05"); a digits-and-dots-only pattern would fail to
# match such splits and wrongly report that the tree has no split.
m = re.search(
    r"\[f(\d+)<([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)\]",
    dump[0],
)
if not m:
    raise ValueError("No split found in tree!")

feature_idx = int(m[1])
threshold = float(m[2])
print(f"\nParsed split: feature f{feature_idx} < {threshold}")
# --------------------------
# 4. Manual gradient & hessian
# --------------------------
# Before the first tree every sample sits at raw score 0, so the sigmoid
# gives the same probability for all rows.
y_pred = np.zeros(y.shape, dtype=float)
p = 1.0 / (np.exp(-y_pred) + 1.0)

# First and second derivative of the logistic loss w.r.t. the raw score
g = p - y
h = p * (1.0 - p)

# Boolean masks for the two children of the split parsed from the dump
left_idx = X[:, feature_idx] < threshold
right_idx = np.logical_not(left_idx)


def node_stats(idx, grad=None, hess=None):
    """Sum the gradients and hessians of the samples that fall into one node.

    Parameters
    ----------
    idx : boolean mask or integer index array
        Selects the samples that belong to the node.
    grad, hess : array-like, optional
        Per-sample gradients and hessians. When omitted, the notebook-level
        arrays ``g`` and ``h`` are used, so ``node_stats(idx)`` keeps working
        exactly as before.

    Returns
    -------
    tuple
        ``(G, H)``: the summed gradient and the summed hessian of the node.
    """
    # Fall back to the notebook globals only when no explicit arrays are given,
    # so the helper can also be used (and checked) without hidden state.
    grad = g if grad is None else np.asarray(grad)
    hess = h if hess is None else np.asarray(hess)
    return grad[idx].sum(), hess[idx].sum()


# Aggregate gradient (G) and hessian (H) on each side of the split
G_L, H_L = node_stats(left_idx)
G_R, H_R = node_stats(right_idx)

print("\nManual node stats:")
print(f"Left  -> G={G_L:.3f}, H={H_L:.3f}")
print(f"Right -> G={G_R:.3f}, H={H_R:.3f}")

# --------------------------
# 5. Manual leaf weights
# --------------------------
# Leaf weight from the summed statistics: w = -G / (H + lambda)
lam = params["lambda"]
w_L = -G_L / (H_L + lam)
w_R = -G_R / (H_R + lam)

print("\nManual leaf weights:")
print(f"Left leaf weight: {w_L:.6f}")
print(f"Right leaf weight: {w_R:.6f}")

# Verify against the booster instead of comparing the printouts by eye.
# The root line of the dump names its children ("yes=1,no=2"); "yes" is the
# branch taken when the split condition holds, i.e. the left partition.
# Leaf values in the dump are expected to be the weights scaled by eta.
children = re.search(r"yes=(\d+),no=(\d+)", dump[0])
leaf_values = {
    int(node): float(value)
    for node, value in re.findall(r"(\d+):leaf=([^\s,]+)", dump[0])
}
if children is not None:
    yes_id, no_id = int(children[1]), int(children[2])
    # Only comparable when both children of the root are leaves (depth-1 tree).
    if yes_id in leaf_values and no_id in leaf_values:
        np.testing.assert_allclose(
            [leaf_values[yes_id], leaf_values[no_id]],
            [params["eta"] * w_L, params["eta"] * w_R],
            rtol=1e-4,
            err_msg="Manual leaf weights do not match the XGBoost tree dump",
        )

=== XGBoost Tree Dump ===
0:[f0<2] yes=1,no=2,missing=2
	1:leaf=-2
	2:leaf=1


Parsed split: feature f0 < 2.0

Manual node stats:
Left  -> G=1.000, H=0.500
Right -> G=-1.000, H=1.000

Manual leaf weights:
Left leaf weight: -2.000000
Right leaf weight: 1.000000


In [None]:
# Cross-tabulate the labels on each side of the tree's split.
# The tree dump reports the rule as "f0<2": rows with x < 2 go to the left
# ("yes") child and all others to the right. The partition therefore uses a
# strict inequality; "x <= 2" would move the row x == 2 to the wrong side and
# describe different leaves than the tree.
con = np.column_stack((X, y))
df = pd.DataFrame(con, columns=["x", "y"])

# Define split (same value as the threshold shown in the tree dump)
split_val = 2
df["split"] = np.where(df["x"] < split_val, f"x < {split_val}", f"x >= {split_val}")

# Count 0s and 1s per side. The reindex guarantees that both label columns
# exist even if one class is missing from the data.
counts = (
    df.groupby(["split", "y"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=[0, 1], fill_value=0)
)
counts.columns = ["y=0", "y=1"]
counts = counts.reset_index()

display(counts)

# Add totals and conditional probability of y=1 on a new frame, so the counts
# table displayed above is not mutated after it has been shown.
table = counts.assign(total=counts["y=0"] + counts["y=1"])
table["p(y=1)"] = table["y=1"] / table["total"]

# Render the probability as a whole-number percentage string
table["p(y=1)"] = (table["p(y=1)"] * 100).round(0).astype(int).astype(str) + "%"
display(table)

Unnamed: 0,split,y=0,y=1
0,x <= 2,2,1
1,x > 2,1,2


Unnamed: 0,split,y=0,y=1,total,p(y=1)
0,x <= 2,2,1,3,33%
1,x > 2,1,2,3,67%
