# imports

In [162]:
from z3 import *
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [163]:
set_option(rational_to_decimal=True)

# model

In [266]:
from z3 import *
import numpy as np


class XGBoostExplainer:
    """Z3-based explanations for a fitted XGBoost classifier.

    Only binary classification with ``base_score = None`` is supported: the
    tree encoding always names leaf-output variables ``o_<tree>_0``.

    Typical use::

        explainer = XGBoostExplainer(model, X)
        explainer.fit()
        explainer.explain(instance)
        explainer.explain_range(instance)

    Note: the attribute/method names ``caterogic_expressions``,
    ``cumulative_range_expresson`` and ``optmize_delta`` are misspelled but are
    kept as-is because external code reads them.
    """

    def __init__(self, model, data):
        """Store the model and the reference data.

        Args:
            model (XGBoost): fitted xgboost model.
            data (DataFrame): feature dataframe (X or X_train); its ``values``
                and ``columns`` are kept.
        """
        self.model = model
        self.data = data.values
        self.columns = data.columns
        # A column with at most this many distinct values is treated as
        # categorical (i.e. never widened into a range).
        self.max_categories = 2

    def fit(self):
        """Build the Z3 expressions of the model and detect categorical features.

        Z3 expressions are built here (not in ``__init__``) for pickle
        compatibility: call ``fit`` again after loading an exported pickle.
        """
        # get_delta_value parses decimal strings, so Z3 must print decimals.
        set_option(rational_to_decimal=True)

        self.categoric_features = self.get_categoric_features(self.data)
        self.T_model = self.model_trees_expression(self.model)
        self.T = self.T_model

    def explain(self, instance, reorder="asc"):
        """Return the list of ``feature == value`` equalities kept for *instance*.

        Args:
            instance: feature values, in the same order as ``self.columns``.
            reorder: ``"asc"`` / ``"desc"`` to test features by ascending /
                descending model importance; any other value keeps column order.
        """
        self.I = self.instance_expression(instance)
        self.D = self.decision_function_expression(self.model, [instance])
        return self.explain_expression(self.I, self.T, self.D, self.model, reorder)

    def get_categoric_features(self, data: np.ndarray):
        """Return the names of columns with at most ``max_categories`` distinct values.

        Args:
            data: 2-D array whose columns line up with ``self.columns``.
        """
        return [
            self.columns[i]
            for i in range(data.shape[1])
            if len(np.unique(data[:, i])) <= self.max_categories
        ]

    def feature_constraints(self, constraints=None):
        """TODO: not implemented; currently returns ``None``.

        Expected to receive user-provided feature limits, planned format:
        matrix/dataframe of [feature, min/max, value].
        """
        return

    def model_trees_expression(self, model):
        """Encode every tree of the booster as a Z3 formula.

        For each tree, every root-to-leaf path becomes
        ``Implies(And(path conditions), o_<tree>_0 == leaf value)``. A tree that
        is a single leaf is encoded directly as ``o_<tree>_0 == leaf value``.
        Split thresholds are rounded to 4 decimals and the dataframe used is
        stored in ``self.booster_df``. The ``Missing`` branch is not encoded.

        Args:
            model: fitted XGBoost sklearn-style model (must expose
                ``get_booster()``).

        Returns:
            z3.ExprRef: conjunction of the formulas of all trees.
        """
        booster = model.get_booster()
        df = booster.trees_to_dataframe()
        if booster.feature_names is None:
            # Model trained without names: map f0, f1, ... to the column names.
            feature_map = {f"f{i}": col for i, col in enumerate(self.columns)}
            df["Feature"] = df["Feature"].replace(feature_map)

        df["Split"] = df["Split"].round(4)
        self.booster_df = df
        class_index = 0  # binary classification only
        all_tree_formulas = []

        for tree_index in df["Tree"].unique():
            tree_df = df[df["Tree"] == tree_index]
            o = Real(f"o_{tree_index}_{class_index}")

            if len(tree_df) == 1 and tree_df.iloc[0]["Feature"] == "Leaf":
                leaf_value = tree_df.iloc[0]["Gain"]
                all_tree_formulas.append(And(o == leaf_value))
                continue

            # child ID -> (parent row, True when the child is the "Yes" branch).
            # Built once per tree instead of scanning the dataframe per node.
            parent_of = {}
            for _, row in tree_df.iterrows():
                if row["Feature"] == "Leaf":
                    continue  # leaves have no children (Yes/No are NaN)
                parent_of.setdefault(row["Yes"], (row, True))
                parent_of.setdefault(row["No"], (row, False))

            path_formulas = []
            for _, node in tree_df[tree_df["Feature"] == "Leaf"].iterrows():
                conditions = self._path_conditions(parent_of, node["ID"])
                path_formulas.append(
                    Implies(And(*conditions), o == node["Gain"])
                )

            all_tree_formulas.append(And(*path_formulas))
        return And(*all_tree_formulas)

    def _path_conditions(self, parent_of, node_id):
        """Return the split conditions from the root down to *node_id*."""
        conditions = []
        while node_id in parent_of:
            parent, is_yes_branch = parent_of[node_id]
            x = Real(parent["Feature"])
            # "Yes" child is taken when feature < split, "No" child otherwise.
            if is_yes_branch:
                conditions.append(x < parent["Split"])
            else:
                conditions.append(x >= parent["Split"])
            node_id = parent["ID"]
        conditions.reverse()  # collected leaf-to-root; report root-to-leaf
        return conditions

    def decision_function_expression(self, model, x):
        """Build the Z3 constraint stating that the model keeps its prediction.

        Requires ``self.I`` and ``self.T`` to be set. The constant offset of the
        margin is recovered as ``predict(output_margin=True)`` minus the sum of
        the leaf outputs that ``self.I`` and ``self.T`` force for this instance.

        Args:
            model: fitted XGBoost sklearn-style model.
            x: a batch containing the single instance, e.g. ``[instance]``.

        Returns:
            z3.BoolRef: ``margin < 0`` / ``margin > 0`` for the binary case, or
            the conjunction "predicted class margin beats every other class".
        """
        n_classes = 1 if model.n_classes_ <= 2 else model.n_classes_
        predicted_class = model.predict(x)[0]
        n_estimators = len(model.get_booster().get_dump())

        estimator_pred = Solver()
        estimator_pred.add(self.I)
        estimator_pred.add(self.T)
        variables = [Real(f"o_{i}_0") for i in range(n_estimators)]
        if estimator_pred.check() == sat:
            solvermodel = estimator_pred.model()
            total_sum = sum(
                float(solvermodel.eval(var).as_fraction()) for var in variables
            )
        else:
            total_sum = 0
            print("estimator error")
        init_value = model.predict(x, output_margin=True)[0] - total_sum

        trees_per_class = int(n_estimators / n_classes)
        equation_list = []
        for class_number in range(n_classes):
            estimator_list = [
                Real(f"o_{estimator_number}_{class_number}")
                for estimator_number in range(trees_per_class)
            ]
            equation_list.append(Sum(estimator_list) + init_value)

        if n_classes <= 2:
            if predicted_class == 0:
                final_equation = equation_list[0] < 0
            else:
                final_equation = equation_list[0] > 0
        else:
            compare_equation = [
                equation_list[predicted_class] > equation_list[class_number]
                for class_number in range(n_classes)
                if predicted_class != class_number
            ]
            final_equation = And(compare_equation)

        return final_equation

    def instance_expression(self, instance):
        """Return one ``Real(column) == value`` equality per feature of *instance*."""
        return [Real(self.columns[i]) == value for i, value in enumerate(instance)]

    def explain_expression(self, I, T, D, model, reorder):
        """Greedily drop equalities from *I* while ``I and T`` still implies *D*.

        Args:
            I: list of instance equalities (not modified).
            T: tree formula.
            D: decision constraint.
            model: fitted model; its ``feature_importances_`` drive the order.
            reorder: ``"asc"`` or ``"desc"`` sorts by importance and discards
                zero-importance features; anything else keeps *I* as given.

        Returns:
            list: the equalities that could not be dropped.
        """
        i_expression = list(I)

        importances = model.feature_importances_
        non_zero_indices = np.where(importances != 0)[0]

        if reorder in ("asc", "desc"):
            sort_keys = importances[non_zero_indices]
            if reorder == "desc":
                sort_keys = -sort_keys
            sorted_feature_indices = non_zero_indices[np.argsort(sort_keys)]
            i_expression = [i_expression[i] for i in sorted_feature_indices]

        for feature in list(i_expression):
            i_expression.remove(feature)
            # Keep the feature only when dropping it breaks the implication.
            if not self.is_proved(Implies(And(And(i_expression), T), D)):
                i_expression.append(feature)
        return i_expression

    def is_proved(self, f):
        """Return True when formula *f* is valid (its negation is unsat)."""
        s = Solver()
        s.add(Not(f))
        return s.check() == unsat

    def delta_expression(self, expression):
        """Placeholder; currently returns ``None``."""
        return  # delta_expressions

    def get_deltas(self, exp):
        """Compute, per continuous feature of *exp*, how far it may move.

        Relies on ``self.cumulative_range_expresson`` and
        ``self.caterogic_expressions`` having been reset by ``explain_range``
        and on ``self.T`` / ``self.D`` being set.

        Args:
            exp: list of Z3 equalities, or of strings shaped ``"name == value"``.

        Returns:
            list: ``[expression, delta_lower, delta_upper]`` entries; a delta is
            ``None`` when that side is unbounded. Also stored in
            ``self.delta_list``.
        """
        if exp and isinstance(exp[0], str):
            expz3 = []
            for token in exp:
                tokens = token.split(" == ")
                expz3.append(Real(tokens[0]) == (tokens[1]))
            exp = expz3

        # Categorical features stay fixed; only the others get a range.
        continuous = []
        for expression in exp:
            if str(expression.arg(0)) in self.categoric_features:
                self.caterogic_expressions.append(expression)
            else:
                self.cumulative_range_expresson.append(expression)
                continuous.append(expression)

        delta_list = []
        for expression in continuous:
            # Replace this feature's current constraint by the range found
            # below; ranges of already-processed features remain in force.
            self.cumulative_range_expresson = [
                expr
                for expr in self.cumulative_range_expresson
                if not expr.eq(expression)
            ]
            lower_min, upper_min = self.optmize_delta(expression)

            if lower_min is not None:
                delta_value_lower = self.get_delta_value(str(lower_min.value()))
                self.cumulative_range_expresson.append(
                    expression.arg(0) >= expression.arg(1) - delta_value_lower
                )
            else:
                # unsat == open range on the lower side
                delta_value_lower = None

            if upper_min is not None:
                delta_value_upper = self.get_delta_value(str(upper_min.value()))
                self.cumulative_range_expresson.append(
                    expression.arg(0) <= expression.arg(1) + delta_value_upper
                )
            else:
                # unsat == open range on the upper side
                delta_value_upper = None

            delta_list.append([expression, delta_value_lower, delta_value_upper])

        self.delta_list = delta_list
        return delta_list

    def get_delta_value(self, value):
        """Convert the textual optimum of a delta into a usable float.

        Args:
            value (str): ``str()`` of the Z3 objective value, e.g. ``"1.6"``,
                ``"1.59 + epsilon"``, ``"epsilon"`` or ``"0.3333333333?"``.

        Returns:
            float | int: the numeric part for ``"v + epsilon"``, ``0`` for
            ``"epsilon"`` or ``"0"``, otherwise ``v - 0.01`` rounded to two
            decimals and never below zero.
        """
        if "+ epsilon" in value:
            # Z3 marks truncated decimals with a trailing "?"; float() rejects it.
            delta_value = float(value.split(" + ")[0].rstrip("?"))
        elif "epsilon" == value:
            delta_value = 0
        elif "0" == value:
            print("ERROR: delta == 0, explanation is incorrect")
            delta_value = 0
        else:
            # The optimum itself already flips the decision, so step back by
            # 0.01; clamp so a tiny optimum cannot yield a negative delta.
            delta_value = max(round(float(value.rstrip("?")) - 0.01, 2), 0.0)

        return delta_value

    def optmize_delta(self, expression):
        """Find the smallest lower/upper deviation of a feature that breaks D.

        Two independent minimisations are run over ``Not(self.D)``: one with
        ``delta_upper == 0`` (minimising ``delta_lower``) and one with
        ``delta_lower == 0`` (minimising ``delta_upper``).

        Args:
            expression: Z3 equality ``feature == value``.

        Returns:
            tuple: ``(lower_min, upper_min)`` Z3 objective handles; an entry is
            ``None`` when the corresponding problem is not satisfiable.
        """
        delta_upper = Real("delta_upper")
        delta_lower = Real("delta_lower")

        self.delta_features = []

        self.delta_expressions = [
            expression.arg(0) >= expression.arg(1) - delta_lower,
            expression.arg(0) <= expression.arg(1) + delta_upper,
        ]

        expression_list = [
            And(self.cumulative_range_expresson),
            And(self.caterogic_expressions),
            And(self.delta_expressions),
            self.T,
            Not(self.D),
            delta_upper >= 0,
            delta_lower >= 0,
        ]

        opt_lower = Optimize()
        opt_lower.add(And(expression_list))
        opt_lower.add(delta_upper == 0)
        lower_min = opt_lower.minimize(delta_lower)
        if opt_lower.check() != sat:
            lower_min = None

        opt_upper = Optimize()
        opt_upper.add(And(expression_list))
        opt_upper.add(delta_lower == 0)
        upper_min = opt_upper.minimize(delta_upper)
        if opt_upper.check() != sat:
            upper_min = None

        return lower_min, upper_min

    def explain_range(self, instance, reorder="asc", dataset_bounds=True, exp=None):
        """Return the explanation of *instance* as readable range strings.

        Args:
            instance: feature values, in the same order as ``self.columns``.
            reorder: forwarded to ``explain`` when *exp* is not given.
            dataset_bounds: when true, clip each bound to the min/max of that
                column in ``self.data``.
            exp: optional precomputed explanation (Z3 equalities or
                ``"name == value"`` strings). When given, the instance and
                decision expressions are still rebuilt from *instance* so the
                ranges are not computed against a previous ``explain`` call.

        Returns:
            list[str]: items shaped ``"lo <= x <= hi"``, ``"x <= hi"``,
            ``"x >= lo"`` or ``"x == v"``; *exp* itself when it is empty.
        """
        self.cumulative_range_expresson = []
        self.caterogic_expressions = []
        self.range_metric = 0
        if exp is None:
            exp = self.explain(instance, reorder)
        elif instance is not None:
            self.I = self.instance_expression(instance)
            self.D = self.decision_function_expression(self.model, [instance])

        if not exp:
            return exp

        delta_list = self.get_deltas(exp)
        range_exp = []
        for expression, delta_lower, delta_upper in delta_list:
            expname = str(expression.arg(0))
            expvalue = float(expression.arg(1).as_fraction())

            lower = None
            upper = None
            if delta_lower is not None:
                lower = round(expvalue - delta_lower, 2)
            if delta_upper is not None:
                upper = round(expvalue + delta_upper, 2)

            if dataset_bounds:
                idx = list(self.columns).index(expname)
                min_idx = np.min(self.data[:, idx])
                max_idx = np.max(self.data[:, idx])
                if lower is not None and lower < min_idx:
                    lower = min_idx
                if upper is not None and upper > max_idx:
                    upper = max_idx

            if lower == upper:
                # Also reached when both sides are None.
                range_exp.append(f"{expression.arg(0)} == {expression.arg(1)}")
            elif lower is None:
                range_exp.append(f"{expname} <= {upper}")
            elif upper is None:
                range_exp.append(f"{expname} >= {lower}")
            else:
                range_exp.append(f"{lower} <= {expname} <= {upper}")

        for expression in self.caterogic_expressions:
            range_exp.append(f"{expression.arg(0)} == {expression.arg(1)}")

        return range_exp


# test Iris

In [258]:
class ColumnEncoderDecoder:
    """Rename dataframe columns to x0, x1, ..., xn and translate text back."""

    def __init__(self):
        # placeholder name ("x0", ...) -> original column name; set by encode()
        self.mapping = {}

    def encode(self, df):
        """Return a copy of *df* whose columns are renamed to x0, x1, ..., xn.

        Columns are renamed by position, so duplicated original names each get
        their own placeholder. The input dataframe is left untouched.
        """
        self.mapping = {f"x{i}": col for i, col in enumerate(df.columns)}
        df_encoded = df.copy()
        df_encoded.columns = list(self.mapping)
        return df_encoded

    def decode(self, text):
        """Replace the placeholders x0, x1, ..., xn in *text* by the original names.

        Only whole tokens are replaced, in a single pass: ``x1`` is not
        substituted inside ``x10``, and an original name that itself contains a
        placeholder is not rewritten a second time.
        """
        import re

        if not self.mapping:
            return text

        # Longest placeholders first so the alternation prefers "x10" to "x1".
        placeholders = sorted(self.mapping, key=len, reverse=True)
        pattern = re.compile(
            r"(?<!\w)(?:" + "|".join(re.escape(p) for p in placeholders) + r")(?!\w)"
        )
        return pattern.sub(lambda match: str(self.mapping[match.group(0)]), text)

In [259]:
# Load the iris data into a dataframe (features) and an array (labels).
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

# y = np.where(y == 0, 0, 1)  # converts to binary
# Relabel class 2 as class 0, in place on iris.target, so that only two
# label values remain for the binary-only explainer.
y[y == 2] = 0
# X = X.iloc[:, :2] # keeps only the first columns of the df

# Rename the columns to x0, x1, ... so explanations use short names.
encoder_decoder = ColumnEncoderDecoder()
X = encoder_decoder.encode(X)

# Quick manual check of decode(); the result is not used further down.
encoded_text = "decode de x1"
decoded_text = encoder_decoder.decode(encoded_text)

# print("\nTexto decodificado:")
# print(decoded_text)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101
)

xgbc = XGBClassifier(n_estimators=100, max_depth=3, learning_rate=0.1)
xgbc.fit(X_train, y_train)

# Predictions on the held-out split; the bare name displays them in the notebook.
preds = xgbc.predict(X_test)
preds

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 0, 0, 0])

In [260]:
# ['5.51 <= x2 <= 5.69', '6.01 <= x0 <= 6.19', '2.51 <= x1 <= 2.69']

In [261]:
xgbc.predict([X_test.values[0]])
X_test.values[0]

array([5.5, 4.2, 1.4, 0.2])

In [267]:
explainer = XGBoostExplainer(xgbc, X)
explainer.fit()

In [270]:
sample = [5.5, 4.2, 1.4, 0.2]
exp = explainer.explain(sample, reorder="asc")
print(exp, type(exp[0]))

print(explainer.explain_range(sample, reorder="asc", exp=exp))

[x2 == 1.4] <class 'z3.z3.BoolRef'>
['x2 <= 2.99']


In [271]:
expstr = ['x2 == 1.4']
print(expstr, type(expstr[0]))
    
print(explainer.explain_range(sample, reorder="asc", exp=expstr))

['x2 == 1.4'] <class 'str'>
['x2 <= 2.99']


# range test all samples

In [202]:
sample = [5.5, 4.2, 1.4, 0.2]
print(explainer.explain(sample, reorder="asc"))
print(explainer.explain_range(sample, reorder="asc"))

[x2 == 1.4]
['x2 <= 2.99']


In [181]:
explainer.explain_range(X_test.values[19], reorder="asc")

['x1 <= 2.69', '6.0 <= x0 <= 6.29', 'x2 >= 5.2']

In [182]:
explainer.explain_range(X_test.values[0], reorder="desc")

['x2 <= 2.99']

In [183]:
filtered_df = X[
    (X["x2"] >= 5.2) & (X["x0"] >= 6) & (X["x0"] <= 6.29) & (X["x1"] <= 2.69)
]

xgbc.predict(filtered_df)

array([0])

In [184]:
xgbc.predict(X[X["x2"] <= 2.99])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

In [185]:
explainer.cumulative_range_expresson

[x2 <= 1.4 + 1.59]

In [186]:
explainer.explain_range(X_test.values[19], reorder="desc")

['x2 >= 5.2', '6.0 <= x0 <= 6.29', 'x1 <= 2.69']

In [187]:
copiaexp = explainer.cumulative_range_expresson
print(copiaexp)

[x2 >= 5.6 - 0.4, x0 >= 6.1 - 0.1, x0 <= 6.1 + 0.19, x1 <= 2.6 + 0.09]


In [188]:
copiaexp = [expr for expr in copiaexp if not expr.eq(Real("x2") == 5.6)]
print(copiaexp)  # Saída esperada: [x0 == 6.1, x1 == 2.6]

[x2 >= 5.6 - 0.4, x0 >= 6.1 - 0.1, x0 <= 6.1 + 0.19, x1 <= 2.6 + 0.09]


In [189]:
copiaexp = list(filter(lambda expr: not expr.eq(Real("x2") == 5.6), copiaexp))
print(copiaexp)

[x2 >= 5.6 - 0.4, x0 >= 6.1 - 0.1, x0 <= 6.1 + 0.19, x1 <= 2.6 + 0.09]


In [190]:
xgbc.feature_importances_

array([0.03846328, 0.02926282, 0.36015186, 0.57212204], dtype=float32)

In [191]:
["None <= x3 <= 1.69", "3.0 <= x2 <= 4.89"]

filtered_df = X[(X["x2"] >= 3) & (X["x2"] <= 4.89) & (X["x3"] <= 1.69)]

xgbc.predict(filtered_df)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1])

In [192]:
# Explain every test row (most important features first) and record the
# explainer's range_metric after each call.
range_metric_list = []
for test_row in X_test.values:
    explanation = explainer.explain_range(test_row, reorder="desc")
    print(explanation)
    range_metric_list.append(explainer.range_metric)

['x2 <= 2.99']
['x2 <= 2.99']
['x2 <= 2.99']
['1.6 <= x3 <= 1.69', 'x2 >= 3.0', 'x0 >= 6.3', 'x1 >= 3.0']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 >= 1.8']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x2 >= 5.0', 'x0 >= 6.0', 'x1 <= 2.59']
['x2 <= 2.99']
['x3 >= 1.8']
['x2 <= 2.99']
['x2 <= 2.99']
['x2 >= 5.2', 'x0 >= 6.0', '2.8 <= x1 <= 2.99']
['x3 >= 1.8']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x2 <= 2.99']
['x2 >= 5.2', '6.0 <= x0 <= 6.29', 'x1 <= 2.69']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x2 <= 2.99']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 <= 1.69', '3.0 <= x2 <= 4.89']
['x3 >= 1.8']
['x2 <= 2.99']
['x2 <= 2.99']


In [193]:
print("sum:", np.sum(range_metric_list), "mean:", np.mean(range_metric_list))

sum: 0 mean: 0.0


In [194]:
# Same sweep over the test rows, testing the least important features first.
for test_row in X_test.values:
    explanation = explainer.explain_range(test_row, reorder="asc")
    print(explanation)

['x2 <= 2.99']
['x2 <= 2.99']
['x2 <= 2.99']
['x1 >= 3.0', 'x0 >= 6.3', 'x2 >= 3.0', '1.6 <= x3 <= 1.69']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['x3 >= 1.8']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['x3 >= 1.8']
['x2 <= 2.99']
['x3 >= 1.8']
['x2 <= 2.99']
['x2 <= 2.99']
['x3 >= 1.8']
['x3 >= 1.8']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['x2 <= 2.99']
['x1 <= 2.69', '6.0 <= x0 <= 6.29', 'x2 >= 5.2']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['x2 <= 2.99']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['3.0 <= x2 <= 4.99', 'x3 <= 1.59']
['x3 >= 1.8']
['x2 <= 2.99']
['x2 <= 2.99']


# check correct

In [195]:
def check_correct_explanation(exp, explainer):
    """Search for a counterexample to a range explanation.

    The features named in *exp* are constrained to their explained
    range/value. Every other feature of ``explainer.I`` is either fixed (when
    categorical) or allowed to move by a shared ``delta`` around its instance
    value. The solver then looks for the smallest ``delta`` for which the
    trees (``explainer.T_model``) violate the decision (``explainer.D``).

    Args:
        exp: iterable of explanation items (strings or Z3 expressions whose
            ``str()`` has one of the forms ``"lo <= x <= hi"``, ``"x <= hi"``,
            ``"x >= lo"`` or ``"x == v"``).
        explainer: an explainer on which ``fit()`` and ``explain()`` /
            ``explain_range()`` were already called (reads ``I``, ``T``,
            ``T_model``, ``D``, ``columns`` and ``categoric_features``).

    Returns:
        tuple: ``(printlist, exprange_z3, deltaexp, explainer.T,
        Not(explainer.D))`` where ``printlist`` is ``["unsat == correct"]`` when
        no counterexample exists, otherwise the counterexample's feature
        assignments followed by the minimal delta.

    Raises:
        ValueError: if an item of *exp* has none of the supported forms.
    """
    opt = Optimize()
    delta = Real("delta")

    exprange_z3 = []
    exptokens = []
    for item in exp:
        item = str(item)
        if " == " in item:
            name, value = item.split(" == ")
            exprange_z3.append(Real(name) == RealVal(value))
        elif " >= " in item:
            # one-sided range with only a lower bound
            name, lower = item.split(" >= ")
            exprange_z3.append(Real(name) >= RealVal(lower))
        elif " <= " in item:
            tokens = item.split(" <= ")
            if len(tokens) == 3:
                lower, name, upper = tokens
                exprange_z3.append(Real(name) >= RealVal(lower))
            elif len(tokens) == 2:
                # one-sided range with only an upper bound
                name, upper = tokens
            else:
                raise ValueError(f"unsupported explanation item: {item!r}")
            exprange_z3.append(Real(name) <= RealVal(upper))
        else:
            raise ValueError(f"unsupported explanation item: {item!r}")
        exptokens.append(name)
    opt.add(exprange_z3)

    deltaexp = []
    for item in explainer.I:
        tokens = str(item).split(" == ")
        name, value = tokens[0], tokens[1]
        if name in exptokens:
            continue  # already constrained by the explanation itself
        if name in explainer.categoric_features:
            deltaexp.append(Real(name) == RealVal(value))
        else:
            deltaexp.append(Real(name) >= RealVal(value) - delta)
            deltaexp.append(Real(name) <= RealVal(value) + delta)
    opt.add(deltaexp)

    opt.add(explainer.T_model)
    opt.add(Not(explainer.D))
    opt.add(delta >= 0)
    opt.minimize(delta)

    printlist = []
    if opt.check() == sat:
        solution = opt.model()
        for var in solution:
            if str(var) in explainer.columns:
                printlist.append(f"{var} = {solution[var]}")
        printlist.append(f"delta = {solution.eval(delta)}")
    else:
        printlist.append("unsat == correct")
    return printlist, exprange_z3, deltaexp, explainer.T, Not(explainer.D)

In [196]:
# count = 0
# explanationstest = []
# lista_results = []
# for i in range(0, len(X)):
#     exprange = explainer.explain_range(X.values[i])
#     explanationstest.append(exprange)
#     ans, ansrange, ansdelta, anst, ansnotd = check_correct_explanation(
#         exprange, explainer
#     )
#     if ans[0] == ("unsat == correct"):
#         count += 1
#     else:
#         lista_results.append([i, ans])
#         print(exprange)
#         print(i, ans)
# count, len(X)

In [197]:
explainer.explain(X.values[70])

[x3 == 1.8]

In [198]:
# import numpy as np


# def fazer_predicoes(xgbc, X, lista_results):
#     for i, features in lista_results:
#         # Extrai os valores das features ignorando 'delta'
#         valores_dict = {
#             f.split(" = ")[0]: float(f.split(" = ")[1])
#             for f in features
#             if not f.startswith("delta")
#         }

#         # Ordena corretamente em [x0, x1, x2, x3]
#         valores = np.array(
#             [
#                 [
#                     valores_dict["x0"],
#                     valores_dict["x1"],
#                     valores_dict["x2"],
#                     valores_dict["x3"],
#                 ]
#             ]
#         )

#         # Faz as previsões
#         pred1 = xgbc.predict(X.values[i].reshape(1, -1))[0]  # Do dataset original
#         pred2 = xgbc.predict(valores)[0]  # Dos valores extraídos e organizados

#         # Print do resultado lado a lado
#         print(
#             f"Índice {i}: Predição original = {pred1}, Predição valores extraídos = {pred2}"
#         )


# # Exemplo de chamada da função (supondo que 'xgbc' e 'X' já estejam definidos)
# fazer_predicoes(xgbc, X, lista_results)

In [199]:
# Train a second model on the raw numpy values (no column names) to inspect
# how the booster labels features in that case.
xgb_noname = XGBClassifier()
xgb_noname.fit(X_train.values, y_train)

# Show only the split nodes whose feature is labelled "f0".
tree_df = xgb_noname.get_booster().trees_to_dataframe()
tree_df[tree_df["Feature"] == "f0"]

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category
45,5,4,5-4,f0,6.1,5-7,5-8,5-8,0.201748,3.858635,
87,10,1,10-1,f0,5.5,10-3,10-4,10-4,1.280823,5.402,
101,12,1,12-1,f0,5.5,12-3,12-4,12-4,0.884022,4.041253,
108,13,1,13-1,f0,5.2,13-3,13-4,13-4,0.630678,3.950562,
122,15,1,15-1,f0,5.5,15-3,15-4,15-4,0.497514,3.119324,
127,16,1,16-1,f0,5.5,16-3,16-4,16-4,0.350619,3.297701,
137,18,1,18-1,f0,5.8,18-3,18-4,18-4,0.306577,2.872401,
142,19,1,19-1,f0,5.5,19-3,19-4,19-4,0.220266,3.097914,
147,20,1,20-1,f0,5.9,20-3,20-4,20-4,0.205819,2.747463,
152,21,1,21-1,f0,5.5,21-3,21-4,21-4,0.147528,2.994935,


In [200]:
X_train.columns

Index(['x0', 'x1', 'x2', 'x3'], dtype='object')

In [201]:
tree_df = xgbc.get_booster().trees_to_dataframe()
tree_df

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category
0,0,0,0-0,x2,3.0,0-1,0-2,0-2,25.037951,26.302439,
1,0,1,0-1,Leaf,,,,,-0.132875,8.767480,
2,0,2,0-2,x3,1.8,0-3,0-4,0-4,65.942047,17.534960,
3,0,3,0-3,Leaf,,,,,0.245933,8.767480,
4,0,4,0-4,x2,5.1,0-5,0-6,0-6,0.366725,8.767480,
...,...,...,...,...,...,...,...,...,...,...,...
631,99,0,99-0,x2,5.0,99-1,99-2,99-2,0.096660,4.017835,
632,99,1,99-1,x0,5.8,99-3,99-4,99-4,0.109326,2.625185,
633,99,2,99-2,Leaf,,,,,-0.016689,1.392650,
634,99,3,99-3,Leaf,,,,,-0.007084,1.545399,
