In [360]:
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from xgboost import plot_tree
from z3 import *
import numpy as np
import pandas as pd

In [361]:
# Ask Z3 to print rational numerals in decimal notation rather than as fractions.
set_option(rational_to_decimal=True)

# model

In [362]:
# import sys
# import os
# sys.path.append(os.path.abspath('../../../'))

# from model.xai_xgb_z3 import XGBoostExplainer

In [726]:
from z3 import *


class XGBoostExplainer:
    """Explain predictions of a fitted XGBoost classifier with Z3.

    The model's trees and the observed feature ranges are encoded as a Z3
    theory ``T``.  For one instance, ``explain`` greedily drops feature
    equalities that are not needed for ``T`` to entail the predicted class.

    Binary classification only, with ``base_score`` left as ``None``.
    ``data`` is X and ``labels`` is y.
    """

    def __init__(self, model, data, labels):
        """Encode the model and the feature bounds once, up front.

        Args:
            model (XGBoost): fitted xgboost model.
            data (DataFrame): feature frame (X or X_train); its column names
                become the names of the Z3 feature variables.
            labels (array-like): targets (y).  Any sequence is accepted; the
                mean is taken after converting to a NumPy array.
        """
        self.model = model
        self.data = data.values
        self.columns = data.columns
        self.T_constraints = self.feature_constraints_expression(self.data)
        self.T_model = self.model_trees_expression(self.model)
        self.T = And(self.T_model, self.T_constraints)
        # np.asarray lets plain lists/tuples work, not only arrays/Series.
        self.label_proportions = np.asarray(labels).mean()

    def explain(self, instance, reorder="asc"):
        """Return the feature equalities kept as the explanation of ``instance``.

        Args:
            instance: sequence of feature values, ordered like ``self.columns``.
            reorder: ``"asc"`` or ``"desc"`` to process features by ascending or
                descending model importance (zero-importance features are
                dropped first); any other value keeps the original order and
                all features.

        Returns:
            list: the Z3 equalities ``feature == value`` that could not be removed.

        Side effects:
            Stores the decision constraint in ``self.D`` and the instance
            equalities in ``self.I``.
        """
        self.D = self.decision_function_expression(
            self.model, [instance], self.label_proportions)
        self.I = self.instance_expression(instance)

        return self.explain_expression(self.I, self.T, self.D, self.model, reorder)

    def feature_constraints_expression(self, X):
        """Build ``min <= feature <= max`` for every column of ``X``.

        Args:
            X: 2-D array whose columns line up with ``self.columns``.

        Returns:
            z3.BoolRef: conjunction of one range constraint per column.
        """
        constraints = []

        for i in range(X.shape[1]):
            feature_values = X[:, i]
            # Named lower/upper so the builtins min/max are not shadowed.
            lower = RealVal(feature_values.min())
            upper = RealVal(feature_values.max())

            x = Real(self.columns[i])
            constraints.append(And(lower <= x, x <= upper))

        return And(*constraints)

    def model_trees_expression(self, model):
        """Encode every tree of the booster as root-to-leaf implications.

        The trees are read from ``model.get_booster().trees_to_dataframe()``.
        For each leaf the formula ``path conditions -> o_<tree>_0 == leaf value``
        is emitted (the leaf value is read from the ``Gain`` column).  A tree
        that consists of a single leaf contributes ``o_<tree>_0 == leaf value``
        directly.  The ``Yes`` branch is encoded as ``feature < split`` and the
        ``No`` branch as ``feature >= split``; the ``Missing`` column is not used.

        The class index in the variable name is always 0.

        Args:
            model: fitted model exposing ``get_booster()``.

        Returns:
            z3.BoolRef: conjunction of the formulas of all trees.
        """
        df = model.get_booster().trees_to_dataframe()
        class_index = 0  # if model.n_classes_ == 2:

        all_tree_formulas = []

        for tree_index in df["Tree"].unique():
            tree_df = df[df["Tree"] == tree_index]
            o = Real(f'o_{tree_index}_{class_index}')

            if len(tree_df) == 1 and tree_df.iloc[0]["Feature"] == "Leaf":
                leaf_value = tree_df.iloc[0]["Gain"]
                all_tree_formulas.append(And(o == leaf_value))
                continue

            # child ID -> (parent ID, split feature, split value, is "Yes" child).
            # Built once per tree so each path is a dictionary walk instead of
            # repeated DataFrame filtering.  setdefault keeps the first match.
            parent_of = {}
            for _, split_node in tree_df[tree_df["Feature"] != "Leaf"].iterrows():
                for branch in ("Yes", "No"):
                    parent_of.setdefault(
                        split_node[branch],
                        (split_node["ID"], split_node["Feature"],
                         split_node["Split"], branch == "Yes"),
                    )

            path_formulas = []
            for _, leaf in tree_df[tree_df["Feature"] == "Leaf"].iterrows():
                conditions = self._path_conditions(parent_of, leaf["ID"])
                path_formulas.append(
                    Implies(And(*conditions), o == leaf["Gain"]))

            all_tree_formulas.append(And(*path_formulas))

        return And(*all_tree_formulas)

    @staticmethod
    def _path_conditions(parent_of, node_id):
        """Return the split conditions from the root down to ``node_id``."""
        conditions = []
        while node_id in parent_of:
            parent_id, feature, split_value, is_yes_child = parent_of[node_id]
            x = Real(feature)
            conditions.append(
                x < split_value if is_yes_child else x >= split_value)
            node_id = parent_id
        # Collected leaf-to-root; callers expect root-to-leaf order.
        conditions.reverse()
        return conditions

    def tree_paths_expression(self, tree, tree_index, class_index):
        """Encode one tree given as arrays under ``tree.tree_``.

        Reads ``feature``, ``threshold``, ``value``, ``children_left`` and
        ``children_right`` from ``tree.tree_``; a node whose feature entry is
        ``-2`` is treated as a leaf.  Features are named ``x<index>`` here, not
        by column name.  Not called by the other methods of this class.

        Returns:
            z3.BoolRef: conjunction of ``path -> o_<tree>_<class> == leaf value``.
        """
        tree_ = tree.tree_
        feature = tree_.feature
        threshold = tree_.threshold
        value = tree_.value

        paths = []
        o = Real(f"o_{tree_index}_{class_index}")

        def traverse(node, path_conditions):
            if feature[node] == -2:
                leaf_value = value[node][0][0]
                paths.append(Implies(And(path_conditions), o == leaf_value))
                return

            x = Real(f"x{feature[node]}")
            traverse(tree_.children_left[node],
                     path_conditions + [x <= threshold[node]])
            traverse(tree_.children_right[node],
                     path_conditions + [x > threshold[node]])

        traverse(0, [])
        return And(*paths)

    def decision_function_expression(self, model, x, label_proportions):
        """Build the constraint stating that the model keeps its prediction.

        For the binary case the constraint is
        ``sum(o_<t>_0) + label_proportions < 0`` when the predicted class is 0
        and ``> 0`` otherwise.

        Args:
            model: fitted model exposing ``n_classes_``, ``predict`` and
                ``get_booster()``.
            x: batch holding the single instance (``explain`` passes ``[instance]``).
            label_proportions: constant added to the summed tree outputs.
                NOTE(review): the caller passes the mean label here; confirm
                that this equals the booster's base score in margin space for
                the XGBoost version and data in use.

        Returns:
            z3.BoolRef: the decision constraint.

        NOTE(review): the multi-class branch names variables ``o_<t>_<class>``,
        while ``model_trees_expression`` only defines ``o_<t>_0``; treat
        multi-class as unsupported until that is reconciled.
        """
        n_classes = 1 if model.n_classes_ <= 2 else model.n_classes_
        predicted_class = model.predict(x)[0]
        init_value = label_proportions
        # Loop-invariant: number of trees contributing to each class score.
        n_estimators = int(len(model.get_booster().get_dump()) / n_classes)

        equation_list = []
        for class_number in range(n_classes):
            estimator_list = [
                Real(f"o_{estimator_number}_{class_number}")
                for estimator_number in range(n_estimators)
            ]
            equation_list.append(Sum(estimator_list) + init_value)

        if n_classes <= 2:
            if predicted_class == 0:
                final_equation = equation_list[0] < 0
            else:
                final_equation = equation_list[0] > 0
        else:
            final_equation = [
                equation_list[predicted_class] > equation_list[class_number]
                for class_number in range(n_classes)
                if predicted_class != class_number
            ]

        return And(final_equation)

    def instance_expression(self, instance):
        """Return one ``feature == value`` equality per entry of ``instance``."""
        return [Real(self.columns[i]) == value
                for i, value in enumerate(instance)]

    def explain_expression(self, I, T, D, model, reorder):
        """Greedily minimize ``I`` while ``I and T`` still entails ``D``.

        Each feature equality is tentatively dropped; the drop is kept when
        ``Implies(And(remaining, T), D)`` is proved valid.

        Args:
            I: list of instance equalities, ordered like the model's features.
            T: theory (trees and feature bounds).
            D: decision constraint.
            model: fitted model exposing ``feature_importances_``.
            reorder: ``"asc"`` / ``"desc"`` sorts by importance and discards
                zero-importance features; anything else keeps ``I`` as is.

        Returns:
            list: the equalities that had to be kept, in processing order.
        """
        T_s = simplify(T)
        D_s = simplify(D)

        importances = model.feature_importances_
        non_zero_indices = np.where(importances != 0)[0]

        if reorder == "asc":
            order = non_zero_indices[np.argsort(importances[non_zero_indices])]
            explanation = [I[i] for i in order]
        elif reorder == "desc":
            order = non_zero_indices[np.argsort(-importances[non_zero_indices])]
            explanation = [I[i] for i in order]
        else:
            explanation = list(I)

        for feature in list(explanation):
            # Filter by identity: "==" on Z3 expressions builds a formula, so
            # list.remove / "in" must not be relied upon here.
            candidate = [kept for kept in explanation if kept is not feature]

            if self.is_proved(Implies(And(And(candidate), T_s), D_s)):
                explanation = candidate

        return explanation

    def is_proved(self, f):
        """Return True when ``f`` is valid, i.e. ``Not(f)`` is unsatisfiable.

        Any other solver answer (satisfiable or unknown) yields False.
        """
        s = Solver()
        s.add(Not(f))
        return s.check() == unsat


# pred

In [727]:
# Load Iris into a DataFrame so the feature names are carried into the model.
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Binarize the target: class 0 stays 0, every other class becomes 1.
y = (iris.target != 0).astype(int)
# X = X.iloc[:, :2]  # keep only the first two columns

# Hold out 20% of the rows; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=101
)

# A deliberately tiny ensemble so the Z3 encoding stays easy to read.
xgbc = XGBClassifier(n_estimators=3, max_depth=3, learning_rate=0.1)
xgbc.fit(X_train, y_train)

preds = xgbc.predict(X_test)
preds

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0])

In [728]:
# Grab the Booster object behind the fitted classifier and display it.
booster = xgbc.get_booster()
booster

<xgboost.core.Booster at 0x27698e64260>

In [729]:
# for i in range(3):
#        plot_tree(xgbc, num_trees=i)

In [730]:
# Text dump of each boosted tree, including the per-node statistics.
trees = booster.get_dump(with_stats=True)
for i, tree in enumerate(trees):
  print(f"Tree {i}:", tree, sep="\n")

Tree 0:
0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=108.977974,cover=26.8988876
	1:leaf=-0.265196383,cover=8.96629524
	2:leaf=0.143348008,cover=17.9325905

Tree 1:
0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=87.7270432,cover=26.6550579
	1:leaf=-0.225890428,cover=9.60763645
	2:leaf=0.136479303,cover=17.0474205

Tree 2:
0:[petal length (cm)<3] yes=1,no=2,missing=2,gain=72.5324249,cover=26.0368938
	1:leaf=-0.199127138,cover=9.92332363
	2:leaf=0.13069883,cover=16.1135693



In [731]:
# Same dump as a list of strings, one per tree, without the statistics.
booster.get_dump()

['0:[petal length (cm)<3] yes=1,no=2,missing=2\n\t1:leaf=-0.265196383\n\t2:leaf=0.143348008\n',
 '0:[petal length (cm)<3] yes=1,no=2,missing=2\n\t1:leaf=-0.225890428\n\t2:leaf=0.136479303\n',
 '0:[petal length (cm)<3] yes=1,no=2,missing=2\n\t1:leaf=-0.199127138\n\t2:leaf=0.13069883\n']

In [732]:
# Confirm the type of the object returned by get_booster().
print(type(booster))

<class 'xgboost.core.Booster'>


In [733]:
# First held-out row; it is the instance explained further down.
X_test.values[0]

array([5.5, 4.2, 1.4, 0.2])

# explain

In [734]:
# Build the explainer. Note that the full X / y are passed (not only the
# training split), so the feature bounds and the label mean come from all rows.
explainer = XGBoostExplainer(xgbc, X, y)

In [735]:
# Explain the first test row with the default reorder="asc"; the result is the
# list of feature equalities that could not be dropped.
explainer.explain(X_test.values[0])

[2]
[2]


[petal length (cm) == 1.4]

In [736]:
# Feature names; these are also the names of the Z3 feature variables.
X.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')

In [737]:
# Features, true label and model prediction for the explained row, one per line.
print(
    X_test.values[0],
    y_test[0],
    xgbc.predict([X_test.values[0]]),
    sep="\n",
)

[5.5 4.2 1.4 0.2]
0
[0]


In [738]:
# Sanity check of the explanation: select every row that shares the explained
# petal length and look at its label and at the model's prediction.
filter_condition = X["petal length (cm)"].eq(1.4)
filtered_X = X.loc[filter_condition]
filtered_y = y[filter_condition]

print("Entradas em X:", filtered_X, sep="\n")
print("\nCorrespondências em y:", filtered_y, sep="\n")
print("\n Predictions", xgbc.predict(filtered_X), sep="\n")

Entradas em X:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                 5.1               3.5                1.4               0.2
1                 4.9               3.0                1.4               0.2
4                 5.0               3.6                1.4               0.2
6                 4.6               3.4                1.4               0.3
8                 4.4               2.9                1.4               0.2
12                4.8               3.0                1.4               0.1
17                5.1               3.5                1.4               0.3
28                5.2               3.4                1.4               0.2
33                5.5               4.2                1.4               0.2
37                4.9               3.6                1.4               0.1
45                4.8               3.0                1.4               0.3
47                4.6               3.2                1.4   

## print expressions

In [739]:
# Tabular view of the trees; XGBoostExplainer.model_trees_expression reads this
# same DataFrame (leaf values are in the Gain column of the Leaf rows).
xgbc.get_booster().trees_to_dataframe()

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category
0,0,0,0-0,petal length (cm),3.0,0-1,0-2,0-2,108.977974,26.898888,
1,0,1,0-1,Leaf,,,,,-0.265196,8.966295,
2,0,2,0-2,Leaf,,,,,0.143348,17.93259,
3,1,0,1-0,petal length (cm),3.0,1-1,1-2,1-2,87.727043,26.655058,
4,1,1,1-1,Leaf,,,,,-0.22589,9.607636,
5,1,2,1-2,Leaf,,,,,0.136479,17.047421,
6,2,0,2-0,petal length (cm),3.0,2-1,2-2,2-2,72.532425,26.036894,
7,2,1,2-1,Leaf,,,,,-0.199127,9.923324,
8,2,2,2-2,Leaf,,,,,0.130699,16.113569,


In [740]:
# Per-feature min/max bounds as encoded for Z3.
print(explainer.T_constraints)

And(And(4.3 <= sepal length (cm), 7.9 >= sepal length (cm)),
    And(2 <= sepal width (cm), 4.4 >= sepal width (cm)),
    And(1 <= petal length (cm), 6.9 >= petal length (cm)),
    And(0.1 <= petal width (cm), 2.5 >= petal width (cm)))


In [741]:
# Z3 encoding of the trees: one implication per root-to-leaf path.
print(explainer.T_model)

And(And(Implies(And(petal length (cm) < 3),
                o_0_0 == -0.265196383),
        Implies(And(petal length (cm) >= 3),
                o_0_0 == 0.143348008)),
    And(Implies(And(petal length (cm) < 3),
                o_1_0 == -0.225890428),
        Implies(And(petal length (cm) >= 3),
                o_1_0 == 0.136479303)),
    And(Implies(And(petal length (cm) < 3),
                o_2_0 == -0.199127138),
        Implies(And(petal length (cm) >= 3),
                o_2_0 == 0.13069883)))


In [742]:
# Untransformed margin the model outputs for the explained row.
xgbc.predict([X_test.values[0]], output_margin=True)

array([-0.02354732], dtype=float32)

In [743]:
# Manual check: the three leaf values on the "petal length < 3" branch plus the
# label mean; compare with the margin printed by the previous cell.
-0.265196383 + -0.225890428 + -0.199127138 + y.mean()

np.float64(-0.023547282333333364)

In [744]:
# Label mean stored by the explainer and added to the summed tree outputs.
explainer.label_proportions

np.float64(0.6666666666666666)

In [747]:
# Decision constraint from the most recent explain() call
# (explainer.D only exists after explain() has run).
print(explainer.D)

And(o_0_0 + o_1_0 + o_2_0 + 0.6666666666? < 0)


In [746]:
# Instance equalities from the most recent explain() call
# (explainer.I only exists after explain() has run).
print(explainer.I)

[sepal length (cm) == 5.5, sepal width (cm) == 4.2, petal length (cm) == 1.4, petal width (cm) == 0.2]
