# Imports

In [1]:
# %pip install --upgrade z3-solver

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from z3 import *

In [3]:
set_option(rational_to_decimal=True)

# Z3 Functions

(I ^ T -> D)

## function I (instance expression)

In [4]:
def instance_expression(instance):
    formula = [Real(f'x{i}') == value for i, value in enumerate(instance)]
    return formula

## funcion T (tree leafs and constraints expression)

T = T_model ^ T_constraints

In [5]:
def feature_constraints_expression(X):
    constraints = []

    for i in range(X.shape[1]):
        feature_values = X[:, i]
        min_val, max_val = feature_values.min(), feature_values.max()

        x = Real(f'x{i}')
        min = RealVal(min_val)
        max = RealVal(max_val)

        constraint = And(min <= x, x <= max)
        constraints.append(constraint)

    return And(*constraints)

In [6]:
def tree_paths_expression(tree, tree_index, class_index):
    tree_ = tree.tree_
    feature = tree_.feature
    threshold = tree_.threshold
    value = tree_.value

    paths = []
    o = Real(f'o_{tree_index}_{class_index}')
    def traverse(node, path_conditions):

        if feature[node] == -2:
            leaf_value = value[node][0][0]
            path_formula = And(path_conditions)
            implication = Implies(path_formula, o == leaf_value)
            paths.append(implication)
        else:

            x = Real(f'x{feature[node]}')
            left_condition = x <= threshold[node]
            right_condition = x > threshold[node]
            traverse(tree_.children_left[node], path_conditions + [left_condition])
            traverse(tree_.children_right[node], path_conditions + [right_condition])

    traverse(0, [])
    return And(*paths)

In [7]:
def model_trees_expression(model):
    formulas = []
    for i, estimators in enumerate(model.estimators_):
        for class_index, estimator in enumerate(estimators):
            formula = tree_paths_expression(estimator, i, class_index)
            formulas.append(formula)
    return And(*formulas)

## function D (decision function result expression)

In [8]:
def decision_function_expression(model, x):
  learning_rate = model.learning_rate
  decision = model.decision_function(x)
  n_classes = 1 if model.n_classes_ <= 2 else model.n_classes_
  predicted_class = model.predict(x)[0]

  estimators = model.estimators_
  estimator_results = []
  for estimator in estimators:
    class_predictions = [tree.predict(x) for tree in estimator]
    estimator_results.append(class_predictions)

  estimator_sum = np.sum(estimator_results, axis=0) * learning_rate
  init_value = decision - estimator_sum.T

  equation_list = []
  for class_number in range(n_classes):
    estimator_list = []
    for estimator_number in range(len(estimators)):
      # print(f"o_{estimator_number}_{class_number}")
      o = Real(f'o_{estimator_number}_{class_number}')
      estimator_list.append(o)
    equation_o = Sum(estimator_list) * learning_rate + init_value[0][class_number]
    equation_list.append(equation_o)

  if n_classes <= 2:
    if predicted_class == 0:
      final_equation = equation_list[0] < 0
    else:
      final_equation = equation_list[0] > 0
  else:
    compare_equation = []
    for class_number in range(n_classes):
      if predicted_class != class_number:
        compare_equation.append(equation_list[predicted_class] > equation_list[class_number])
    final_equation = compare_equation

  return And(final_equation)

# Explaination Functions

In [9]:
def is_proved(f):
    s = Solver()
    s.add(Not(f))
    if s.check() == unsat:
        return True
    else:
        return False

In [10]:
def explain(I, T, D):
  X = I.copy()
  relevante = []

  for formula in I:
    X.remove(formula)

    if is_proved(Implies(And(And(X), T), D)):
      continue
      # print('proved')
    else:
      # print('not proved')
      X.append(formula)

  return X

In [11]:
def explain_instance(model, data, instance):
  I = instance_expression(instance)
  T_constraints = feature_constraints_expression(data)
  T_model = model_trees_expression(model)
  T = And(T_model, T_constraints)
  D = decision_function_expression(model, [instance])

  return explain(I, T, D)

In [12]:
# gb = GradientBoostingClassifier(n_estimators = 10)
# breast_cancer = load_breast_cancer()
# X_train, X_test, y_train, y_test = train_test_split(breast_cancer.data, breast_cancer.target, test_size=0.1, random_state=101)
# gb.fit(X_train, y_train)

In [13]:
# test = explain_instance(gb, breast_cancer.data, X_test[0])
# print(test)

## outro

In [14]:
def multiclass_initial_prediction(y, n_classes):
    class_counts = np.bincount(y, minlength=n_classes)
    class_probs = class_counts / len(y)

    log_odds = np.log(class_probs + 1e-15)
    return log_odds - log_odds.mean()

In [15]:
def binary_initial_prediction(y):
    pi = np.mean(y)

    initial_log_odds = np.log(pi / (1 - pi))

    return initial_log_odds

In [16]:
def compare_decision_function(model, X, initial_prediction=None):
    learning_rate = model.learning_rate
    estimators = model.estimators_
    estimator_results = []

    if initial_prediction is None and model.init_ != 'zero':
        print("Error - Missing initial_prediction")
        return None

    for estimator in estimators:
        class_predictions = [tree.predict(X) for tree in estimator]
        estimator_results.append(np.array(class_predictions).T)

    final_predictions = np.sum(estimator_results, axis=0) * learning_rate

    if model.init_ != 'zero':
            final_predictions += initial_prediction

    if np.isscalar(initial_prediction) and initial_prediction != None:
      final_predictions = final_predictions.flatten()

    if not np.allclose(final_predictions, model.decision_function(X)):
      print("Error - Deicison Function does not match")

    return final_predictions

In [17]:
def print_init_decision_function(model, X):
  X = [X[1]]

  decision = model.decision_function(X)
  print(decision)

  learning_rate = model.learning_rate
  estimators = model.estimators_
  estimator_results = []
  for estimator in estimators:
        class_predictions = [tree.predict(X) for tree in estimator]
        estimator_results.append(np.array(class_predictions).T)

  final_predictions = np.sum(estimator_results, axis=0) * learning_rate
  print(final_predictions)

  return decision - final_predictions

# Test Datasets
explicar todas as instancias - tamanho medio da explicação & desvio padrão, porcentagem de redução de features:
 - para binario
 - para multiclasse
 - dataset de imagens
 - reordenar as features e reavaliar

## Iris Multiclass

In [18]:
gb_iris = GradientBoostingClassifier()

In [19]:
iris = load_iris()
X_iris, y_iris = iris.data, iris.target
X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(X_iris, y_iris, test_size=0.1, random_state=101)

In [20]:
gb_iris.fit(X_iris_train, y_iris_train)

In [21]:
multiclass_explain = explain_instance(gb_iris, X_iris, X_iris_test[4])
multiclass_explain

[x2 == 4.7, x3 == 1.4]

In [22]:
X_iris_test[4], y_iris_test[4], gb_iris.predict([X_iris_test[4]])

(array([7. , 3.2, 4.7, 1.4]), np.int64(1), array([1]))

In [23]:
# check_explanation = []
# for i in range(len(X_iris)):
#   if X_iris[i][3] == 1.6:
#     check_explanation.append((i, X_iris[i], y_iris[i], gb_multiclass.predict([X_iris[i]])))
# check_explanation

In [24]:
iris_explain_sizes = pd.DataFrame(columns=['explain_size'])
iris_count_zeros = 0

for i in range(len(X_iris_test)):
  explain_size = len(explain_instance(gb_iris, X_iris, X_iris_test[i]))
  if explain_size == 0:
    iris_count_zeros += 1
  iris_explain_sizes.loc[len(iris_explain_sizes)] = [explain_size]

In [25]:
iris_explain_stat = pd.DataFrame({
    'mean': [iris_explain_sizes['explain_size'].mean()],
    'std': [iris_explain_sizes['explain_size'].std()],
    'count_zeros': [iris_count_zeros],
    'dataset_features': [iris.data.shape[1]],
})
iris_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,2.0,0.0,0,4


## Cancer Binary

In [26]:
gb_cancer = GradientBoostingClassifier(n_estimators = 10)

In [27]:
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target
X_cancer_train, X_cancer_test, y_cancer_train, y_cancer_test = train_test_split(X_cancer, y_cancer, test_size=0.1, random_state=101)

In [28]:
gb_cancer.fit(X_cancer_train, y_cancer_train)

In [29]:
cancer_explain_sizes = pd.DataFrame(columns=['explain_size'])
cancer_count_zeros = 0

for i in range(len(X_cancer_test)):
  explain_size = len(explain_instance(gb_cancer, X_cancer, X_cancer_test[i]))
  if explain_size == 0:
    cancer_count_zeros += 1
    print(i)
  cancer_explain_sizes.loc[len(cancer_explain_sizes)] = [explain_size]

In [30]:
cancer_explain_stat = pd.DataFrame({
    'mean': [cancer_explain_sizes['explain_size'].mean()],
    'std': [cancer_explain_sizes['explain_size'].std()],
    'count_zeros': [cancer_count_zeros],
    'dataset_features': [cancer.data.shape[1]],
})
cancer_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,5.54386,2.13031,0,30


## Wine Multiclass

In [31]:
gb_wine = GradientBoostingClassifier(n_estimators = 10)

In [32]:
wine = load_wine()
X_wine, y_wine = wine.data, wine.target
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(X_wine, y_wine, test_size=0.1, random_state=101)
set(wine.target)

{np.int64(0), np.int64(1), np.int64(2)}

In [33]:
gb_wine.fit(X_wine_train, y_wine_train)

In [34]:
wine_explain_sizes = pd.DataFrame(columns=['explain_size'])
wine_count_zeros = 0

for i in range(len(X_wine_test)):
  explain_size = len(explain_instance(gb_wine, X_wine, X_wine_test[i]))
  if explain_size == 0:
    wine_count_zeros += 1
    print(i)
  wine_explain_sizes.loc[len(wine_explain_sizes)] = [explain_size]

In [35]:
wine_explain_stat = pd.DataFrame({
    'mean': [wine_explain_sizes['explain_size'].mean()],
    'std': [wine_explain_sizes['explain_size'].std()],
    'count_zeros': [wine_count_zeros],
    'dataset_features': [wine.data.shape[1]],
})
wine_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,4.222222,0.942809,0,13


# Digits multiclass

In [36]:
from sklearn.datasets import load_digits

In [37]:
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
X_digits_train, X_digits_test, y_digits_train, y_digits_test = train_test_split(X_digits, y_digits, test_size=0.01, random_state=101)
set(digits.target)

{np.int64(0),
 np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7),
 np.int64(8),
 np.int64(9)}

In [38]:
gb_digits = GradientBoostingClassifier(n_estimators = 10)
gb_digits.fit(X_digits_train, y_digits_train)

In [39]:
digits_explain_sizes = pd.DataFrame(columns=['explain_size'])
digits_count_zeros = 0

for i in range(len(X_digits_test)):
  explain_size = len(explain_instance(gb_digits, X_digits, X_digits_test[i]))
  if explain_size == 0:
    digits_count_zeros += 1
    print(i)
  digits_count = i
  digits_explain_sizes.loc[len(digits_explain_sizes)] = [explain_size]

In [40]:
digits_explain_stat = pd.DataFrame({
    'mean': [digits_explain_sizes['explain_size'].mean()],
    'std': [digits_explain_sizes['explain_size'].std()],
    'count_zeros': [digits_count_zeros],
    'dataset_len': [digits_count],
    'dataset_features': [digits.data.shape[1]],
})
digits_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,17.611111,3.031965,0,17,64
