# Imports

In [2]:
import sys
import os
sys.path.append(os.path.abspath('../../../'))

from model.xai_gb_z3 import Explainer

In [3]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.datasets import load_breast_cancer
from sklearn.datasets import load_wine
from sklearn.datasets import load_digits
from pmlb import fetch_data
from z3 import *

In [4]:
set_option(rational_to_decimal=True)

# test model

In [10]:
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = iris.target

y = np.where(y == 0, 0, 1) # converte em binario
# X = X.iloc[:, :2] # corta colunas do df

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,  random_state=101)

gb = GradientBoostingClassifier(n_estimators=3, max_depth=3, learning_rate=0.1)
gb.fit(X_train, y_train)

preds = gb.predict(X_test)
preds

array([0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0])

In [12]:
explainer = Explainer(gb, X.values)

In [13]:
explainer.explain(X_test.values[0])



[x0 == 5.5, x1 == 4.2]

# Test Datasets
explicar todas as instancias - tamanho medio da explicação & desvio padrão, porcentagem de redução de features:
 - para binario
 - para multiclasse
 - dataset de imagens
 - reordenar as features e reavaliar

## Auto (multiclass)

In [4]:
gb_auto = GradientBoostingClassifier(n_estimators=1, max_depth=1)

In [5]:
auto_data = fetch_data('auto')
print(auto_data.shape)
auto_data.head()

(202, 26)


Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,target
0,26,11,1,0,2,2,1,0,93.7,157.3,...,1,4,19,9.4,42,16,31.0,38.0,119,2
1,3,17,1,1,1,3,1,0,99.1,186.6,...,5,25,10,9.0,23,16,19.0,26.0,70,2
2,17,19,1,0,2,0,2,0,98.4,176.2,...,5,30,28,9.3,9,8,24.0,30.0,62,2
3,19,9,1,0,2,0,2,0,96.6,180.3,...,5,22,12,8.3,21,7,16.0,18.0,92,3
4,51,20,1,0,2,0,1,0,94.5,159.3,...,5,13,24,8.5,54,16,24.0,29.0,15,3


In [6]:
set(auto_data.target)

{-1, 0, 1, 2, 3}

In [7]:
auto_data = auto_data[auto_data['target'] <= 0]
auto_data.shape

(89, 26)

In [8]:
X_auto = auto_data.drop(columns=['target']).values
y_auto = auto_data['target'].values
X_auto_train, X_auto_test, y_auto_train, y_auto_test = train_test_split(
    X_auto, y_auto, test_size=0.1, random_state=101)
gb_auto.fit(X_auto_train, y_auto_train)

In [9]:
auto_explainer = Explainer(gb_auto, X_auto)

In [10]:
auto_explain_sizes = pd.DataFrame(columns=['explain_size'])
auto_count_zeros = 0

for i in range(len(X_auto_test)):
    explain_size = len(auto_explainer.explain(X_auto_test[i], reorder='asc'))
    if explain_size == 0:
        auto_count_zeros += 1
        print(i)
    auto_count = i
    auto_explain_sizes.loc[len(auto_explain_sizes)] = [explain_size]

auto_explain_stat = pd.DataFrame({
    'mean': [auto_explain_sizes['explain_size'].mean()],
    'std': [auto_explain_sizes['explain_size'].std()],
    'count_zeros': [auto_count_zeros],
    'dataset_len': [auto_count],
    'dataset_features': [X_auto.shape[1]],
})
auto_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,1.0,0.0,0,8,25


### teste

In [11]:
gb_auto.predict(X_auto_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
gb_auto.feature_importances_

array([0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
number = 0

In [14]:
auto_explainer.explain(X_auto_test[number])

[x0 == 51]

In [15]:
gb_auto.decision_function([X_auto_test[number]])

array([1.21982441])

In [16]:
print(auto_explainer.T_constraints)

And(And(1 <= x0, 51 >= x0),
    And(1 <= x1, 21 >= x1),
    And(0 <= x2, 1 >= x2),
    And(0 <= x3, 1 >= x3),
    And(0 <= x4, 2 >= x4),
    And(1 <= x5, 4 >= x5),
    And(0 <= x6, 2 >= x6),
    And(0 <= x7, 0 >= x7),
    And(94.3 <= x8, 120.9 >= x8),
    And(155.9 <= x9, 208.1 >= x9),
    And(61.8 <= x10, 71.7 >= x10),
    And(47.8 <= x11, 59.8 >= x11),
    And(1909 <= x12, 4066 >= x12),
    And(0 <= x13, 5 >= x13),
    And(0 <= x14, 5 >= x14),
    And(90 <= x15, 326 >= x15),
    And(0 <= x16, 6 >= x16),
    And(2 <= x17, 36 >= x17),
    And(1 <= x18, 35 >= x18),
    And(7 <= x19, 23 >= x19),
    And(1 <= x20, 59 >= x20),
    And(0 <= x21, 23 >= x21),
    And(13 <= x22, 38 >= x22),
    And(16 <= x23, 47 >= x23),
    And(0 <= x24, 186 >= x24))


In [17]:
print(auto_explainer.T_model)

And(And(Implies(And(x8 <= 102.2000007629?),
                o_0_0 == 1.2121212121?),
        Implies(And(x8 > 102.2000007629?),
                o_0_0 == -1.4814814814?)))


In [18]:
print(auto_explainer.D)

And((o_0_0)*0.1 + 1.0986122886? < 0)


In [19]:
print(auto_explainer.I)

[x0 == 51, x1 == 6, x2 == 1, x3 == 0, x4 == 1, x5 == 3, x6 == 1, x7 == 0, x8 == 94.5, x9 == 155.9, x10 == 63.6, x11 == 52, x12 == 1909, x13 == 3, x14 == 2, x15 == 90, x16 == 1, x17 == 7, x18 == 13, x19 == 9.6, x20 == 44, x21 == 15, x22 == 38, x23 == 43, x24 == 186]


In [20]:
print(X_auto_test[number])
print(gb_auto.predict([X_auto_test[number]]))

[5.100e+01 6.000e+00 1.000e+00 0.000e+00 1.000e+00 3.000e+00 1.000e+00
 0.000e+00 9.450e+01 1.559e+02 6.360e+01 5.200e+01 1.909e+03 3.000e+00
 2.000e+00 9.000e+01 1.000e+00 7.000e+00 1.300e+01 9.600e+00 4.400e+01
 1.500e+01 3.800e+01 4.300e+01 1.860e+02]
[0]


In [21]:
gb_auto.predict([[0.800e+01, 1.900e+01, 1.000e+00, 0.000e+00, 1.000e+00, 4.000e+00,
       1.000e+00, 0.000e+00, 9.570e+01, 1.697e+02, 6.360e+01, 5.910e+01,
       2.280e+03, 3.000e+00, 2.000e+00, 9.200e+01, 1.000e+00, 8.000e+00,
       9.000e+00, 9.000e+00, 4.000e+01, 8.000e+00, 3.100e+01, 3.700e+01,
       1.260e+02]])

array([0])

In [22]:
# import re

# def extrair_variaveis(formula_str):
#     # Expressão regular para capturar variáveis que começam com 'x' e são seguidas por números
#     variaveis = re.findall(r'\bx\d+\b', formula_str)
#     # Remover duplicatas convertendo a lista para um set e retornando como uma lista ordenada
#     return sorted(set(variaveis))

# variaveis = extrair_variaveis(auto_explainer.T_model.sexpr())
# print(variaveis)

In [23]:
# # Feature Importance
# feature_importances = gb_auto.feature_importances_
# feature_names = auto_data.drop(columns=['target']).columns

# # Create a DataFrame for easier sorting and printing
# feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# # Sort by importance in ascending order
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# # Print the features and their importances
# feature_importance_df

In [24]:
s = Solver()
s.add(auto_explainer.D)
# s.check()

from z3 import prove

prove(auto_explainer.D)

counterexample
[o_0_0 = 0]


In [25]:
def test_prove(claim, show=False, **keywords):
    """Try to prove the given claim.

    This is a simple function for creating demonstrations.  It tries to prove
    `claim` by showing the negation is unsatisfiable.

    >>> p, q = Bools('p q')
    >>> prove(Not(And(p, q)) == Or(Not(p), Not(q)))
    proved
    """
    # if z3_debug():
    #     _z3_assert(is_bool(claim), "Z3 Boolean expression expected")
    s = Solver()
    s.set(**keywords)
    s.add(Not(claim))
    if show:
        print(s)
    r = s.check()
    if r == unsat:
        print("proved")
    elif r == unknown:
        print("failed to prove")
        print(s.model())
    else:
        print("counterexample")
        print(s.model())

In [26]:
formula = Implies(And(And(auto_explainer.I), auto_explainer.T), auto_explainer.D)


In [27]:
test_prove(formula)

counterexample
[x11 = 52,
 x13 = 3,
 x23 = 43,
 x22 = 38,
 x20 = 44,
 x1 = 6,
 x10 = 63.6,
 x16 = 1,
 x21 = 15,
 x6 = 1,
 x2 = 1,
 o_0_0 = 1.2121212121?,
 x12 = 1909,
 x19 = 9.6,
 x24 = 186,
 x9 = 155.9,
 x4 = 1,
 x5 = 3,
 x17 = 7,
 x8 = 94.5,
 x14 = 2,
 x15 = 90,
 x18 = 13,
 x0 = 51,
 x7 = 0,
 x3 = 0]


In [28]:
prove(auto_explainer.D)

counterexample
[o_0_0 = 0]


## Iris Multiclass

In [29]:
gb_iris = GradientBoostingClassifier(n_estimators=1, max_depth=1)

In [30]:
iris = load_iris()
X_iris, y_iris = iris.data, iris.target

filter_indices = np.where(np.isin(y_iris, [0, 1]))[0]
X_iris = X_iris[filter_indices]
y_iris = y_iris[filter_indices]

X_iris_train, X_iris_test, y_iris_train, y_iris_test = train_test_split(
    X_iris, y_iris, test_size=0.1, random_state=101)

gb_iris.fit(X_iris_train, y_iris_train)

In [31]:
# check_explanation = []
# for i in range(len(X_iris)):
#   if X_iris[i][3] == 1.6:
#     check_explanation.append((i, X_iris[i], y_iris[i], gb_iris.predict([X_iris[i]])))
# check_explanation

In [32]:
iris_explainer = Explainer(gb_iris, X_iris)

In [33]:
iris_explain_sizes = pd.DataFrame(columns=['explain_size'])
iris_count_zeros = 0

for i in range(len(X_iris_test)):
    explain_size = len(iris_explainer.explain(X_iris_test[i], reorder='asc'))
    if explain_size == 0:
        iris_count_zeros += 1
    iris_explain_sizes.loc[len(iris_explain_sizes)] = [explain_size]

iris_explain_stat = pd.DataFrame({
    'mean': [iris_explain_sizes['explain_size'].mean()],
    'std': [iris_explain_sizes['explain_size'].std()],
    'count_zeros': [iris_count_zeros],
    'dataset_features': [iris.data.shape[1]],
})
iris_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,1.0,0.0,0,4


In [34]:
print(iris_explainer.I)

[x0 == 4.5, x1 == 2.3, x2 == 1.3, x3 == 0.3]


In [35]:
print(iris_explainer.T_constraints)

And(And(4.3 <= x0, 7 >= x0),
    And(2 <= x1, 4.4 >= x1),
    And(1 <= x2, 5.1 >= x2),
    And(0.1 <= x3, 1.8 >= x3))


In [36]:
print(iris_explainer.T_model)

And(And(Implies(And(x3 <= 0.75), o_0_0 == -2.0454545454?),
        Implies(And(x3 > 0.75), o_0_0 == 1.9565217391?)))


In [37]:
print(iris_explainer.D)

And((o_0_0)*0.1 + 0.0444517625? < 0)


## Cancer Binary

In [38]:
gb_cancer = GradientBoostingClassifier(n_estimators=2, max_depth=3)

In [39]:
cancer = load_breast_cancer()
X_cancer, y_cancer = cancer.data, cancer.target
X_cancer_train, X_cancer_test, y_cancer_train, y_cancer_test = train_test_split(
    X_cancer, y_cancer, test_size=0.1, random_state=101)

gb_cancer.fit(X_cancer_train, y_cancer_train)

In [40]:
cancer_explainer = Explainer(gb_cancer, X_cancer)

In [41]:
cancer_explain_sizes = pd.DataFrame(columns=['explain_size'])
cancer_count_zeros = 0

for i in range(len(X_cancer)):
    explain_size = len(cancer_explainer.explain(
        X_cancer[i], reorder='asc'))
    if explain_size == 0:
        cancer_count_zeros += 1
    cancer_explain_sizes.loc[len(cancer_explain_sizes)] = [explain_size]

cancer_explain_stat = pd.DataFrame({
    'mean': [cancer_explain_sizes['explain_size'].mean()],
    'std': [cancer_explain_sizes['explain_size'].std()],
    'count_zeros': [cancer_count_zeros],
    'dataset_features': [cancer.data.shape[1]],
})
cancer_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,0.0,0.0,569,30


In [42]:
from sklearn.tree import export_text
for i, tree in enumerate(gb_cancer.estimators_[:, 0]):  # Access the trees for the first class
    tree_text = export_text(tree, feature_names=[f"Feature_{j}" for j in range(X_cancer_train.shape[1])])
    print(f"Tree {i + 1}:\n{tree_text}\n{'-'*40}")

Tree 1:
|--- Feature_23 <= 868.20
|   |--- Feature_27 <= 0.16
|   |   |--- Feature_27 <= 0.13
|   |   |   |--- value: [1.52]
|   |   |--- Feature_27 >  0.13
|   |   |   |--- value: [0.21]
|   |--- Feature_27 >  0.16
|   |   |--- Feature_23 <= 446.20
|   |   |   |--- value: [1.59]
|   |   |--- Feature_23 >  446.20
|   |   |   |--- value: [-2.69]
|--- Feature_23 >  868.20
|   |--- Feature_6 <= 0.07
|   |   |--- Feature_1 <= 18.84
|   |   |   |--- value: [1.59]
|   |   |--- Feature_1 >  18.84
|   |   |   |--- value: [-2.27]
|   |--- Feature_6 >  0.07
|   |   |--- value: [-2.69]

----------------------------------------
Tree 2:
|--- Feature_23 <= 868.20
|   |--- Feature_27 <= 0.16
|   |   |--- Feature_27 <= 0.13
|   |   |   |--- value: [1.43]
|   |   |--- Feature_27 >  0.13
|   |   |   |--- value: [0.19]
|   |--- Feature_27 >  0.16
|   |   |--- Feature_6 <= 0.29
|   |   |   |--- value: [-2.29]
|   |   |--- Feature_6 >  0.29
|   |   |   |--- value: [1.50]
|--- Feature_23 >  868.20
|   |--- 

In [43]:
print(cancer_explainer.T_model)

And(And(Implies(And(x23 <= 868.1999816894?,
                    x27 <= 0.1602999940?,
                    x27 <= 0.1323499977?),
                o_0_0 == 1.5176837510?),
        Implies(And(x23 <= 868.1999816894?,
                    x27 <= 0.1602999940?,
                    x27 > 0.1323499977?),
                o_0_0 == 0.2078689008?),
        Implies(And(x23 <= 868.1999816894?,
                    x27 > 0.1602999940?,
                    x23 <= 446.2000122070?),
                o_0_0 == 1.5900621118?),
        Implies(And(x23 <= 868.1999816894?,
                    x27 > 0.1602999940?,
                    x23 > 446.2000122070?),
                o_0_0 == -2.6947368421?),
        Implies(And(x23 > 868.1999816894?,
                    x6 <= 0.0721400007?,
                    x1 <= 18.8350000381?),
                o_0_0 == 1.5900621118?),
        Implies(And(x23 > 868.1999816894?,
                    x6 <= 0.0721400007?,
                    x1 > 18.8350000381?),
                o_0_0 == 

## Wine Multiclass

In [44]:
gb_wine = GradientBoostingClassifier(n_estimators=100)

In [45]:
wine = load_wine()
X_wine, y_wine = wine.data, wine.target
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(
    X_wine, y_wine, test_size=0.1, random_state=101)

gb_wine.fit(X_wine_train, y_wine_train)

In [46]:
wine_explainer = Explainer(gb_wine, X_wine)

In [47]:
wine_explain_sizes = pd.DataFrame(columns=['explain_size'])
wine_count_zeros = 0

for i in range(len(X_wine_test)):
    explain_size = len(wine_explainer.explain(X_wine_test[i], reorder='asc'))
    if explain_size == 0:
        wine_count_zeros += 1
        print(i)
    wine_explain_sizes.loc[len(wine_explain_sizes)] = [explain_size]

wine_explain_stat = pd.DataFrame({
    'mean': [wine_explain_sizes['explain_size'].mean()],
    'std': [wine_explain_sizes['explain_size'].std()],
    'count_zeros': [wine_count_zeros],
    'dataset_features': [wine.data.shape[1]],
})
wine_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,3.833333,0.785905,0,13


In [48]:
wine_explain_sizes = pd.DataFrame(columns=['explain_size'])
wine_count_zeros = 0

for i in range(len(X_wine_test)):
    explain_size = len(wine_explainer.explain(X_wine_test[i], reorder='desc'))
    if explain_size == 0:
        wine_count_zeros += 1
        print(i)
    wine_explain_sizes.loc[len(wine_explain_sizes)] = [explain_size]

wine_explain_stat = pd.DataFrame({
    'mean': [wine_explain_sizes['explain_size'].mean()],
    'std': [wine_explain_sizes['explain_size'].std()],
    'count_zeros': [wine_count_zeros],
    'dataset_features': [wine.data.shape[1]],
})
wine_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_features
0,4.277778,0.95828,0,13


## Digits multiclass

In [49]:
gb_digits = GradientBoostingClassifier(n_estimators=5)

In [50]:
digits = load_digits()
X_digits, y_digits = digits.data, digits.target
X_digits_train, X_digits_test, y_digits_train, y_digits_test = train_test_split(
    X_digits, y_digits, test_size=0.01, random_state=101)

gb_digits.fit(X_digits_train, y_digits_train)

In [51]:
X_digits.shape, gb_digits.n_classes_

((1797, 64), 10)

In [52]:
digits_explainer = Explainer(gb_digits, X_digits)

In [53]:
digits_explain_sizes = pd.DataFrame(columns=['explain_size'])
digits_count_zeros = 0

for i in range(len(X_digits_test)):
    explain_size = len(digits_explainer.explain(
        X_digits_test[i], reorder='asc'))
    if explain_size == 0:
        digits_count_zeros += 1
        print(i)
    digits_count = i
    digits_explain_sizes.loc[len(digits_explain_sizes)] = [explain_size]

digits_explain_stat = pd.DataFrame({
    'mean': [digits_explain_sizes['explain_size'].mean()],
    'std': [digits_explain_sizes['explain_size'].std()],
    'count_zeros': [digits_count_zeros],
    'dataset_len': [digits_count],
    'dataset_features': [digits.data.shape[1]],
})
digits_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,41.0,15.457722,0,17,64


In [54]:
digits_explain_sizes = pd.DataFrame(columns=['explain_size'])
digits_count_zeros = 0

for i in range(len(X_digits_test)):
    explain_size = len(digits_explainer.explain(
        X_digits_test[i], reorder='desc'))
    if explain_size == 0:
        digits_count_zeros += 1
        print(i)
    digits_count = i
    digits_explain_sizes.loc[len(digits_explain_sizes)] = [explain_size]

digits_explain_stat = pd.DataFrame({
    'mean': [digits_explain_sizes['explain_size'].mean()],
    'std': [digits_explain_sizes['explain_size'].std()],
    'count_zeros': [digits_count_zeros],
    'dataset_len': [digits_count],
    'dataset_features': [digits.data.shape[1]],
})
digits_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,40.944444,15.554353,0,17,64


In [55]:
gb_auto = GradientBoostingClassifier(n_estimators=5)

In [56]:
auto_data = fetch_data('auto')
print(auto_data.shape)
auto_data.head()

(202, 26)


Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,target
0,26,11,1,0,2,2,1,0,93.7,157.3,...,1,4,19,9.4,42,16,31.0,38.0,119,2
1,3,17,1,1,1,3,1,0,99.1,186.6,...,5,25,10,9.0,23,16,19.0,26.0,70,2
2,17,19,1,0,2,0,2,0,98.4,176.2,...,5,30,28,9.3,9,8,24.0,30.0,62,2
3,19,9,1,0,2,0,2,0,96.6,180.3,...,5,22,12,8.3,21,7,16.0,18.0,92,3
4,51,20,1,0,2,0,1,0,94.5,159.3,...,5,13,24,8.5,54,16,24.0,29.0,15,3


In [57]:
set(auto_data['fuel-type'])

{0, 1}

In [58]:
X_auto = auto_data.drop(columns=['target']).values
y_auto = auto_data['target'].values
X_auto_train, X_auto_test, y_auto_train, y_auto_test = train_test_split(
    X_auto, y_auto, test_size=0.02, random_state=101)

gb_auto.fit(X_auto_train, y_auto_train)

In [59]:
gb_auto.score(X_auto_test, y_auto_test)

0.8

In [60]:
gb_auto.n_classes_

5

In [61]:
gb_auto.feature_importances_

array([7.73024595e-02, 9.00376762e-02, 0.00000000e+00, 7.18381415e-03,
       1.11722494e-01, 8.82433051e-03, 0.00000000e+00, 2.70745997e-02,
       1.33797440e-01, 4.50247035e-02, 1.68398915e-01, 1.23665402e-01,
       4.43874905e-02, 0.00000000e+00, 2.43699058e-03, 3.53329588e-05,
       6.70780766e-03, 1.29763651e-02, 2.07054958e-02, 1.94230884e-02,
       5.11527478e-02, 8.16538782e-03, 4.32629535e-06, 2.13342787e-02,
       1.96388528e-02])

In [62]:
auto_explainer = Explainer(gb_auto, X_auto)

In [63]:
import re

def extrair_variaveis(formula_str):
    # Expressão regular para capturar variáveis que começam com 'x' e são seguidas por números
    variaveis = re.findall(r'\bx\d+\b', formula_str)
    # Remover duplicatas convertendo a lista para um set e retornando como uma lista ordenada
    return sorted(set(variaveis))

variaveis = extrair_variaveis(auto_explainer.T_model.sexpr())
print(variaveis)

['x0', 'x1', 'x10', 'x11', 'x12', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19', 'x20', 'x21', 'x22', 'x23', 'x24', 'x3', 'x4', 'x5', 'x7', 'x8', 'x9']


In [64]:
gb_auto.feature_importances_

array([7.73024595e-02, 9.00376762e-02, 0.00000000e+00, 7.18381415e-03,
       1.11722494e-01, 8.82433051e-03, 0.00000000e+00, 2.70745997e-02,
       1.33797440e-01, 4.50247035e-02, 1.68398915e-01, 1.23665402e-01,
       4.43874905e-02, 0.00000000e+00, 2.43699058e-03, 3.53329588e-05,
       6.70780766e-03, 1.29763651e-02, 2.07054958e-02, 1.94230884e-02,
       5.11527478e-02, 8.16538782e-03, 4.32629535e-06, 2.13342787e-02,
       1.96388528e-02])

In [65]:
auto_data.columns

Index(['normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors',
       'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length',
       'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders',
       'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio',
       'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price', 'target'],
      dtype='object')

In [66]:
# Feature Importance
feature_importances = gb_auto.feature_importances_
feature_names = auto_data.drop(columns=['target']).columns

# Create a DataFrame for easier sorting and printing
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})

# Sort by importance in ascending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the features and their importances
feature_importance_df

Unnamed: 0,Feature,Importance
10,width,0.168399
8,wheel-base,0.133797
11,height,0.123665
4,num-of-doors,0.111722
1,make,0.090038
0,normalized-losses,0.077302
20,horsepower,0.051153
9,length,0.045025
12,curb-weight,0.044387
7,engine-location,0.027075


In [67]:
X_auto_test[0]

array([3.800e+01, 1.900e+01, 1.000e+00, 0.000e+00, 1.000e+00, 4.000e+00,
       1.000e+00, 0.000e+00, 9.570e+01, 1.697e+02, 6.360e+01, 5.910e+01,
       2.280e+03, 3.000e+00, 2.000e+00, 9.200e+01, 1.000e+00, 8.000e+00,
       9.000e+00, 9.000e+00, 4.000e+01, 8.000e+00, 3.100e+01, 3.700e+01,
       1.260e+02])

In [68]:
gb_auto.predict([X_auto_test[1]])

array([1])

In [69]:
gb_auto.predict([[3.800e+01, 1.900e+01, 1.000e+00, 0.000e+00, 1.000e+00, 4.000e+00,
       1.000e+00, 0.000e+00, 9.570e+01, 1.697e+02, 6.360e+01, 5.910e+01,
       2.280e+03, 3.000e+00, 2.000e+00, 9.200e+01, 1.000e+00, 8.000e+00,
       9.000e+00, 9.000e+00, 4.000e+01, 8.000e+00, 3.100e+01, 3.700e+01,
       1.260e+02]])

array([0])

In [70]:
auto_explainer.explain(X_auto_test[0], reorder='none')

[x0 == 38,
 x1 == 19,
 x2 == 1,
 x3 == 0,
 x4 == 1,
 x5 == 4,
 x6 == 1,
 x7 == 0,
 x8 == 95.7,
 x9 == 169.7,
 x10 == 63.6,
 x11 == 59.1,
 x12 == 2280,
 x13 == 3,
 x14 == 2,
 x15 == 92,
 x16 == 1,
 x17 == 8,
 x18 == 9,
 x19 == 9,
 x20 == 40,
 x21 == 8,
 x22 == 31,
 x23 == 37,
 x24 == 126]

In [71]:
print(auto_explainer.T_constraints)

And(And(0 <= x0, 51 >= x0),
    And(0 <= x1, 21 >= x1),
    And(0 <= x2, 1 >= x2),
    And(0 <= x3, 1 >= x3),
    And(0 <= x4, 2 >= x4),
    And(0 <= x5, 4 >= x5),
    And(0 <= x6, 2 >= x6),
    And(0 <= x7, 1 >= x7),
    And(86.6 <= x8, 120.9 >= x8),
    And(141.1 <= x9, 208.1 >= x9),
    And(60.3 <= x10, 72.3 >= x10),
    And(47.8 <= x11, 59.8 >= x11),
    And(1488 <= x12, 4066 >= x12),
    And(0 <= x13, 6 >= x13),
    And(0 <= x14, 6 >= x14),
    And(61 <= x15, 326 >= x15),
    And(0 <= x16, 7 >= x16),
    And(0 <= x17, 38 >= x17),
    And(0 <= x18, 36 >= x18),
    And(7 <= x19, 23 >= x19),
    And(0 <= x20, 59 >= x20),
    And(0 <= x21, 23 >= x21),
    And(13 <= x22, 49 >= x22),
    And(16 <= x23, 54 >= x23),
    And(0 <= x24, 186 >= x24))


In [72]:
s = Solver()
s.add(auto_explainer.D)

In [73]:
s.check()

In [74]:
auto_explainer.D

In [75]:
auto_explain_sizes = pd.DataFrame(columns=['explain_size'])
auto_count_zeros = 0

for i in range(len(X_auto_test)):
    explain_size = len(auto_explainer.explain(X_auto_test[i], reorder='asc'))
    if explain_size == 0:
        auto_count_zeros += 1
        print(i)
    auto_count = i
    auto_explain_sizes.loc[len(auto_explain_sizes)] = [explain_size]

auto_explain_stat = pd.DataFrame({
    'mean': [auto_explain_sizes['explain_size'].mean()],
    'std': [auto_explain_sizes['explain_size'].std()],
    'count_zeros': [auto_count_zeros],
    'dataset_len': [auto_count],
    'dataset_features': [X_auto.shape[1]],
})
auto_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,22.0,0.0,0,4,25


In [76]:
auto_explain_sizes = pd.DataFrame(columns=['explain_size'])
auto_count_zeros = 0

for i in range(len(X_auto_test)):
    explain_size = len(auto_explainer.explain(X_auto_test[i], reorder='desc'))
    if explain_size == 0:
        auto_count_zeros += 1
        print(i)
    auto_count = i
    auto_explain_sizes.loc[len(auto_explain_sizes)] = [explain_size]

auto_explain_stat = pd.DataFrame({
    'mean': [auto_explain_sizes['explain_size'].mean()],
    'std': [auto_explain_sizes['explain_size'].std()],
    'count_zeros': [auto_count_zeros],
    'dataset_len': [auto_count],
    'dataset_features': [X_auto.shape[1]],
})
auto_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,22.0,0.0,0,4,25


## backache (binary)

In [77]:
gb_backache = GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=3, )

In [78]:
backache_data = fetch_data('backache')
backache_data

Unnamed: 0,id,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,col_10,...,col_24,col_25,col_26,col_27,col_28,col_29,col_30,col_31,col_32,target
0,1.0,1,0,26.0,1.52,54.5,75.0,3.35,0,1,...,0,0,0,0,0,0,0,0,0,0
1,2.0,3,0,23.0,1.60,59.1,68.6,2.22,1,2,...,1,0,0,0,0,0,0,0,0,0
2,3.0,2,6,24.0,1.57,73.2,82.7,4.15,0,1,...,1,0,0,0,0,0,0,0,0,0
3,4.0,1,8,22.0,1.52,41.4,47.3,2.81,0,1,...,1,0,0,0,0,0,0,0,0,0
4,5.0,1,0,27.0,1.60,55.5,60.0,3.75,1,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,176.0,1,0,34.0,1.63,50.9,60.5,2.93,0,1,...,0,0,0,0,0,0,0,1,0,0
176,177.0,3,3,26.0,1.63,66.8,84.1,3.10,1,3,...,0,0,0,0,0,0,1,1,0,0
177,178.0,1,7,18.0,1.50,54.1,60.5,3.52,0,0,...,0,0,0,0,0,0,0,1,0,0
178,179.0,3,6,39.0,1.52,82.7,84.1,3.35,1,2,...,0,0,0,0,0,0,0,0,0,0


In [79]:
# backache_data.shape, gb_backache.n_classes_

In [80]:
# gb_backache.feature_importances_

In [81]:
X_backache = backache_data.drop(columns=['target']).values
y_backache = backache_data['target'].values
X_backache_train, X_backache_test, y_backache_train, y_backache_test = train_test_split(
    X_backache, y_backache, test_size=0.1, random_state=101)

gb_backache.fit(X_backache_train, y_backache_train)

In [82]:
gb_backache.score(X_backache_test, y_backache_test)

0.8333333333333334

In [83]:
backache_explainer = Explainer(gb_backache, X_backache)

In [84]:
backache_explain_sizes = pd.DataFrame(columns=['explain_size'])
backache_count_zeros = 0

for i in range(len(X_backache_test)):
    explain_size = len(backache_explainer.explain(X_backache_test[i], reorder='asc'))
    if explain_size == 0:
        backache_count_zeros += 1
        print(i)
    backache_count = i
    backache_explain_sizes.loc[len(backache_explain_sizes)] = [explain_size]

backache_explain_stat = pd.DataFrame({
    'mean': [backache_explain_sizes['explain_size'].mean()],
    'std': [backache_explain_sizes['explain_size'].std()],
    'count_zeros': [backache_count_zeros],
    'dataset_len': [backache_count],
    'dataset_features': [X_backache.shape[1]],
})
backache_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,6.722222,2.886185,0,17,32


In [85]:
backache_explain_sizes = pd.DataFrame(columns=['explain_size'])
backache_count_zeros = 0

for i in range(len(X_backache_test)):
    explain_size = len(backache_explainer.explain(X_backache_test[i], reorder='desc'))
    if explain_size == 0:
        backache_count_zeros += 1
        print(i)
    backache_count = i
    backache_explain_sizes.loc[len(backache_explain_sizes)] = [explain_size]

backache_explain_stat = pd.DataFrame({
    'mean': [backache_explain_sizes['explain_size'].mean()],
    'std': [backache_explain_sizes['explain_size'].std()],
    'count_zeros': [backache_count_zeros],
    'dataset_len': [backache_count],
    'dataset_features': [X_backache.shape[1]],
})
backache_explain_stat

Unnamed: 0,mean,std,count_zeros,dataset_len,dataset_features
0,6.388889,2.973192,0,17,32
