In [1]:
import os
import docplex
import pandas as pd
import tensorflow as tf
import numpy as np
import utility
import copy
import mlp_explainer
import mymetrics
import time
import pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from milp import codify_network
from teste import get_minimal_explanation
from sklearn.metrics import classification_report
import joblib
import re

In [2]:
# Seed NumPy's legacy global random generator so that NumPy-driven randomness
# in the cells below is repeatable from one kernel run to the next.
np.random.seed(50)
# Datasets that come from scikit-learn's bundled loaders.
_SKLEARN_DATASETS = ('Iris', 'Breast_Cancer', 'Wine')

# Datasets read from ./datasets/<file>; the last column of each file is the target.
_CSV_DATASETS = {
    'Banknote': 'banknote_authentication.csv',
    'Blood_Transfusion': 'blood_transfusion.csv',
    'Climate': 'climate_model_simulation_crashes.csv',
    'Column': 'column_2C.dat',
    'Glass': 'glass.csv',
    'Ionosphere': 'Ionosphere.csv',
    'Modeling': 'User_Knowledge_Modeling.csv',
    'Parkinson': 'parkinsons.csv',
    'Pima': 'diabetes.csv',
    'Sonar': 'sonar.csv',
}

# Datasets for which the global min/max of the scaled features is printed.
_RANGE_REPORTED = ('Ionosphere', 'Parkinson', 'Wine')

# CSV datasets whose last column is passed through utility.check_targets_0_1.
_CHECKED_TARGETS = ('Blood_Transfusion', 'Column', 'Ionosphere', 'Parkinson', 'Pima')

# CSV datasets whose labels are re-encoded as 0..k-1 following their sorted order.
_RELABELLED_TARGETS = ('Glass', 'Modeling')

# Column names for column_2C.dat, which is read without a header row.
_COLUMN_2C_NAMES = ['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle', 'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis','target']


def _minmax_scale(raw_features, report_range=False):
    """Fit a MinMaxScaler on all rows of `raw_features` and return the scaled array.

    When `report_range` is true, the overall minimum and maximum of the scaled
    array are printed.
    """
    scaler = MinMaxScaler()
    scaler.fit(raw_features)
    scaled = scaler.transform(raw_features)
    if report_range:
        print(scaled.min(), scaled.max())
    return scaled


def _stratified_split(scaled_features, targets, feature_columns, test_size, random_state):
    """Split scaled features/targets with stratification and build the return tuple of load_data."""
    # The frame is only used to obtain the column index (features followed by 'target').
    df_scaled = pd.DataFrame(scaled_features, columns=feature_columns)
    df_scaled['target'] = targets
    columns = df_scaled.columns
    X_train, X_test, y_train, y_test = train_test_split(
        scaled_features, targets,
        test_size=test_size, random_state=random_state, stratify=targets,
    )
    return X_train, X_test, y_train, y_test, columns, np.unique(targets)


def load_data(dataset_name, test_size=0.75, random_state=50):
    """Load one benchmark dataset, min-max scale its features and split it.

    The scaler is fitted on the complete dataset (before splitting) and the
    split is stratified on the targets.

    Parameters
    ----------
    dataset_name : str
        One of 'Iris', 'Breast_Cancer', 'Wine' (scikit-learn loaders) or
        'Banknote', 'Blood_Transfusion', 'Climate', 'Column', 'Glass',
        'Ionosphere', 'Modeling', 'Parkinson', 'Pima', 'Sonar' (files under
        ./datasets/).
    test_size : float or int, default 0.75
        Forwarded to ``train_test_split``.
    random_state : int, default 50
        Forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, columns, classes)`` where
        ``columns`` is the pandas Index of feature names followed by
        ``'target'`` and ``classes`` is ``np.unique(targets)``.

    Raises
    ------
    ValueError
        If `dataset_name` is not a known dataset. (Previously a message was
        printed and ``None`` returned, which made callers fail later with an
        unrelated unpacking error.)
    """
    report_range = dataset_name in _RANGE_REPORTED

    if dataset_name in _SKLEARN_DATASETS:
        loader = {
            'Iris': datasets.load_iris,
            'Breast_Cancer': datasets.load_breast_cancer,
            'Wine': datasets.load_wine,
        }[dataset_name]
        dataset = loader()
        feature_columns = dataset.feature_names
        scaled = _minmax_scale(dataset.data, report_range)
        if dataset_name == 'Breast_Cancer':
            # Rows sharing the first row's label become 0, all others 1.
            targets = (utility.check_targets_0_1(np.where(dataset.target == dataset.target[0],0,1))).astype(np.int32)
        else:
            targets = dataset.target

    elif dataset_name in _CSV_DATASETS:
        path = f'./datasets/{_CSV_DATASETS[dataset_name]}'
        if dataset_name == 'Column':
            df = pd.read_csv(path, sep=" ", names=_COLUMN_2C_NAMES)
            df['target'] = np.where(df['target']=='AB',1,0)
        else:
            df = pd.read_csv(path)

        if dataset_name in _RELABELLED_TARGETS:
            # Re-encode the labels as consecutive integers in sorted order.
            unique_labels = sorted(df['target'].unique())
            label_map = {original: new for new, original in enumerate(unique_labels)}
            df['target'] = df['target'].map(label_map)

        feature_columns = df.columns[:-1]
        scaled = _minmax_scale(df.values[:, :-1], report_range)
        if dataset_name in _CHECKED_TARGETS:
            targets = (utility.check_targets_0_1(df.values[:,-1])).astype(np.int32)
        else:
            targets = df['target'].values

    else:
        known = ', '.join(list(_SKLEARN_DATASETS) + list(_CSV_DATASETS))
        raise ValueError(
            f"Incorrect dataset name: {dataset_name!r}. Expected one of: {known}"
        )

    return _stratified_split(scaled, targets, feature_columns, test_size, random_state)
        
def _resolve_feature_index(text, feature_names, ignore_spaces=False):
    """Return the index of the feature that `text` refers to, or None.

    An exact name match wins. Otherwise the longest feature name contained in
    `text` is chosen, so a short name that is a substring of a longer one
    (e.g. 'ash' inside 'alcalinity_of_ash') cannot take the match.
    """
    target = text.replace(' ', '') if ignore_spaces else text.strip()
    best_idx = None
    best_len = 0
    for idx, feature in enumerate(feature_names):
        name = feature.replace(' ', '') if ignore_spaces else feature
        if name == target:
            return idx
        if name in target and len(name) > best_len:
            best_idx, best_len = idx, len(name)
    return best_idx


def parse_explanation(explanation, feature_names, epsilon=1e-6):
    """Convert textual anchor conditions into per-feature ``[lower, upper]`` bounds.

    Every feature starts at ``[0, 1]``. Two condition shapes are understood:

    * ``"<num> <|<= <feature> <|<= <num>"`` (double inequality)
    * ``"<feature> <|<=|>|>= <num>"`` (single operator)

    Strict inequalities are tightened by `epsilon`. If several conditions name
    the same feature, the last one wins.

    Parameters
    ----------
    explanation : iterable of str
        The anchor conditions.
    feature_names : sequence of str
        Feature names; the position of a name is the row of its bounds.
    epsilon : float, default 1e-6
        Margin used to turn a strict bound into a closed one.

    Returns
    -------
    numpy.ndarray
        Array with one ``[lower, upper]`` row per feature.
    """
    feature_names = list(feature_names)
    bounds = [[0, 1] for _ in feature_names]

    for condition in explanation:
        compact = condition.replace(' ', '')  # for regex matching
        # '<=' must be tried before '<'; otherwise '<' matches first and the
        # '=' is absorbed into the feature token, making the bound look strict.
        match = re.match(r'(\d+\.?\d*)\s*(<=|<)\s*([^\s<>]+)\s*(<=|<)\s*(\d+\.?\d*)', compact)

        if match:
            low_text, low_op, feature_token, high_op, high_text = match.groups()
            lower_bound = float(low_text) if low_op == '<=' else float(low_text) + epsilon
            upper_bound = float(high_text) if high_op == '<=' else float(high_text) - epsilon

            idx = _resolve_feature_index(feature_token, feature_names, ignore_spaces=True)
            if idx is None:
                print(f"Could not match any feature name in condition: '{condition}'")
            else:
                bounds[idx] = [lower_bound, upper_bound]
            continue  # go to next condition

        # Single-operator form: the feature name is expected left of the operator.
        op_match = re.search(r'<=|>=|<|>', condition)
        left_side = condition[:op_match.start()] if op_match else condition
        idx = _resolve_feature_index(left_side, feature_names)
        if idx is None:
            # Fall back to searching the whole condition text.
            idx = _resolve_feature_index(condition, feature_names)
        if idx is None:
            print(f"Could not match any feature name in condition: '{condition}'")
            continue

        # The value is the first token after the operator that parses as a float.
        value = None
        if op_match:
            tail = re.sub(r'<=|>=|<|>', ' ', condition[op_match.end():])
            for token in tail.split():
                try:
                    value = float(token)
                    break
                except ValueError:
                    continue

        if value is None:
            print(f"Could not extract numeric value from condition: '{condition}'")
            continue

        operator = op_match.group()
        if operator == '>':
            bounds[idx] = [value + epsilon, 1]
        elif operator == '>=':
            bounds[idx] = [value, 1]
        elif operator == '<':
            bounds[idx] = [0, value - epsilon]
        else:  # '<='
            bounds[idx] = [0, value]

    return np.array(bounds)

In [3]:
from alibi.explainers import AnchorTabular

def run_anchors(dataset_name, epsilon = 1e-6, verbose=0):
    """Explain every stored test instance of one dataset with AnchorTabular and save the metrics.

    Steps performed:

    1. Load the Keras model ``new_models/{dataset_name}.h5`` and the test
       instances ``{dataset_name}_results/{dataset_name}_X_test.csv`` (all
       columns but the last are used as features).
    2. Fit an ``AnchorTabular`` explainer on the training split returned by
       ``load_data`` using quartile discretisation.
    3. For each test row, compute an anchor with ``threshold=1``, convert it to
       feature bounds with ``parse_explanation`` and record the explanation
       time, anchor size, ``mymetrics.range_sum`` of the bounds, the number of
       test rows covered by the bounds, and how many covered rows have a
       ``target`` different from the prediction for the explained row.
    4. Write ``./Anchors_results/{dataset_name}_results.csv`` and
       ``./Anchors_results/{dataset_name}_explanations.csv`` and save the
       explainer to ``Anchors_explainers/{dataset_name}_model``.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier understood by ``load_data``.
    epsilon : float, default 1e-6
        Forwarded to ``parse_explanation`` for strict inequalities.
    verbose : int, default 0
        When truthy, print per-instance details.

    Returns
    -------
    None
    """

    def predict_fn(x):
        # `clf` is bound below, before the explainer ever calls this function.
        return clf.predict(x, verbose=0)

    results_dir = './Anchors_results'
    # Create the output folder up front: the CSVs are written only after the
    # whole explanation loop, so a missing directory would discard all of it.
    os.makedirs(results_dir, exist_ok=True)

    print(dataset_name)
    clf = tf.keras.models.load_model(f'new_models/{dataset_name}.h5', compile=False)
    print(f'Loaded model')
    test_dataset_df = pd.read_csv(f'{dataset_name}_results/{dataset_name}_X_test.csv')
    X_train, X_test, y_train, y_test,feature_names, class_names = load_data(dataset_name)
    if 'target' in feature_names:
        feature_names = feature_names[feature_names != 'target']
    print(f'feature names:\n {feature_names}')
    print(f'class names:\n {class_names}')
    explainer = AnchorTabular(predict_fn, feature_names)
    explainer.fit(X_train, disc_perc=(25, 50, 75))

    coverages = []
    errors = []
    times = []
    rsums = []
    sizes = []
    explanations = []
    all_bounds = []

    # Hoisted: the frame is not reassigned inside the loop.
    test_values = test_dataset_df.values
    for idx in range(len(test_dataset_df)):
        instance = test_values[idx, :-1]
        prediction = class_names[explainer.predictor(instance.reshape(1, -1))[0]]

        # Only the explain() call itself is timed.
        start = time.perf_counter()
        explanation = explainer.explain(instance, threshold=1)
        end =  time.perf_counter()
        times.append(end - start)

        # One count per (feature, anchor condition) pair in which the feature
        # name appears as a whole operand of the condition.
        size = sum(
            1
            for feature_name in feature_names
            for anchor in explanation.anchor
            if feature_name in re.split(r' <= | >= | < | > ', anchor)
        )

        bounds = parse_explanation(explanation.anchor, feature_names, epsilon)
        rsum = mymetrics.range_sum(bounds)

        coverage_df = mymetrics.calculate_coverage(test_dataset_df, bounds)
        my_coverage = len(coverage_df)
        mispredicted_df = coverage_df[coverage_df['target'] != prediction]
        error = len(mispredicted_df)

        # If the explained instance is not covered by its own bounds, it is
        # still counted once towards the coverage.
        original_instance_df = pd.DataFrame(instance.reshape(1, -1), columns=feature_names)
        original_instance_df['target'] = prediction
        instance_coverage = len(mymetrics.calculate_coverage(original_instance_df, bounds))
        if instance_coverage == 0:
            my_coverage += 1

        if verbose:
            print('\n\nPrediction: ', prediction)
            print('Anchor: %s' % (' AND '.join(explanation.anchor)))
            print(f'Bounds:\n {bounds}')
            print('Precision: %.2f' % explanation.precision)
            print('Coverage: %.2f' % explanation.coverage)
            print(f'Time: {times[-1]}')
            print(f'My_coverage: {my_coverage}')
            if error > 0:
                print(f'errors: {error}')
                print(f'explanation:\n {bounds}')
                display(mispredicted_df)

        coverages.append(my_coverage)
        errors.append(error)
        rsums.append(rsum)
        sizes.append(size)
        explanations.append(explanation.anchor)
        all_bounds.append(bounds)
    print('END')

    result_df = pd.DataFrame({
        'coverage': coverages,
        'errors': errors,
        'time': times,
        'rsum': rsums,
        'size': sizes,
    })
    result_df.to_csv(f'{results_dir}/{dataset_name}_results.csv',index=False)
    print(f'saved {dataset_name}_results.csv')

    anchors_exp_df = pd.DataFrame()
    anchors_exp_df['Explanation'] = explanations
    anchors_exp_df['Bounds'] = all_bounds
    anchors_exp_df.to_csv(f'{results_dir}/{dataset_name}_explanations.csv',index=False)
    explainer.save(f'Anchors_explainers/{dataset_name}_model')
# Run the Anchors benchmark once per dataset, in this fixed order.
datasets_to_explain = [
    'Iris', 'Wine', 'Column', 'Pima', 'Parkinson', 'Breast_Cancer',
    'Blood_Transfusion', 'Ionosphere', 'Glass', 'Climate', 'Modeling',
    'Banknote', 'Sonar',
]
for dataset_name in datasets_to_explain:
    run_anchors(dataset_name=dataset_name)


Iris
Loaded model
feature names:
 Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)'],
      dtype='object')
class names:
 [0 1 2]
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step 


Could not find an anchor satisfying the 1 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 1 precision constraint. Now returning the best non-eligible result. The desired precision threshold might not be achieved due to the quantile-based discretisation of the numerical features. The resolution of the bins may be too large to find an anchor of required precision. Consider increasing the number of bins in `disc_perc`, but note that for some numerical distribution (e.g. skewed distribution) it may not help.
Could not find an anchor satisfying the 1 precision constraint. Now returning th

END
saved Iris_results.csv


In [9]:
# Show the test rows whose target is class 0.
# NOTE(review): `test_dataset_df` is not assigned at notebook scope in the
# cells visible here (it only exists as a local inside run_anchors) -
# presumably it was left in the kernel by an earlier cell; TODO load it
# explicitly so this cell survives Restart & Run All.
is_class_zero = test_dataset_df['target'] == 0
test_dataset_df.loc[is_class_zero]

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,0.786842,0.185771,0.454545,0.278351,0.282609,0.575862,0.419831,0.245283,0.495268,0.291809,0.455285,0.849817,0.539943,0
1,0.710526,0.150198,0.716578,0.613402,0.336957,0.696552,0.613924,0.301887,0.621451,0.377133,0.577236,0.527473,0.71826,0
2,0.707895,0.136364,0.609626,0.314433,0.413043,0.834483,0.702532,0.113208,0.514196,0.47099,0.333333,0.586081,0.71826,0
3,0.597368,0.193676,0.417112,0.329897,0.26087,0.489655,0.390295,0.264151,0.29653,0.227816,0.439024,0.549451,0.71826,0
4,0.715789,0.195652,0.561497,0.278351,0.206522,0.558621,0.510549,0.301887,0.44164,0.368601,0.544715,0.59707,0.743224,0
5,0.807895,0.280632,0.502674,0.381443,0.380435,0.67931,0.628692,0.169811,0.621451,0.381399,0.626016,0.695971,0.878745,0
6,0.797368,0.278656,0.668449,0.360825,0.554348,0.558621,0.457806,0.339623,0.264984,0.321672,0.471545,0.846154,0.725392,0
7,0.665789,0.1917,0.508021,0.28866,0.51087,0.748276,0.622363,0.396226,0.608833,0.413823,0.382114,0.772894,0.368759,0
8,0.686842,0.466403,0.641711,0.237113,0.5,0.593103,0.567511,0.075472,0.394322,0.325939,0.390244,0.765568,0.404422,0
9,0.5,0.604743,0.68984,0.412371,0.347826,0.493103,0.436709,0.226415,0.495268,0.274744,0.447154,0.824176,0.350927,0
