In [None]:
from pulp import *
from pulp import LpProblem, LpVariable, LpMinimize, LpInteger, lpSum, value, LpBinary,LpStatusOptimal
import pulp
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore", message="Overwriting previously set objective.")
import utility
import docplex.mp.model
import docplex
import docplex_explainer
import mymetrics
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import joblib
import dill
from alibi.explainers import AnchorTabular


In [None]:
# Datasets bundled with scikit-learn: public name -> loader attribute on `sklearn.datasets`.
# Stored as strings so the table can be built without touching the sklearn module.
_SKLEARN_DATASET_LOADERS = {
    'Iris': 'load_iris',
    'Breast_Cancer': 'load_breast_cancer',
    'Wine': 'load_wine',
}

# Datasets read from disk: public name -> (path, extra keyword arguments for pd.read_csv).
# The last column of every file is used as the label column.
_CSV_DATASETS = {
    'Banknote': ('./datasets/banknote_authentication.csv', {}),
    'Blood_Transfusion': ('./datasets/blood_transfusion.csv', {}),
    'Climate': ('./datasets/climate_model_simulation_crashes.csv', {}),
    'Glass': ('./datasets/glass.csv', {}),
    'Ionosphere': ('./datasets/ionosphere.csv', {}),
    'Modeling': ('./datasets/User_Knowledge_Modeling.csv', {}),
    'Parkinsons': ('./datasets/parkinsons.csv', {}),
    'Pima': ('./datasets/diabetes.csv', {}),
    'Sonar': ('./datasets/sonar.csv', {}),
    'Vertebral-Column': (
        './datasets/column_2C.dat',
        {
            'sep': " ",
            'names': ['pelvic_incidence', 'pelvic_tilt', 'lumbar_lordosis_angle', 'sacral_slope',
                      'pelvic_radius', 'degree_spondylolisthesis', 'target'],
        },
    ),
}

# Dataset-specific recoding of the 'target' column into 0/1, applied before scaling.
_TARGET_RECODERS = {
    'Glass': lambda col: col.apply(lambda x: 1 if x in [1, 2, 3] else 0),
    'Modeling': lambda col: col.apply(lambda x: 1 if x == 'Low' else 0),
    'Vertebral-Column': lambda col: np.where(col == 'AB', 1, 0),
}


def _scale_and_split(features, raw_targets, column_names):
    """Min-max scale ``features``, normalise the labels and return the stratified split."""
    scaled = MinMaxScaler().fit_transform(features)
    # NOTE(review): check_targets_0_1 lives in the project-local `utility` module;
    # presumably it maps the labels onto {0, 1} - confirm against its implementation.
    targets = (utility.check_targets_0_1(raw_targets)).astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        scaled, targets, test_size=0.75, random_state=50, stratify=targets
    )
    return X_train, X_test, y_train, y_test, column_names, np.unique(targets)


def load_data(dataset_name):
    """Load a binary-classification dataset, min-max scale it and split it.

    The split is stratified on the labels with ``test_size=0.75`` and
    ``random_state=50``, so only a quarter of the rows end up in the training set.

    Parameters
    ----------
    dataset_name : str
        One of the scikit-learn datasets ('Iris', 'Breast_Cancer', 'Wine') or one
        of the CSV-backed datasets under ``./datasets`` ('Banknote',
        'Blood_Transfusion', 'Climate', 'Glass', 'Ionosphere', 'Modeling',
        'Parkinsons', 'Pima', 'Sonar', 'Vertebral-Column').

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, column_names, class_labels)``.
        ``column_names`` is ``DataFrame.columns.values`` of the raw frame: for the
        scikit-learn datasets it holds the feature names only, while for the CSV
        datasets it also contains the label column as its last entry (callers
        such as ``train_anchors`` strip ``'target'`` themselves).
        ``class_labels`` is ``np.unique`` of the processed labels.

    Raises
    ------
    ValueError
        If ``dataset_name`` is not a known dataset. Previously the function
        printed a message and returned ``None``, which made callers fail later
        with an unrelated unpacking ``TypeError``.
    """
    if dataset_name in _SKLEARN_DATASET_LOADERS:
        bunch = getattr(datasets, _SKLEARN_DATASET_LOADERS[dataset_name])()
        frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        # One-vs-rest binarisation: the class of the first sample becomes 0, all others 1.
        raw_targets = np.where(bunch.target == bunch.target[0], 0, 1)
        return _scale_and_split(bunch.data, raw_targets, frame.columns.values)

    if dataset_name in _CSV_DATASETS:
        path, read_kwargs = _CSV_DATASETS[dataset_name]
        frame = pd.read_csv(path, **read_kwargs)
        recode_target = _TARGET_RECODERS.get(dataset_name)
        if recode_target is not None:
            frame['target'] = recode_target(frame['target'])
        values = frame.values
        # Every column but the last is a feature; the last column is the label.
        return _scale_and_split(values[:, :-1], values[:, -1], frame.columns.values)

    known_names = sorted(list(_SKLEARN_DATASET_LOADERS) + list(_CSV_DATASETS))
    raise ValueError(
        f"Incorrect dataset name: {dataset_name!r}. Expected one of: {known_names}"
    )

def _match_feature(text, candidate_names):
    """Return the index of the longest candidate name contained in ``text``, or None.

    Preferring the longest hit keeps a short name (e.g. ``ash``) from capturing a
    condition that is really about a longer one (``alcalinity_of_ash``).
    """
    best_idx = None
    for idx, name in enumerate(candidate_names):
        if not name or name not in text:
            continue
        if best_idx is None or len(name) > len(candidate_names[best_idx]):
            best_idx = idx
    return best_idx


def _first_inequality(condition):
    """Return ``(operator, value)`` for the first inequality found in ``condition``.

    ``operator`` is None when the text holds no ``<``, ``<=``, ``>`` or ``>=``;
    ``value`` is None when no float-parsable token follows the operator.
    """
    import re  # function-scope import: the notebook's import cell does not import `re`

    operator_match = re.search(r'<=|>=|<|>', condition)
    if operator_match is None:
        return None, None
    # Later operators are not numbers; blank them out so they only act as separators.
    remainder = re.sub(r'<=|>=|<|>', ' ', condition[operator_match.end():])
    for token in remainder.split():
        try:
            return operator_match.group(), float(token)
        except ValueError:
            continue
    return operator_match.group(), None


def parse_explanation(explanation, feature_names, epsilon=1e-6):
    """Convert textual anchor conditions into per-feature ``[lower, upper]`` bounds.

    Every feature starts with the bounds ``[0, 1]``. Two condition shapes are
    understood:

    * double inequalities such as ``0.25 < sepal width (cm) <= 0.42``;
    * single inequalities such as ``petal width (cm) <= 0.04`` (value on the
      right of the operator).

    Strict inequalities are tightened by ``epsilon`` so the returned bounds can be
    used as closed intervals.

    Parameters
    ----------
    explanation : iterable of str, or str
        The conditions to parse. A bare string is treated as a single condition.
    feature_names : sequence
        Feature names; the row order of the result follows this order.
    epsilon : float, optional
        Margin applied to strict inequalities.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(feature_names), 2)`` holding ``[lower, upper]``.

    Notes
    -----
    Each condition updates at most one feature: the one with the longest name
    found in the condition. Conditions that mention no known feature are skipped
    silently; a single-inequality condition without a numeric value is reported
    on stdout.
    """
    import re  # function-scope import: the notebook's import cell does not import `re`

    names = [str(name) for name in feature_names]
    compact_names = [name.replace(' ', '') for name in names]
    bounds = [[0, 1] for _ in names]
    # A lone string would otherwise be iterated character by character.
    conditions = [explanation] if isinstance(explanation, str) else explanation

    # `<=` must be tried before `<`, and `=` must not be part of the feature token;
    # otherwise `a <= f` is read as operator `<` followed by the token `=f`.
    double_inequality = re.compile(
        r'(\d+\.?\d*)\s*(<=|<)\s*([^\s<>=]+)\s*(<=|<)\s*(\d+\.?\d*)'
    )

    for condition in conditions:
        compact_condition = condition.replace(' ', '')  # for regex matching
        match = double_inequality.match(compact_condition)

        if match:
            low_text, low_op, feature_token, high_op, high_text = match.groups()
            feature_idx = _match_feature(feature_token, compact_names)
            if feature_idx is not None:
                lower_bound = float(low_text) if low_op == '<=' else float(low_text) + epsilon
                upper_bound = float(high_text) if high_op == '<=' else float(high_text) - epsilon
                bounds[feature_idx] = [lower_bound, upper_bound]
            continue  # go to next condition

        # Fallback to single operator logic
        feature_idx = _match_feature(condition, names)
        if feature_idx is None:
            continue

        operator, threshold = _first_inequality(condition)
        if threshold is None:
            print(f"Could not extract numeric value from condition: '{condition}'")
            continue

        if operator == '>':
            bounds[feature_idx] = [threshold + epsilon, 1]
        elif operator == '>=':
            bounds[feature_idx] = [threshold, 1]
        elif operator == '<':
            bounds[feature_idx] = [0, threshold - epsilon]
        elif operator == '<=':
            bounds[feature_idx] = [0, threshold]

    return np.array(bounds)
def train_anchors(dataset_name):
    """Build and fit an ``AnchorTabular`` explainer for one dataset.

    The classifier is read from ``models/<dataset_name>_svm_model.pkl`` and the
    training split comes from ``load_data``. A column called ``'target'`` is
    dropped from the feature names before the explainer is created.

    Parameters
    ----------
    dataset_name : str
        Dataset identifier, forwarded to ``load_data`` and used in the model path.

    Returns
    -------
    tuple
        ``(explainer, feature_names, class_names)``.
    """
    print(dataset_name)
    # joblib.load unpickles the file, so it must only be pointed at trusted model files.
    model_path = f'models/{dataset_name}_svm_model.pkl'
    clf = joblib.load(model_path)
    print(f'Loaded model')

    # Only the training features are needed to fit the explainer.
    X_train, _X_test, _y_train, _y_test, feature_names, class_names = load_data(dataset_name)
    if 'target' in feature_names:
        feature_names = feature_names[feature_names != 'target']
    print(f'feature names:\n {feature_names}')
    print(f'class names:\n {class_names}')

    def predict_fn(x):
        # Thin wrapper so the explainer receives a plain callable.
        return clf.predict(x)

    explainer = AnchorTabular(predict_fn, feature_names)
    # disc_perc is forwarded unchanged; alibi documents it as the percentiles
    # used to discretise numerical features.
    explainer.fit(X_train, disc_perc=(25, 50, 75))
    return explainer, feature_names, class_names
    
    
# Summarise a collection of values by its mean and standard deviation
def compute_mean_std(arr):
    """Return ``(mean, std)`` of ``arr`` as given by ``np.mean`` and ``np.std``."""
    mean_value = np.mean(arr)
    std_value = np.std(arr)
    return mean_value, std_value

# Compute relative percentage differences
def relative_percentage_diff(new, old):
    """Return ``(new - old) / old * 100``, with NaN wherever ``old`` is zero.

    Parameters
    ----------
    new, old : scalar, list, tuple or array-like
        Values to compare. Lists and tuples are converted to NumPy arrays so that
        element-wise arithmetic works on them.

    Returns
    -------
    Same kind of object the arithmetic yields when ``old`` contains no zero; a
    NumPy array (0-d for scalar inputs) with NaN at the zero positions
    otherwise. A warning line is printed in the latter case.

    Notes
    -----
    ``np.where`` evaluates both of its branches eagerly, so the division has to
    be made safe explicitly: operands are lifted to arrays (a Python scalar zero
    would otherwise raise ``ZeroDivisionError``) and NumPy's divide/invalid
    warnings are silenced for the positions that get replaced by NaN anyway.
    """
    # Plain sequences do not support element-wise arithmetic.
    if isinstance(new, (list, tuple)):
        new = np.asarray(new)
    if isinstance(old, (list, tuple)):
        old = np.asarray(old)

    if np.any(old == 0):
        print(f'Warning: found possible division by zero')
        new_arr = np.asarray(new)
        old_arr = np.asarray(old)
        with np.errstate(divide='ignore', invalid='ignore'):
            percent = ((new_arr - old_arr) / old_arr) * 100
        return np.where(old_arr != 0, percent, np.nan)
    return ((new - old) / old) * 100

In [None]:
# Names accepted by load_data, in the order the notebook refers to them.
datasets_name = [
    'Iris',
    'Wine',
    'Vertebral-Column',
    'Pima',
    'Parkinsons',
    'Breast_Cancer',
    'Blood_Transfusion',
    'Ionosphere',
    'Glass',
    'Climate',
    'Modeling',
    'Banknote',
    'Sonar',
]

In [None]:
# Read the stored explanations CSV for the first entry of `datasets_name`.
df = pd.read_csv(f'./Anchors_results/{datasets_name[0]}_explanations.csv')
# Bare last expression so the notebook renders the frame.
df

In [None]:
# Load the split for the first entry of `datasets_name`; `columns` receives the
# column-name array and `targets` the unique class labels returned by load_data.
X_train, X_test, y_train, y_test, columns, targets = load_data(datasets_name[0])

In [None]:
# Try the parser on the first stored explanation only: the `[0:1]` slice keeps an
# array (of at most one entry), so that entry is handled as one condition string.
# NOTE(review): assumes each 'Explanation' cell holds a single condition - confirm the CSV format.
parse_explanation(df['Explanation'].values[0:1],columns)

In [None]:
# Display the column names returned by load_data, for comparison with the parsed bounds.
columns