In [1]:
from pulp import *
from pulp import LpProblem, LpVariable, LpMinimize, LpInteger, lpSum, value, LpBinary,LpStatusOptimal
import pulp
import numpy as np
import pandas as pd
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore", message="Overwriting previously set objective.")
import utility
import docplex.mp.model
import docplex
import docplex_explainer
import mymetrics

In [2]:
# Load Dataset
# The file is space-separated; column names are supplied explicitly here,
# with the class label stored in the last column ('target').
dataset_name = 'Column'
column_names = [
    'pelvic_incidence',
    'pelvic_tilt',
    'lumbar_lordosis_angle',
    'sacral_slope',
    'pelvic_radius',
    'degree_spondylolisthesis',
    'target',
]
df = pd.read_csv('./datasets/column_2C.dat', sep=" ", names=column_names)
# Encode the label as an integer: 'AB' becomes 1, every other value becomes 0.
df['target'] = np.where(df['target'] == 'AB', 1, 0)

In [3]:
# Scale
# Only the feature columns (everything except the last column, 'target') are scaled.
# NOTE(review): the scaler is fitted on every row of df, i.e. before the
# train/test split performed later in this notebook - confirm this is intended.
feature_values = df.values[:, :-1]
scaler = MinMaxScaler().fit(feature_values)
scaled_df = scaler.transform(feature_values)

In [4]:
# Get scaled bounds
# Each bound is a single scalar taken over the whole scaled array
# (all rows and all features together), not one value per feature.
lower_bound, upper_bound = scaled_df.min(), scaled_df.max()
print(lower_bound, upper_bound)

0.0 1.0000000000000002


In [5]:
# Check if binary targets
# Rebuild a DataFrame from the scaled features, reusing the original feature column names.
feature_columns = df.columns[:-1]
df_scaled = pd.DataFrame(scaled_df, columns=feature_columns)

# The raw label column goes through utility.check_targets_0_1 and the result is cast to int32.
raw_targets = df.values[:, -1]
targets = utility.check_targets_0_1(raw_targets).astype(np.int32)
df_scaled['target'] = targets

Original Targets:  [0. 1.] 
Desired Targets: [0,1]
Is original the desired [0, 1]?  True


In [6]:
# Train model
# Stratified split holding out 30% of the rows for testing, with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    scaled_df,
    targets,
    test_size=0.3,
    random_state=50,
    stratify=targets,
)
# Full data re-assembled in train-then-test row order.
X = np.concatenate((X_train, X_test), axis=0)
y = np.concatenate((y_train, y_test), axis=0)

# Linear-kernel SVM, fitted on the training portion only.
clf = svm.SVC(kernel='linear')
clf.fit(X_train, y_train)

# Score the fitted model on the held-out test rows.
y_pred = clf.predict(X_test)
accuracy_linear = metrics.accuracy_score(y_test, y_pred)
print("Accuracy Linear:", accuracy_linear)

Accuracy Linear: 0.8064516129032258


In [7]:
# Finding patterns classified as positive/negative
# utility.find_indexes is called on the test rows with threshold=0; its result
# is unpacked as (positive indexes, negative indexes).
indexes_by_class = utility.find_indexes(clf, X_test, threshold=0)
positive_indexes, negative_indexes = indexes_by_class
print(f"Positive patterns = {len(positive_indexes)},\nNegative patterns = {len(negative_indexes)}")

Positive patterns = 73,
Negative patterns = 20


In [8]:
# Make a dataframe with the test data. For comparing Onestep against Twostep.
# Columns are the original df column names; each row is one test instance's
# scaled features followed by its true class label.
test_df_names = list(df.columns)
if 'target' not in test_df_names:
    test_df_names.append('target')
# Attach the int32-cast labels as the last column in one vectorized step instead
# of appending to a Python list row by row. The result is a 2-D array even when
# X_test has no rows.
test_dataset = np.column_stack((X_test, y_test.astype('int32')))
test_dataset_df = pd.DataFrame(test_dataset, columns=test_df_names)

In [9]:
# Parameter p value (passed to the Twostep explainer as `p` further down)
p = 0.75

#Variables for results
# Every name below is bound to its own, independent empty list.
# Twostep: elapsed times, range sums, coverage counts, then the explanations
# collected for positively / negatively classified patterns.
times_twostep, rsum_twostep, coverage_twostep = [], [], []
pos_exp_twostep, neg_exp_twostep = [], []

# Onestep: same layout as the Twostep accumulators.
times_onestep, rsum_onestep, coverage_onestep = [], [], []
pos_exp_onestep, neg_exp_onestep = [], []


#Generate explanations for the patterns classified as negative first and then
#for the patterns classified as positive. Both groups go through exactly the
#same steps; only the `positive` flag and the lists that receive the
#explanations differ, so a single loop body serves both. The shared result
#lists (times, range sums, coverage) therefore hold the negative patterns'
#entries followed by the positive patterns' entries.
explanation_runs = (
    (negative_indexes, False, neg_exp_twostep, neg_exp_onestep),
    (positive_indexes, True, pos_exp_twostep, pos_exp_onestep),
)
for pattern_indexes, is_positive, twostep_explanations, onestep_explanations in explanation_runs:
    for idx in pattern_indexes:
        # Keyword arguments common to both explainers. They are assembled
        # before the timers start so only the explainer call is measured.
        shared_args = dict(
            classifier = clf,
            dual_coef = clf.dual_coef_,
            support_vectors = clf.support_vectors_,
            intercept = clf.intercept_,
            lower_bound = lower_bound,
            upper_bound = upper_bound,
            data = X_test[idx],
            positive = is_positive,
        )

        #Twostep (the only explainer that is given p)
        start = time.perf_counter()
        exp_ = docplex_explainer.twostep(p = p, **shared_args)
        end = time.perf_counter()
        times_twostep.append(end - start)
        twostep_explanations.append(exp_)
        rsum_twostep.append(mymetrics.range_sum(exp_))
        # Coverage is stored as the length of what calculate_coverage returns.
        coverage_twostep.append(len(mymetrics.calculate_coverage(test_dataset_df, exp_)))

        #Onestep
        start = time.perf_counter()
        exp = docplex_explainer.onestep(**shared_args)
        end = time.perf_counter()
        times_onestep.append(end - start)
        onestep_explanations.append(exp)
        rsum_onestep.append(mymetrics.range_sum(exp))
        coverage_onestep.append(len(mymetrics.calculate_coverage(test_dataset_df, exp)))

#Check number of expanded features ranges
def _expanded_feature_counts(explanations, pattern_indexes):
    """Count, for each explanation, how many entries of its frequency row equal 1.

    Calls utility.detail_exp on the explanations together with the matching
    X_test rows (printing options switched off) and returns one plain int per
    row of the resulting frame.
    """
    frequency = utility.detail_exp(explanations = explanations, patterns = X_test[pattern_indexes],
                                   number_of_features = len(X_test[0]), show_explanation = False,
                                   show_frequency = False, low_val = lower_bound, upp_val = upper_bound)
    return [int(np.count_nonzero(row == 1)) for row in frequency.to_numpy()]

# Twostep: negative patterns first, then positive ones (same order as the
# other per-pattern result lists).
neg_sizes_twostep = _expanded_feature_counts(neg_exp_twostep, negative_indexes)
pos_sizes_twostep = _expanded_feature_counts(pos_exp_twostep, positive_indexes)
feature_sizes_twostep = np.asarray(neg_sizes_twostep + pos_sizes_twostep)

# Onestep: same procedure and ordering.
neg_sizes_onestep = _expanded_feature_counts(neg_exp_onestep, negative_indexes)
pos_sizes_onestep = _expanded_feature_counts(pos_exp_onestep, positive_indexes)
feature_sizes_onestep = np.asarray(neg_sizes_onestep + pos_sizes_onestep)

#Calculate mean and standard deviation
def _mean_and_std(values):
    """Return (sum(values) / len(values), np.std(values)) for one metric."""
    return sum(values) / len(values), np.std(values)

# Twostep aggregates
time_mean_twostep, time_std_twostep = _mean_and_std(times_twostep)
sizes_mean_twostep, sizes_std_twostep = _mean_and_std(feature_sizes_twostep)
rsum_mean_twostep, rsum_std_twostep = _mean_and_std(rsum_twostep)
coverage_mean_twostep, coverage_std_twostep = _mean_and_std(coverage_twostep)

# Onestep aggregates
time_mean_onestep, time_std_onestep = _mean_and_std(times_onestep)
sizes_mean_onestep, sizes_std_onestep = _mean_and_std(feature_sizes_onestep)
rsum_mean_onestep, rsum_std_onestep = _mean_and_std(rsum_onestep)
coverage_mean_onestep, coverage_std_onestep = _mean_and_std(coverage_onestep)

#Make a dataframe with the results.
# One row per metric, Onestep mean/std followed by Twostep mean/std. The frame
# is built in a single call from the full list of rows rather than grown one
# row at a time, so the numeric columns keep a float dtype instead of object.
all_metrics_names = ['Metric','ONESTEP_MEAN','ONESTEP_STD','TWOSTEP_MEAN','TWOSTEP_STD']

metric_rows = [
    ['Time', time_mean_onestep, time_std_onestep, time_mean_twostep, time_std_twostep],
    ['Size', sizes_mean_onestep, sizes_std_onestep, sizes_mean_twostep, sizes_std_twostep],
    ['Ranges_Sum', rsum_mean_onestep, rsum_std_onestep, rsum_mean_twostep, rsum_std_twostep],
    ['Coverage', coverage_mean_onestep, coverage_std_onestep, coverage_mean_twostep, coverage_std_twostep],
]
all_metrics_mean_df = pd.DataFrame(metric_rows, columns=all_metrics_names)

display(all_metrics_mean_df)

Unnamed: 0,Metric,ONESTEP_MEAN,ONESTEP_STD,TWOSTEP_MEAN,TWOSTEP_STD
0,Time,0.026677,0.003068,0.037751,0.009541
1,Size,3.774194,1.09867,3.774194,1.09867
2,Ranges_Sum,4.078253,0.945892,4.082009,0.956222
3,Coverage,3.666667,3.260494,4.107527,3.626255
