In [None]:
import pandas as pd
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import rce
from scipy.stats import sem
import Datasets as DS

## User preferences

Here, the user can choose the dataset (`'banknote'`, `'diabetes'`, or `'ionosphere'`), the number of factual instances, the time limit (in seconds), as well as the type of uncertainty set (`'l2'` or `'linf'`) and its parameter `rho`. Furthermore, the models that should be fit to the data can be specified in a dictionary that maps each model type to the list of parameter settings to try.

In [None]:
# --- Experiment configuration -------------------------------------------
dataset_name = 'banknote'   # alternatives: 'diabetes', 'ionosphere'
num_instances = 20          # number of factual instances to explain
time_limit = 1000           # seconds
rho = 0.05
unc_type = 'linf'           # alternative: 'l2'

# Model types to fit, each mapped to the list of settings to try.
clf_dict = dict(
    linear=[0],
    cart=[3, 5, 10],
    rf=[5, 10, 20, 50, 100],
    gbm=[5, 10, 20, 50, 100],
    mlp=[(10,), (10, 10, 10), (50,), (100,)],
)

## Load dataset

In [None]:
# Look up the loader for the selected dataset by name in the Datasets module
# and read the raw table from the data folder.
data = getattr(DS, dataset_name)
df = data('../data/')

# Min-max scale every column of the table (the 'Outcome' column included)
# and rebuild a DataFrame with the original column names.
scaler = MinMaxScaler()
df = pd.DataFrame(data=scaler.fit_transform(df), columns=df.columns)

# Features: every column except the last one; target: the 'Outcome' column.
# NOTE(review): this presumes 'Outcome' is the last column -- confirm for
# every dataset loader.
y = df['Outcome']
X = df.iloc[:, :-1]
df.head()

## Robust counterfactual explanation

In [None]:
# Summary file: one line is appended per (classifier type, parameter) pair.
fnamefull = './results_%s/%s_results_%s.txt' % (dataset_name,dataset_name,unc_type)
# Create the output folder up front so that writing the summary after a
# (potentially very long) batch of solver runs cannot fail with
# FileNotFoundError.
os.makedirs(os.path.dirname(fnamefull), exist_ok=True)

# NOTE(review): rce.generate receives '../experiments/results_<dataset>' while
# rce.find_maxrad receives 'results_<dataset>'. Presumably both resolve to the
# same folder when the notebook is run from the experiments directory --
# confirm.
results_dir = 'results_%s' % dataset_name


def _mean_and_sem(values):
    """Return ``(mean, standard error)`` of ``values``.

    NaN is returned where the statistic is undefined (mean of an empty list,
    standard error of fewer than two values). The formatted summary still
    shows 'nan' in those cases, but without NumPy/SciPy runtime warnings.
    """
    mean = np.mean(values) if len(values) > 0 else float('nan')
    err = sem(values) if len(values) > 1 else float('nan')
    return mean, err


# Per-configuration result lists, keyed by (classifier type, parameter).
# Runs that are accepted feed the first two; early-stopped runs feed the last two.
num_iterations_dict = {(name, setting):[] for name in clf_dict.keys() for setting in clf_dict[name]}
comp_time_dict = {(name, setting):[] for name in clf_dict.keys() for setting in clf_dict[name]}
dist_early_stops = {(name, setting):[] for name in clf_dict.keys() for setting in clf_dict[name]}
early_stops_iter = {(name, setting):[] for name in clf_dict.keys() for setting in clf_dict[name]}


for clf_type in clf_dict.keys():
    for param in clf_dict[clf_type]:

        # Fit the classifier for this configuration on the full data set.
        if clf_type == 'cart':
            clf = DecisionTreeClassifier(max_depth=param).fit(X, y)
        elif clf_type == 'rf':
            clf = RandomForestClassifier(max_depth=3, random_state=0, n_estimators=param).fit(X, y)
        elif clf_type == 'mlp':
            clf = MLPClassifier(hidden_layer_sizes=param, activation='relu', random_state=0, max_iter=10000).fit(X, y)
        elif clf_type == 'gbm':
            clf = GradientBoostingClassifier(n_estimators=param, learning_rate=1.0, max_depth=2, random_state=0).fit(X, y)
        elif clf_type == 'linear':
            clf = LogisticRegression(random_state=param).fit(X, y)
        else:
            # Without this guard an unknown key would silently reuse the
            # classifier fitted for the previous configuration.
            raise ValueError('Unknown classifier type: %r' % (clf_type,))

        # The linear model is run with iterative=False; every other model
        # type uses the iterative procedure.
        it = clf_type != 'linear'

        for i in range(num_instances):
            print(f'######## Iteration number: {i} ########')
            np.random.seed(i)
            # Factual instance: the i-th row of X as a one-row DataFrame.
            u = pd.DataFrame([X.iloc[i, :]])

            final_model, num_iterations, comp_time, x_, solutions_master_dict = rce.generate(clf, X, y, '../experiments/results_%s' % dataset_name, clf_type, 'binary', u, list(u.columns), [], [], {}, [], [], [], rho,
                             unc_type=unc_type, iterative=it, time_limit=time_limit)

            # Class predicted for the factual instance, passed to find_maxrad.
            factual_class = clf.predict(u)[0]

            # dist is presumably the largest radius around x_ for which it
            # remains a valid counterfactual -- confirm against rce.find_maxrad.
            dist = None
            if x_ is not None:
                _, dist = rce.find_maxrad(x_, clf_type, results_dir, x_.columns, [], [], {}, [], [], [], factual_class, unc_type)

            # Early stop: no solution was returned, or its radius falls short
            # of rho by more than the 1% tolerance.
            if x_ is None or dist + rho/100 < rho:
                # Keep the best radius among all intermediate master solutions.
                # A separate loop index is used so the instance index `i` and
                # the returned `x_` are not overwritten.
                best_dist = 0
                for k in range(len(solutions_master_dict)):
                    candidate = solutions_master_dict[k]['sol']
                    _, dist_k = rce.find_maxrad(candidate, clf_type, results_dir, candidate.columns, [], [], {}, [], [], [], factual_class, unc_type)
                    if dist_k >= best_dist:
                        best_dist = dist_k
                print(best_dist)
                dist_early_stops[(clf_type, param)].append(best_dist)
                early_stops_iter[(clf_type, param)].append(num_iterations)
                print('\n\n@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ ERROR @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n\n')

            else:
                num_iterations_dict[(clf_type, param)].append(num_iterations)
                comp_time_dict[(clf_type, param)].append(comp_time)

        # write results to .txt file (once per configuration, after all
        # instances have been processed)
        num_iterations_list = num_iterations_dict[(clf_type, param)]
        comp_time_list = comp_time_dict[(clf_type, param)]
        dist_early_stops_list = dist_early_stops[(clf_type,param)]
        early_stops_iter_list = early_stops_iter[(clf_type, param)]

        comp_time_mean, comp_time_sem = _mean_and_sem(comp_time_list)
        num_iter_mean, num_iter_sem = _mean_and_sem(num_iterations_list)
        early_dist_mean, early_dist_sem = _mean_and_sem(dist_early_stops_list)
        early_iter_mean, early_iter_sem = _mean_and_sem(early_stops_iter_list)

        # Columns: type, parameter, computation time mean (sem), iterations
        # mean (sem), number of early stops, early-stop radius mean (sem),
        # early-stop iterations mean (sem).
        txt = '{0}: \t {1} \t {2:.2f} ({3:.2f}) \t {4:.2f} ({5:.2f}) \t {6} \t {7:.3f} ({8:.3f}) \t {9:.2f} ({10:.2f}) '.format(
            clf_type,
            param,
            comp_time_mean,
            comp_time_sem,
            num_iter_mean,
            num_iter_sem,
            len(dist_early_stops_list),
            early_dist_mean,
            early_dist_sem,
            early_iter_mean,
            early_iter_sem)
        with open(fnamefull, 'a') as f:
            print(txt,file=f)