In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
import matplotlib.pyplot as plt
import lime
import lime.lime_tabular
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from numpy import dot, inner
from numpy.linalg import norm

In [None]:
# imports needed by this cell
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

# load scikit-learn's breast cancer dataset and report the feature matrix shape
cancer = load_breast_cancer()
print(cancer.data.shape)

# feature table and target vector
X = pd.DataFrame(cancer.data)
y = cancer.target
# NOTE: this summary is computed but not displayed, because it is not the
# last expression of the cell
X.describe()

# stratified train/test split performed on the raw arrays (default test size)
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data,
    cancer.target,
    stratify=cancer.target,
    random_state=42,
)

In [None]:
# random forest classifier: grid search over tree depth and number of
# features considered at each split, scored with cross-validated ROC AUC
from sklearn.ensemble import RandomForestClassifier

param_grid = {'max_depth': [1, 2, 3, 4, 5, 6, None], 'max_features': [3, 5, 7, 10, None]}

# A fixed random_state (same value as the train/test split) makes every forest
# in the search repeatable, so the reported best parameters do not change from
# one run of the notebook to the next.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
# with scoring='roc_auc' this is the ROC AUC of the refitted best model on the test set
print("Test set score: {:.3f}".format(grid_search.score(X_test, y_test)))

In [None]:
# Refit a random forest with fixed hyper-parameters on the training set and
# report accuracy and ROC AUC.
# random_state is fixed so the fitted forest (and everything derived from it
# further down) is the same on every run.
rfc = RandomForestClassifier(max_depth=6, max_features=5, random_state=42)
np.set_printoptions(precision=3)
rfc.fit(X_train, y_train)

# hard class predictions (kept for any later use of y_pred)
y_pred = rfc.predict(X_test)

# ROC AUC is a ranking metric: it must be computed from a continuous score.
# Feeding it the hard 0/1 predictions collapses the curve to a single
# operating point and is not comparable with the 'roc_auc' grid-search score,
# so the probability of the positive class (column 1) is used instead.
y_score = rfc.predict_proba(X_test)[:, 1]
roc_test = roc_auc_score(y_test, y_score)

print("accuracy on training set is {:.3f}".format(rfc.score(X_train, y_train)))
print("accuracy on test set is {:.3f}".format(rfc.score(X_test, y_test)))
print("roc_auc_score on test set is {:.3f}".format(roc_test))

In [None]:
# gradient boosting: grid search over tree depth, features per split and
# learning rate, scored with cross-validated ROC AUC
from sklearn.ensemble import GradientBoostingClassifier

param_grid = {
    'max_depth': [1, 3, 5, None],
    'max_features': [1, 2, 3, 5, 10, None],
    'learning_rate': [0.03, 0.1, 0.5],
}

# A fixed random_state (same value as the train/test split) makes the feature
# sub-sampling of each candidate repeatable, so the reported best parameters
# do not change from one run of the notebook to the next.
grid_search = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid,
    scoring='roc_auc',
    cv=5,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)
print("Best parameters: {}".format(grid_search.best_params_))
print("Best cross-validation score: {:.3f}".format(grid_search.best_score_))
# with scoring='roc_auc' this is the ROC AUC of the refitted best model on the test set
print("Test set score: {:.3f}".format(grid_search.score(X_test, y_test)))

In [None]:
# let's fit the best parameters on the whole training set
# (the values are hard-coded here, not read from grid_search.best_params_)
# random_state is fixed so the fitted model is the same on every run.
gbr = GradientBoostingClassifier(max_depth=5, max_features=2, learning_rate=0.03, random_state=42)
np.set_printoptions(precision=3)
gbr.fit(X_train, y_train)

# hard class predictions (kept for any later use of y_pred)
y_pred = gbr.predict(X_test)

# ROC AUC needs a continuous score; hard 0/1 predictions collapse the curve to
# a single operating point and are not comparable with the 'roc_auc' score of
# the grid search. Use the probability of the positive class (column 1).
y_score = gbr.predict_proba(X_test)[:, 1]
roc_test = roc_auc_score(y_test, y_score)

print("accuracy on the training set is {:.3f}".format(gbr.score(X_train, y_train)))
print("accuracy on the test set is {:.3f}".format(gbr.score(X_test, y_test)))
print("roc_auc_score on test set is {:.3f}".format(roc_test))

In [None]:
# Gradient boosting with scikit-learn's default hyper-parameters.
# NOTE(review): this rebinds `gbr`, so the model fitted with hand-picked
# parameters in the previous cell is replaced; every later use of `gbr`
# refers to this default-parameter model. Confirm that this is intended.
# random_state is fixed so the fitted model is the same on every run.
gbr = GradientBoostingClassifier(random_state=42)
np.set_printoptions(precision=3)
gbr.fit(X_train, y_train)

# hard class predictions (kept for any later use of y_pred)
y_pred = gbr.predict(X_test)

# ROC AUC needs a continuous score; hard 0/1 predictions collapse the curve to
# a single operating point. Use the probability of the positive class (column 1).
y_score = gbr.predict_proba(X_test)[:, 1]
roc_test = roc_auc_score(y_test, y_score)

print("accuracy on the training set is {:.3f}".format(gbr.score(X_train, y_train)))
print("accuracy on the test set is {:.3f}".format(gbr.score(X_test, y_test)))
print("roc_auc_score on test set is {:.3f}".format(roc_test))

In [None]:
# Build a large synthetic ("virtual") sample: fit a multivariate Gaussian to
# the training features, draw from it, and label the draws with `gbr`.
N_VIRTUAL = 100000  # number of synthetic rows to draw

X_train = pd.DataFrame(X_train)
echmean = X_train.mean(axis=0)   # per-feature mean of the training set
echcov = X_train.cov()           # feature covariance matrix of the training set

# A dedicated, seeded generator makes the synthetic sample (and therefore the
# surrogate tree trained on it) reproducible without touching NumPy's global
# random state.
virt_rng = np.random.RandomState(42)
X_virt = virt_rng.multivariate_normal(echmean, echcov, N_VIRTUAL, check_valid='warn')
d_virt = pd.DataFrame(X_virt, columns=X_train.columns)

# labels assigned to the synthetic rows by the gradient-boosting model
y_virt = gbr.predict(d_virt)

# kept as the last expression so the summary table is actually displayed
d_virt.describe()

In [None]:
from sklearn.model_selection import cross_val_score

# Sweep the depth of the surrogate decision tree (4 to 9) and report the mean
# 5-fold cross-validated ROC AUC on the synthetic sample.
for i in range(4, 10):
    clf = DecisionTreeClassifier(random_state=42, max_depth=i)
    fold_scores = cross_val_score(clf, d_virt, y_virt, scoring='roc_auc', cv=5)
    score = fold_scores.mean()
    print("k=", i, "average cross-validation score: {:.3f}".format(score))


In [None]:
# Surrogate model: a depth-k decision tree trained on the synthetic sample
# (features d_virt, labels y_virt) and evaluated on the real data.
k = 7
# random_state=42 matches the trees used in the cross-validated depth sweep,
# so the tree fitted here is repeatable.
clf = DecisionTreeClassifier(max_depth=k, random_state=42)
clf.fit(d_virt, y_virt)

Acc_appr = clf.score(X_train, y_train)   # accuracy against the true training labels
Acc_test = clf.score(X_test, y_test)     # accuracy against the true test labels

# hard class predictions (kept for any later use of y_pred)
y_pred = clf.predict(X_test)

# ROC AUC needs a continuous score; hard 0/1 predictions collapse the curve to
# a single operating point. Use the probability of the positive class (column 1).
y_score = clf.predict_proba(X_test)[:, 1]
roc_test = roc_auc_score(y_test, y_score)

print('for depth', k)
print("accuracy on training set is {:.3f}".format(Acc_appr))
print("accuracy on test set is {:.3f}".format(Acc_test))
print("roc_auc_score on test set is {:.3f}".format(roc_test))

In [None]:
# auxiliary function

def make_exp_vec(exp, n_features=None):
    """
    Takes a LIME explanation which is a dictionary label -> [(i, w(i)), ...] where i is the feature id
    and w(i) is the weight of the ith feature, and turns it into a dense vector indexed by feature id.
    With the explanation vector we generate, we can compare the explanations of two models
    numerically (e.g. with a cosine similarity).

    The (i, w(i)) pairs are not assumed to be sorted by feature id, and they may cover only a subset
    of the features: the vector is sized from the largest feature id rather than from the number of
    pairs, so a sparse explanation no longer raises an IndexError.

    Arg -
        exp: LIME explanation; only the entry of the first label in the dictionary is used
        n_features: optional length of the returned vector. When omitted, the vector is just long
            enough to hold every feature id present (and never shorter than the number of pairs,
            which keeps the length unchanged for explanations covering every feature).

    Returns -
        v: explanation vector where v[i] = w(i); features absent from the explanation have weight 0

    Raises -
        IndexError: if exp is empty
        ValueError: if n_features is too small for the feature ids found in the explanation
    """
    # Only the explanation of the first label in the dictionary is used.
    pairs = list(exp.values())[0]

    # Smallest vector length able to hold every feature id that appears (0 if there are no pairs).
    needed = max((i for (i, _) in pairs), default=-1) + 1

    if n_features is None:
        size = max(len(pairs), needed)
    elif n_features < needed:
        raise ValueError(
            "n_features={} is too small: the explanation refers to feature id {}".format(
                n_features, needed - 1))
    else:
        size = n_features

    v = np.zeros(size)
    for (i, w) in pairs:
        v[i] = w

    return v

def exp_point(point, data, fn, f_names, c_names, label=None, random_state=None):
    """
    The explanation vector of a point in a certain data set.

    By default the explanation is the one LIME produces for the top label predicted by fn at
    this point. When the vectors of two different models are compared, pass the same `label`
    for both: otherwise, whenever the models disagree on the point, each vector explains a
    different class.

    A new LimeTabularExplainer is built from `data` on every call.

    Arg -
        point: point to explain
        data: data model was trained on
        fn: probability black box
        f_names: names of features
        c_names: names of classes
        label: optional class index to explain; None (default) explains the top label of fn
        random_state: optional seed / RandomState forwarded to the LIME explainer so the
            explanation is reproducible; None (default) keeps LIME's default behaviour

    Returns -
        explanation vector (see make_exp_vec())
    """
    explainer = lime.lime_tabular.LimeTabularExplainer(
        data,
        feature_names=f_names,
        class_names=c_names,
        discretize_continuous=False,
        random_state=random_state,
    )

    # Ask for a weight on every feature so the resulting vector is dense.
    if label is None:
        exp = explainer.explain_instance(point, fn, num_features=len(point), top_labels=1)
    else:
        exp = explainer.explain_instance(point, fn, labels=(label,), num_features=len(point))

    v = make_exp_vec(exp.local_exp)

    return v

In [None]:
# explanation similarity for gbr and clf
# Seed NumPy's global generator so the same 120 points are drawn on every run.
# (LIME is used with its default random_state; presumably it also draws from
# this global generator, which would make the explanations repeatable too - confirm.)
np.random.seed(42)
points_id = np.random.choice(len(cancer['data']), size=120, replace=False)
points = cancer['data'][points_id]

# NOTE(review): exp_point explains each model's own top label (top_labels=1),
# so when the two models disagree on a point the two vectors describe
# different classes and their cosine is not directly comparable.
sim = []
num_undefined = 0
for x in points:

    v1 = exp_point(x, cancer['data'], gbr.predict_proba, cancer.feature_names, cancer.target_names)
    v2 = exp_point(x, cancer['data'], clf.predict_proba, cancer.feature_names, cancer.target_names)

    denom = norm(v1) * norm(v2)
    if denom == 0:
        # The cosine is undefined when an explanation is all zeros. Record it
        # explicitly instead of dividing by zero and silently dropping the NaN.
        num_undefined += 1
        sim.append(np.nan)
        continue

    sim.append(np.inner(v1, v2) / denom)

num_bins = 50
# NaN fails both comparisons, so undefined values are in neither group
sim_plus = [x for x in sim if x >= 0]
num_neg = len([x for x in sim if x < 0])

plt.figure()
plt.title('Similarity Between Explanations')
plt.xlabel('Cosine Similarity')
plt.ylabel('Freq.')
n, bins, patches = plt.hist(sim_plus, num_bins, facecolor='tab:blue', alpha=0.5)
plt.show()
print('number of negatively similar vectors: ', num_neg)
print('number of undefined similarities (all-zero explanation): ', num_undefined)

In [None]:
# explanation similarity for rf and clf
# Seed NumPy's global generator so the same 120 points are drawn on every run.
# (LIME is used with its default random_state; presumably it also draws from
# this global generator, which would make the explanations repeatable too - confirm.)
np.random.seed(42)
points_id = np.random.choice(len(cancer['data']), size=120, replace=False)
points = cancer['data'][points_id]

# NOTE(review): exp_point explains each model's own top label (top_labels=1),
# so when the two models disagree on a point the two vectors describe
# different classes and their cosine is not directly comparable.
sim = []
num_undefined = 0
for x in points:

    v1 = exp_point(x, cancer['data'], rfc.predict_proba, cancer.feature_names, cancer.target_names)
    v2 = exp_point(x, cancer['data'], clf.predict_proba, cancer.feature_names, cancer.target_names)

    denom = norm(v1) * norm(v2)
    if denom == 0:
        # The cosine is undefined when an explanation is all zeros. Record it
        # explicitly instead of dividing by zero and silently dropping the NaN.
        num_undefined += 1
        sim.append(np.nan)
        continue

    sim.append(np.inner(v1, v2) / denom)

num_bins = 50
# NaN fails both comparisons, so undefined values are in neither group
sim_plus = [x for x in sim if x >= 0]
num_neg = len([x for x in sim if x < 0])

plt.figure()
plt.title('Similarity Between Explanations')
plt.xlabel('Cosine Similarity')
plt.ylabel('Freq.')
n, bins, patches = plt.hist(sim_plus, num_bins, facecolor='tab:blue', alpha=0.5)
plt.show()
print('number of negatively similar vectors: ', num_neg)
print('number of undefined similarities (all-zero explanation): ', num_undefined)