In [1]:
import numpy as np
import pandas as pd
import lore

from prepare_dataset import *
from neighbor_generator import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

In [2]:
# Imports needed by this cell, grouped at the top instead of being scattered
# between statements.
import joblib
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Load the dataset description and the pre-split train / test CSV files.
# ---------------------------------------------------------------------------
path_data = '../../dataset/breast-cancer-wisconsin/'

dataset_name = 'breast-cancer-wisconsin.csv'
# Project helper; this notebook later reads `dataset['possible_outcomes']`
# and passes `dataset` on to lore.explain / build_df2explain.
dataset = prepare_breast_dataset(dataset_name, path_data)

# `diagnosis` is the target column; every other column is a feature.
df_train = pd.read_csv(path_data + 'train_data.csv')
X_train = df_train.drop('diagnosis', axis=1)
y_train = df_train['diagnosis']

df_test = pd.read_csv(path_data + 'test_data.csv')
X_test = df_test.drop('diagnosis', axis=1)
y_test = df_test['diagnosis']

# ---------------------------------------------------------------------------
# Data augmentation: append the rows stored in lore_enhan.csv to the training
# set (features and labels).
# ---------------------------------------------------------------------------
df_enhan = pd.read_csv('../../data_enhan/breast-cancer-wisconsin/lore/lore_enhan.csv')
X_enhan = df_enhan.drop('diagnosis', axis=1)
y_enhan = df_enhan['diagnosis']
# The former bare `y_enhan.to_numpy()` call was dropped: it returned a new
# array that was discarded, and np.concatenate accepts the Series directly.
X_train = pd.concat([X_train, X_enhan], ignore_index=True)
y_train = np.concatenate((y_train, y_enhan))

# ---------------------------------------------------------------------------
# Standardize features: fit on the (augmented) training set only, then apply
# the same scaling to the test set. Both become NumPy arrays from here on.
# ---------------------------------------------------------------------------
transform = StandardScaler()
X_train = transform.fit_transform(X_train)
X_test = transform.transform(X_test)

# ---------------------------------------------------------------------------
# Train and persist the black-box classifier.
# ---------------------------------------------------------------------------
blackbox = RandomForestClassifier(n_estimators=100, random_state=42)
blackbox.fit(X_train, y_train)
# NOTE(review): only the classifier is saved. The fitted scaler (`transform`)
# is not persisted, so anyone loading this .pkl must re-create the same scaling.
joblib.dump(blackbox, '../../saved_model/breast-cancer-wisconsin/enhance_model/lore/lore_enhan_model.pkl')

# ---------------------------------------------------------------------------
# Evaluate on the held-out test set.
# ---------------------------------------------------------------------------
y_pred = blackbox.predict(X_test)
y_pred_proba = blackbox.predict_proba(X_test)

confusion_matrix_model = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_model)

print(classification_report(y_test, y_pred, target_names=['Benign', 'Malignant']))

# ---------------------------------------------------------------------------
# Records to explain: the scaled test set, with the black-box predictions
# mapped to their outcome labels. `y_pred` is reused because X2E is X_test,
# so a second predict() call would return the same array.
# ---------------------------------------------------------------------------
X2E = X_test
y2E = np.asarray([dataset['possible_outcomes'][i] for i in y_pred])

Confusion Matrix:
 [[106   2]
 [  4  59]]
              precision    recall  f1-score   support

      Benign       0.96      0.98      0.97       108
   Malignant       0.97      0.94      0.95        63

    accuracy                           0.96       171
   macro avg       0.97      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [3]:
# Generate one explanation per record in X2E and keep the results in
# `explanations` as (explanation, infos) tuples, so later cells can reuse them
# without calling lore.explain again.
explanations = []

# Iterate over every row of X2E instead of a hard-coded record count, so the
# loop stays correct if the test set size changes.
for idx_record2explain in range(len(X2E)):
    explanation, infos = lore.explain(idx_record2explain, X2E, dataset, blackbox,
                                      ng_function=genetic_neighborhood,
                                      discrete_use_probabilities=True,
                                      continuous_function_estimation=False,
                                      returns_infos=True,
                                      path=path_data, sep=';', log=False)
    explanations.append((explanation, infos))

No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'sd' samples have been generated. Trying again...
No 'sd' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying again...
No 'ss' samples have been generated. Trying ag

In [4]:
# For every k in 1..MAX_K, write a copy of the test set in which, for each
# record, the first k features named by that record's explanation are
# overwritten with (bound row BOUND_ROW value + PERTURB_OFFSET).
# Also report the mean and maximum number of features per explanation.

MAX_K = 30           # largest number of explanation features to perturb
BOUND_ROW = 1        # row of bound.csv used as base value — presumably the upper bound; TODO confirm
PERTURB_OFFSET = 10  # constant added to the bound value when overwriting a feature

# Read the inputs once; each k works on a fresh copy of the unmodified test set.
test_data_original = pd.read_csv('../../dataset/breast-cancer-wisconsin/test_data.csv')
bound = pd.read_csv('../../dataset/breast-cancer-wisconsin/bound.csv')

# NOTE(review): nothing in this cell reads dfX2E. It used to be rebuilt for
# every record and every k; it is now built once in case a later cell uses it.
# Delete this line if nothing downstream reads it.
dfX2E = build_df2explain(blackbox, X2E, dataset).to_dict('records')

# Keys of explanation[0][1] for each stored explanation, in iteration order.
# They are used as column names of test_data / bound (presumably the features
# in the rule premise — TODO confirm against lore.explain).
rule_features = [list(explanation[0][1].keys()) for explanation, infos in explanations]
n_records = len(rule_features)

# These statistics do not depend on k, so they are computed once.
rule_lengths = [len(keys_list) for keys_list in rule_features]
count = sum(rule_lengths)
maxlen = max(rule_lengths, default=0)
mean_length = count / n_records if n_records else 0.0

for k in range(1, MAX_K + 1):
    test_data = test_data_original.copy()

    for idx_record2explain, keys_list in enumerate(rule_features):
        # Slicing with [:k] covers both cases: explanations with more than k
        # features (take the first k) and with at most k features (take all).
        for feature in keys_list[:k]:
            test_data.at[idx_record2explain, feature] = bound.at[BOUND_ROW, feature] + PERTURB_OFFSET

    filename = '../../explain_set/breast-cancer-wisconsin/data_enhan/lore/lore_explain_'+str(k)+'.csv'
    test_data.to_csv(filename, index=False)

print("mean_length:", mean_length)
print("maxlen:", maxlen)

mean_length: 2.7134502923976607
maxlen: 8
