In [1]:
# Cell purpose: load the breast-cancer-wisconsin splits, append the augmentation
# rows to the training split, standardise the features, fit and persist a
# random forest, then print its confusion matrix and classification report.
import os
import warnings
warnings.filterwarnings("ignore")  # NOTE(review): blanket suppression also hides real pandas/sklearn warnings

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Full dataset. In the cells visible here it only supplies `feature_names`;
# `X`, `y` and `le` are kept in case later cells rely on them.
df = pd.read_csv('dataset/breast-cancer-wisconsin/breast-cancer-wisconsin.csv')
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']
feature_names = X.columns

le = LabelEncoder()
y = le.fit_transform(y)

# Pre-split train / test sets, read from their own CSV files.
# NOTE(review): their labels are used as stored in the files and are never
# passed through `le` - confirm that is intended.
df_train = pd.read_csv('dataset/breast-cancer-wisconsin/train_data.csv')
X_train = df_train.drop('diagnosis', axis=1)
y_train = df_train['diagnosis']

df_test = pd.read_csv('dataset/breast-cancer-wisconsin/test_data.csv')
X_test = df_test.drop('diagnosis', axis=1)
y_test = df_test['diagnosis']

# Data augmentation: append the rows of lime_enhan.csv to the training split.
df_enhan = pd.read_csv('data_enhan/breast-cancer-wisconsin/lime/lime_enhan.csv')
X_enhan = df_enhan.drop('diagnosis', axis=1)
y_enhan = df_enhan['diagnosis']
X_train = pd.concat([X_train, X_enhan], ignore_index=True)
# The original called `y_enhan.to_numpy()` and discarded the result; the
# conversion is done explicitly here so the labels end up in one ndarray.
y_train = np.concatenate((y_train.to_numpy(), y_enhan.to_numpy()))

# Fit the scaler on the (augmented) training features only, then reuse the
# fitted statistics for the test features. Both become numpy arrays here.
transform = StandardScaler()
X_train = transform.fit_transform(X_train)
X_test = transform.transform(X_test)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Persist the fitted model, creating the target directory if it is missing.
# NOTE(review): the model was trained on scaled features but `transform` is
# not saved with it; whoever loads the pickle needs the same scaler.
model_path = 'saved_model/breast-cancer-wisconsin/enhance_model/lime/lime_enhan_model.pkl'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model, model_path)

y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

confusion_matrix_model = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_model)

# target_names are applied to the labels in sorted order; this assumes the
# benign label sorts before the malignant one - TODO confirm.
print(classification_report(y_test,y_pred,target_names=['Benign','Malignant']))

Confusion Matrix:
 [[105   3]
 [  3  60]]
              precision    recall  f1-score   support

      Benign       0.97      0.97      0.97       108
   Malignant       0.95      0.95      0.95        63

    accuracy                           0.96       171
   macro avg       0.96      0.96      0.96       171
weighted avg       0.96      0.96      0.96       171



In [2]:
# Cell purpose: build the LIME tabular explainer over the scaled training data.
import lime
from lime import lime_tabular

explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.asarray(X_train),  # training features; must be a numpy array (no copy if it already is one)
    feature_names=list(feature_names),  # feature column names
    class_names=['Benign','Malignant'],  # display names of the predicted classes
    mode='classification',  # classification mode
    # Seed the explainer so repeated runs give the same explanations; the
    # value matches the random_state already used for the model.
    random_state=42,
)

In [3]:
# Cell purpose: compute one LIME explanation per test row and keep them all in
# `explanations`, in test-row order, so later cells can reuse them without
# re-running LIME.
# Iterating over the rows of X_test replaces the hard-coded count of 171, so
# the cell keeps working if the test split changes size.
explanations = [
    explainer.explain_instance(
        data_row=row,
        predict_fn=model.predict_proba,
        num_features=30,  # number of features requested per explanation; the next cell indexes up to 30 entries
    )
    for row in X_test
]

In [4]:
# Cell purpose: for each k in 1..30, overwrite the first k features listed in
# every test row's stored explanation and write the modified test set to
# lime_explain_<k>.csv.
import os

# Both inputs are loop-invariant, so they are read once instead of 30 times.
test_data_clean = pd.read_csv('dataset/breast-cancer-wisconsin/test_data.csv')
bound = pd.read_csv('dataset/breast-cancer-wisconsin/bound.csv')

output_dir = 'explain_set/breast-cancer-wisconsin/data_enhan/lime'
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

# Feature indices per test row, taken from the first label's entry in each
# stored explanation. This does not depend on k, so it is extracted once.
# (LIME is understood to order these by decreasing |weight| - verify.)
ranked_features = [
    [feature_idx for feature_idx, _weight in next(iter(exp.as_map().values()))]
    for exp in explanations
]

for k in range(1, 31):
    # Fresh copy per k, equivalent to re-reading the CSV each time.
    test_data = test_data_clean.copy()
    for idx, ranking in enumerate(ranked_features):
        # NOTE(review): if an explanation has fewer than k entries, the slice
        # simply uses what is there; the original raised IndexError instead.
        for feature_idx in ranking[:k]:
            # The +1 column offset assumes test_data.csv has one non-feature
            # column (presumably 'diagnosis') before the features - TODO confirm.
            # Row 1 of bound.csv is presumably the per-feature upper bound, so
            # adding 10 would push the value out of range - TODO confirm.
            test_data.iat[idx, feature_idx + 1] = bound.iat[1, feature_idx] + 10
    filename = output_dir + '/lime_explain_' + str(k) + '.csv'
    test_data.to_csv(filename, index=False)