In [1]:
import warnings
# Blanket filter: every warning raised after this point is suppressed,
# including deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# Complete dataset; the features are every column except the 'class' label.
df = pd.read_csv('dataset/early-stage-diabetes-risk-prediction/diabetes.csv')
X = df.drop(columns=['class'])
y = df['class']
feature_names = X.columns

# Training partition, read from its own CSV and split into features / label.
df_train = pd.read_csv('dataset/early-stage-diabetes-risk-prediction/train_data.csv')
X_train, y_train = df_train.drop(columns=['class']), df_train['class']

# Test partition, read from its own CSV and split the same way.
df_test = pd.read_csv('dataset/early-stage-diabetes-risk-prediction/test_data.csv')
X_test, y_test = df_test.drop(columns=['class']), df_test['class']

# Data augmentation: append the rows from anchor_enhan.csv to the training split.
df_enhan = pd.read_csv('data_enhan/early-stage-diabetes-risk-prediction/anchor/reverse/anchor_enhan.csv')
X_enhan = df_enhan.drop('class',axis=1)
y_enhan = df_enhan['class']

# Features stay a DataFrame; ignore_index renumbers the rows 0..n-1 so the
# appended rows do not carry duplicate index labels.
X_train = pd.concat([X_train, X_enhan], ignore_index=True)

# Labels become one plain ndarray. The conversion is done explicitly here:
# Series.to_numpy() returns a new array rather than converting in place, so a
# bare `y_enhan.to_numpy()` statement (as previously written) had no effect.
y_train = np.concatenate((y_train.to_numpy(), y_enhan.to_numpy()))

from sklearn.preprocessing import StandardScaler

# Standardise only the 'Age' column. The scaler's statistics are learned from
# the (augmented) training rows and then reused unchanged on the test rows.
transform = StandardScaler()
transform.fit(X_train[['Age']])
X_train['Age'] = transform.transform(X_train[['Age']])
X_test['Age'] = transform.transform(X_test[['Age']])

import joblib
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees with a fixed seed, fitted on the augmented
# training data.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)

# Persist the fitted estimator to disk.
joblib.dump(
    model,
    'saved_model/early-stage-diabetes-risk-prediction/enhance_model/anchor/reverse/anchor_enhan_model.pkl',
)

from sklearn.metrics import classification_report, confusion_matrix

# Hard predictions and class probabilities for the held-out test rows.
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

# Confusion matrix of true vs. predicted labels.
confusion_matrix_model = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", confusion_matrix_model)

# Per-class precision / recall / F1.
# NOTE(review): target_names are applied in sorted-label order; presumably the
# first label is the negative class — confirm against the CSV encoding.
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Negative', 'Positive'],
)
print(report_text)

Confusion Matrix:
 [[ 54   0]
 [  1 101]]
              precision    recall  f1-score   support

    Negative       0.98      1.00      0.99        54
    Positive       1.00      0.99      1.00       102

    accuracy                           0.99       156
   macro avg       0.99      1.00      0.99       156
weighted avg       0.99      0.99      0.99       156



In [20]:
import lime
from lime import lime_tabular

# LIME explainer built on the (augmented, Age-scaled) training matrix.
# random_state is fixed so the sampled neighbourhoods — and therefore the
# feature rankings and the CSV written from them — are reproducible.
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=feature_names,
    class_names=['Negative', 'Positive'],
    mode='classification',
    random_state=42,
)

# Fresh copy of the test CSV; it still contains the 'class' column and the
# raw (unscaled) values, because the scaling above was applied to X_test only.
test_data = pd.read_csv('dataset/early-stage-diabetes-risk-prediction/test_data.csv')

# LIME reports feature ids as positions within X_test's columns. Translate
# them by column name into positions within test_data, so the write below
# hits the right column wherever 'class' sits in the file.
col_positions = [test_data.columns.get_loc(col) for col in X_test.columns]

# Explain every test row (row count taken from the data, not hard-coded).
for idx in range(len(X_test)):
    exp = explainer.explain_instance(
        # LIME documents data_row as a 1-D numpy array, so convert the Series.
        data_row=X_test.iloc[idx].to_numpy(),
        predict_fn=model.predict_proba,
        num_features=31,  # upper bound on how many features LIME reports
    )
    # as_map() is {label: [(feature_id, weight), ...]}; take the list for the
    # first (here: only) explained label.
    ranked = next(iter(exp.as_map().values()))

    # Overwrite the first 10 listed features of this row. Slicing (rather than
    # indexing 0..9) avoids an IndexError if fewer than 10 are returned.
    # NOTE(review): 94 for feature 0 and 0 for every other feature are magic
    # values carried over unchanged; feature 0 is presumably 'Age' — confirm.
    for feat_idx, _weight in ranked[:10]:
        test_data.iat[idx, col_positions[feat_idx]] = 94 if feat_idx == 0 else 0

In [21]:
# Write the modified test rows to disk, omitting the DataFrame index.
explain_csv_path = 'explain_set/early-stage-diabetes-risk-prediction/data_enhan/lime/reverse/lime_explain_10.csv'
test_data.to_csv(explain_csv_path, index=False)