In [1]:
import pandas as pd
import torch
import pickle
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,roc_curve
import numpy as np
from keras.models import load_model



In [2]:
import keras

# Attention layer
@keras.saving.register_keras_serializable(package="custom_objects")
class Attention(tf.keras.layers.Layer):
    """Attention pooling that collapses a sequence of values into one vector.

    For ``inputs = [query, values]`` the layer computes
    ``score = tanh(W(query expanded on axis 1) + W(values))``, turns
    ``V(score)`` into weights with a softmax over axis 1, and returns the
    weighted sum of ``values`` over axis 1.

    Args:
        units: Output width of the projection ``W`` used to build the score.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``,
            ``trainable``, ...), forwarded to the base class.
    """

    def __init__(self, units, **kwargs):
        # Forward the base-layer arguments; dropping them would lose the
        # layer's name/dtype/trainable flag when it is rebuilt from a config.
        super().__init__(**kwargs)
        # Kept so that get_config() can report the constructor argument.
        self.units = units
        self.W = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, inputs):
        """Return the attention-weighted sum of ``inputs[1]`` over axis 1.

        Assumes ``inputs[0]`` (query) has one axis fewer than ``inputs[1]``
        (values), e.g. (batch, features) vs (batch, steps, features) -- TODO
        confirm against the model that uses this layer.
        """
        query = inputs[0]
        values = inputs[1]

        # Insert axis 1 so the query broadcasts against every step of values.
        query_with_time_axis = tf.expand_dims(query, 1)

        # NOTE(review): the same Dense `W` projects both query and values.
        # Left unchanged so that previously saved weights still load.
        score = tf.nn.tanh(self.W(query_with_time_axis) + self.W(values))
        attention_weights = tf.nn.softmax(self.V(score), axis=1)

        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector

    def get_config(self):
        """Return the layer config including ``units`` so it can be re-created."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

读取测试数据

In [3]:
# Read the test-set feature table and the matching label table from CSV.
X_test_feature = pd.read_csv(
    filepath_or_buffer='data_features_smote/X_test_features_smote.csv'
)
y_test = pd.read_csv(
    filepath_or_buffer='data_features_smote/y_test_smote.csv'
)

In [4]:
# Load the saved test embeddings onto the CPU and expose them as a NumPy array.
# NOTE(review): "somte" in the file name looks like a typo of "smote"; it is
# left untouched because it has to match the file that exists on disk.
test_embedding_tensor = torch.load(
    'data_embedding_smote/test_embedding_somte.pt',
    map_location=torch.device('cpu'),
)
X_test_embedding = test_embedding_tensor.numpy()
# y_test_embedding = pd.read_csv(r"data_embedding_smote/y_test_smote.csv")

In [5]:
# Insert a length-1 axis at position 1. np.expand_dims does this directly on
# the NumPy array, avoiding a copy into a torch tensor and back.
# NOTE: this cell rebinds its own input, so running it twice adds two axes --
# re-run the loading cell above before re-running this one.
X_test_embedding = np.expand_dims(X_test_embedding, axis=1)

读取模型

In [6]:
# Load the XGBoost model
def load_model_pickle(model_name):
    """Unpickle and return the object stored in the file at ``model_name``.

    Args:
        model_name: Path of the pickle file to read (opened in binary mode).

    Returns:
        Whatever object was pickled into the file.
    """
    # NOTE: pickle.load can execute arbitrary code -- only use trusted files.
    with open(model_name, mode='rb') as model_file:
        return pickle.load(model_file)
# Unpickle the saved XGBoost model used as the first half of the ensemble.
# NOTE(review): the variable shares its name with the `xgboost` package and
# would shadow it if that package were imported in this notebook.
xgboost = load_model_pickle('ML_models_smote/xgboost_model_6.pkl')

In [7]:
# Load the bert_bi_lstm_attention model
from keras.models import load_model
import keras.src.models
# Adjust the file location below to match where the model lives on your machine.
# NOTE(review): the recorded run of this cell failed with "OSError: No file or
# directory found" for this path. The other cells use paths relative to the
# notebook without a leading "classification_models/" -- verify the prefix.
bert_bi_lstm_attention_model = load_model(r'classification_models/DL_models_smote/bert_bi-lstm_attention_smote/bert_bilstm_attention_model_l.keras',custom_objects={'Attention': Attention})
# Show the architecture of the loaded model. summary() prints the table itself
# and returns None, so wrapping it in print() only added a stray "None" line.
bert_bi_lstm_attention_model.summary()

OSError: No file or directory found at classification_models/DL_models_smote/bert_bi-lstm_attention_smote/bert_bilstm_attention_model_l.keras

模型评估

In [26]:
# Evaluate the combined (weighted) model
def evaluate_composite_model(model1,r1, model2,r2, X_test_features,X_test_embedding, y_test, threshold=0.5):
    """Score a weighted blend of two models' outputs against ``y_test``.

    The blended score for each sample is
    ``r1 * model1.predict_proba(X_test_features)[:, 1]
    + r2 * model2.predict(X_test_embedding)``; samples whose blended score is
    greater than ``threshold`` are labelled 1, all others 0.

    Args:
        model1: Object with ``predict_proba``; column 1 of its output is used.
        r1: Weight applied to ``model1``'s score.
        model2: Object with ``predict``; its output is flattened to 1-D and
            must contain exactly one score per sample.
        r2: Weight applied to ``model2``'s score.
        X_test_features: Input passed to ``model1.predict_proba``.
        X_test_embedding: Input passed to ``model2.predict``.
        y_test: Ground-truth labels handed to the scikit-learn metrics.
        threshold: Cut-off on the blended score for predicting label 1
            (default 0.5, the value previously hard-coded).

    Returns:
        ``[accuracy, precision, recall, f1, auc, fpr, tpr]`` where ``fpr`` and
        ``tpr`` are the ROC curve points as plain Python lists.

    Raises:
        ValueError: If the two models do not yield the same number of scores.
    """
    # Work on flat NumPy vectors instead of adding two DataFrames: DataFrame
    # addition aligns on index/column labels and would silently produce NaN or
    # extra columns if model2's output had an unexpected shape.
    scores_model1 = np.asarray(model1.predict_proba(X_test_features))[:, 1]
    scores_model2 = np.ravel(model2.predict(X_test_embedding))
    if scores_model1.shape != scores_model2.shape:
        raise ValueError(
            "model outputs do not line up: model1 gave "
            f"{scores_model1.shape[0]} scores, model2 gave {scores_model2.size}"
        )

    # Predict probabilities
    y_pred_proba = scores_model1 * r1 + scores_model2 * r2

    # Predict labels
    y_pred = np.where(y_pred_proba > threshold, 1, 0)

    # Calculate accuracy, precision, recall, F1-score, and AUC
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)

    # Calculate ROC curve (thresholds are computed but not returned)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

    return [accuracy, precision, recall, f1, auc,fpr.tolist(), tpr.tolist()]

In [37]:
# Sweep the XGBoost weight from 0.1 to 1.0 in steps of 0.1; the deep model
# gets the complementary weight. Results are stored per weight label.
metrics_name = ['accuracy', 'precision', 'recall', 'f1-score','auc','fpr-score','tpr-score']
rate_list=['xg_0.1','xg_0.2','xg_0.3','xg_0.4','xg_0.5','xg_0.6','xg_0.7','xg_0.8','xg_0.9','xg_1.0']
metrics_dict = {}
# Drive the loop with an integer counter so each label is paired with its
# weight by construction. The previous lookup, rate_list[int(r*10-1)],
# truncated an accumulated float from np.arange and could select the wrong
# label (or run one step too many) on a rounding error.
for tenths, rate_label in enumerate(rate_list, start=1):
    r1 = tenths / 10
    r2 = 1 - r1
    metrics = evaluate_composite_model(xgboost,r1,bert_bi_lstm_attention_model,r2, X_test_feature,X_test_embedding, y_test)
    metrics_dict[rate_label] = dict(zip(metrics_name, metrics))



In [39]:
import json

# Persist the complete results dictionary as a JSON file
with open('fixed_model_metrics_dict.json', mode='w') as metrics_file:
    json.dump(metrics_dict, metrics_file)

In [40]:
import pandas as pd

# Tabulate the results: each outer key of metrics_dict (weight label) becomes a
# column and each inner key (metric name) becomes a row.
data_metrics = pd.DataFrame.from_dict(metrics_dict)
# # Read the evaluation metric values of each model
# metrics_name = ['accuracy', 'precision', 'recall', 'f1-score','auc','fpr-score','tpr-score']
data_metrics

Unnamed: 0,xg_0.1,xg_0.2,xg_0.3,xg_0.4,xg_0.5,xg_0.6,xg_0.7,xg_0.8,xg_0.9,xg_1.0
accuracy,0.807829,0.821699,0.836718,0.854609,0.865016,0.864644,0.860167,0.854693,0.849017,0.842377
precision,0.843754,0.853179,0.865935,0.883842,0.908153,0.93206,0.949105,0.959349,0.961283,0.953256
recall,0.755575,0.777132,0.796797,0.816529,0.812171,0.786627,0.76115,0.740776,0.727328,0.720064
f1-score,0.797233,0.813382,0.829928,0.848853,0.857484,0.85319,0.844799,0.836012,0.828098,0.820411
auc,0.901114,0.912951,0.921637,0.928375,0.932248,0.932071,0.928932,0.922368,0.911651,0.897467
fpr-score,"[0.0, 0.0, 0.0, 0.0, 0.0, 3.378834977699689e-0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
tpr-score,"[0.0, 3.378834977699689e-05, 0.018347073928909...","[0.0, 3.378834977699689e-05, 0.004223543722124...","[0.0, 3.378834977699689e-05, 0.006656304906068...","[0.0, 3.378834977699689e-05, 0.004426273820786...","[0.0, 3.378834977699689e-05, 0.000743343695093...","[0.0, 3.378834977699689e-05, 0.007095553453169...","[0.0, 3.378834977699689e-05, 0.000979862143532...","[0.0, 3.378834977699689e-05, 0.012670631166373...","[0.0, 3.378834977699689e-05, 0.006048114610082...","[0.0, 3.378834977699689e-05, 0.000135153399107..."


In [42]:
# Save the evaluation scores of the best combined model
# print(metrics_dict['xg_0.5'])
# Persist that single result dictionary as a JSON file
with open('last_model_metrics_dict.json', mode='w') as best_metrics_file:
    best_metrics = metrics_dict['xg_0.5']
    json.dump(best_metrics, best_metrics_file)