In [1]:
# 加载数据集
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf

# 定义批次大小和统一的向量维度
BATCH_SIZE = 8
UNIFORM_LENGTH = 512  # 假设所有词向量都填充或截断到这个长度
FEATURE_DIM = 768     # BERT基本模型的特征维度
batch_size = 8  


# 2. 修改数据加载器以同时读取特征和标签
def data_generator(file_paths, batch_size):    
    for file_path in file_paths:
        print("Loading file:", file_path)  # 调试输出
        batch_data = np.load(file_path, allow_pickle=True).item()
        features = batch_data['features']
        labels = batch_data['labels']
        # 根据批次大小将数据分块
        for i in range(0, len(features), batch_size):
            print("Loaded data shape:", features.shape, labels.shape)  # 调试输出
            yield features[i:i+batch_size], labels[i:i+batch_size]


def load_dataset(file_paths, batch_size):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(file_paths, batch_size),
        output_types=(tf.float32, tf.int32),
        output_shapes=((batch_size, UNIFORM_LENGTH, FEATURE_DIM), (batch_size,))
    )
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)


  from .autonotebook import tqdm as notebook_tqdm





In [2]:
from sklearn.model_selection import train_test_split
# 划分数据集
vector_dir = 'bert_vectors'

files = [os.path.join(vector_dir, file) for file in sorted(os.listdir(vector_dir)) if file.endswith('.npy')]
# 确保去除数据量不足的最后一个文件
sample_data = np.load(files[-1], allow_pickle=True).item()
if sample_data['features'].shape[0] < BATCH_SIZE:
    files = files[:-1]

# 指定训练集、验证集和测试集的比例
train_size = 0.7
val_size = 0.15
test_size = 0.15  # Note: train_size + val_size + test_size should be 1

# 计算划分的索引
# 划分训练集、验证集、测试集文件列表
train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)
train_files, val_files = train_test_split(train_files, test_size=val_size / (train_size + val_size), random_state=42)

# 现在你有了训练集(train_files)、验证集(val_files)和测试集(test_files)的文件列表
print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(val_files)}")
print(f"Test files: {len(test_files)}")

# 创建数据集
train_dataset = load_dataset(train_files, batch_size)
val_dataset = load_dataset(val_files, batch_size)
test_dataset = load_dataset(test_files, batch_size)

print("训练集为：",train_dataset)

Train files: 695
Validation files: 150
Test files: 150
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
训练集为： <_PrefetchDataset element_spec=(TensorSpec(shape=(8, 512, 768), dtype=tf.float32, name=None), TensorSpec(shape=(8,), dtype=tf.int32, name=None))>


In [17]:
from tensorflow.keras.metrics import Precision, Recall, TruePositives, TrueNegatives, FalsePositives, FalseNegatives
from tensorflow.keras.models import load_model
model = tf.keras.models.load_model('trained_cnn_lstm_model.keras')

In [18]:
# 使用测试集进行评估
# 假设你已经有一个适当预处理的测试数据集 test_dataset
results = model.evaluate(test_dataset)
print("Loss, Accuracy:", results)

Loading file: bert_vectors\batch_0920.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0525.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m0s[0m 284ms/step - accuracy: 1.0000 - loss: 0.0146Loading file: bert_vectors\batch_0567.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0657.npy
Loaded data shape: (8, 512, 768) (8,)
      3/Unknown [1m0s[0m 36ms/step - accuracy: 0.9028 - loss: 0.2089 Loading file: bert_vectors\batch_0633.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0429.npy
Loaded data shape: (8, 512, 768) (8,)
      5/Unknown [1m0s[0m 36ms/step - accuracy: 0.8967 - loss: 0.2269Loading file: bert_vectors\batch_0857.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0712.npy
Loaded data shape: (8, 512, 768) (8,)
      7/Unknown [1m0s[0m 36ms/step - accuracy: 0.9015 - loss: 0.2161Loading file: bert_vectors\batch_0174.npy
Loaded data shape: (8, 512, 768) (8,

In [19]:
from tensorflow.keras.metrics import TruePositives, TrueNegatives, FalsePositives, FalseNegatives, Precision, Recall

# 初始化度量
tp = TruePositives()
tn = TrueNegatives()
fp = FalsePositives()
fn = FalseNegatives()
precision = Precision()
recall = Recall()

# 预测并更新状态
for X, y_true in test_dataset:
    y_pred = model.predict(X)
    y_pred_classes = tf.argmax(y_pred, axis=-1)  # 将输出转换为类别标签

    tp.update_state(y_true, y_pred_classes)
    tn.update_state(y_true, y_pred_classes)
    fp.update_state(y_true, y_pred_classes)
    fn.update_state(y_true, y_pred_classes)
    precision.update_state(y_true, y_pred_classes)
    recall.update_state(y_true, y_pred_classes)

# 计算 F1 score 和特异性
f1_score = 2 * (precision.result().numpy() * recall.result().numpy()) / (precision.result().numpy() + recall.result().numpy())
specificity = tn.result().numpy() / (tn.result().numpy() + fp.result().numpy())


# 输出结果
print("True Positives:", tp.result().numpy())
print("True Negatives:", tn.result().numpy())
print("False Positives:", fp.result().numpy())
print("False Negatives:", fn.result().numpy())

print("Recall:", recall.result().numpy())
print("Precision:", precision.result().numpy())
print("F1 Score:", f1_score)
print("Specificity:", specificity)


Loading file: bert_vectors\batch_0920.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0525.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Loading file: bert_vectors\batch_0567.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
Loading file: bert_vectors\batch_0657.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Loading file: bert_vectors\batch_0633.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Loading file: bert_vectors\batch_0429.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Loading file: bert_vectors\batch_0857.npy
Loaded data shape: (8, 512, 768) (8,)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Loading