LSTM模型的运行情况 

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf

# Batch size and the uniform dimensions of the stored BERT vectors.
BATCH_SIZE = 8
UNIFORM_LENGTH = 512  # all word-vector sequences are assumed to be padded/truncated to this length
FEATURE_DIM = 768     # feature dimension of the BERT base model
# Lowercase alias used by later cells; derived from BATCH_SIZE so the two values cannot drift apart.
batch_size = BATCH_SIZE


# 2. Data loader that reads features and labels together
def data_generator(file_paths, batch_size):
    """Yield ``(features, labels)`` chunks read from saved ``.npy`` files.

    Each file is loaded with ``np.load(..., allow_pickle=True).item()`` and
    must therefore contain a pickled dict with the keys ``'features'`` and
    ``'labels'``.

    Args:
        file_paths: Iterable of paths to the ``.npy`` files to read, in order.
        batch_size: Maximum number of samples per yielded chunk. The last
            chunk of a file is smaller when the file's sample count is not a
            multiple of ``batch_size``.

    Yields:
        Tuples ``(features[i:i+batch_size], labels[i:i+batch_size])``.

    Raises:
        ValueError: If a file's features and labels differ in length, which
            would otherwise silently pair samples with the wrong labels or
            drop labels.
    """
    for file_path in file_paths:
        print("Loading file:", file_path)  # debug output
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load trusted files.
        batch_data = np.load(file_path, allow_pickle=True).item()
        features = batch_data['features']
        labels = batch_data['labels']
        if len(features) != len(labels):
            raise ValueError(
                f"{file_path}: features ({len(features)}) and labels "
                f"({len(labels)}) differ in length"
            )
        # Split the file's samples into chunks of at most batch_size
        for start in range(0, len(features), batch_size):
            feature_chunk = features[start:start + batch_size]
            label_chunk = labels[start:start + batch_size]
            # Report the shape of the chunk actually yielded, not of the whole file
            print("Loaded data shape:", feature_chunk.shape, label_chunk.shape)  # debug output
            yield feature_chunk, label_chunk


def load_dataset(file_paths, batch_size):
    """Wrap ``data_generator`` in a prefetching ``tf.data.Dataset``.

    Args:
        file_paths: Paths of the ``.npy`` files handed to ``data_generator``.
        batch_size: Number of samples per element; also the static leading
            dimension declared for both features and labels, so every chunk
            produced by the generator must contain exactly this many samples.

    Returns:
        A prefetched dataset whose elements are ``(features, labels)`` with
        features declared as float32 of shape
        ``(batch_size, UNIFORM_LENGTH, FEATURE_DIM)`` and labels as int32 of
        shape ``(batch_size,)``.
    """
    # output_signature replaces the deprecated output_types/output_shapes pair
    # (TensorFlow itself prints "Use output_signature instead" for the old form).
    output_signature = (
        tf.TensorSpec(shape=(batch_size, UNIFORM_LENGTH, FEATURE_DIM), dtype=tf.float32),
        tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
    )
    dataset = tf.data.Dataset.from_generator(
        # A fresh generator is created each time the dataset is iterated
        lambda: data_generator(file_paths, batch_size),
        output_signature=output_signature,
    )
    return dataset.prefetch(tf.data.AUTOTUNE)



  from .autonotebook import tqdm as notebook_tqdm





In [3]:
# Split the saved batch files into training, validation and test sets
vector_dir = 'bert_vectors'

from sklearn.model_selection import train_test_split
files = [os.path.join(vector_dir, name) for name in sorted(os.listdir(vector_dir)) if name.endswith('.npy')]
if not files:
    # Fail with a clear message instead of an IndexError on files[-1] below
    raise FileNotFoundError(f"No .npy batch files found in '{vector_dir}'")

# Drop the last file when it holds fewer samples than one full batch
# (only the last file in sorted order is checked)
sample_data = np.load(files[-1], allow_pickle=True).item()
if sample_data['features'].shape[0] < BATCH_SIZE:
    files = files[:-1]

# Proportions of the training, validation and test sets
train_size = 0.7
val_size = 0.15
test_size = 0.15
# The three proportions must sum to 1 (checked with a float tolerance)
assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "split ratios must sum to 1"

# First carve off the test files, then split the remainder into train/validation.
# The second split uses val_size relative to the remaining (train + val) share.
train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)
train_files, val_files = train_test_split(train_files, test_size=val_size / (train_size + val_size), random_state=42)

# File lists for the training, validation and test sets are now available
print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(val_files)}")
print(f"Test files: {len(test_files)}")

# Create the datasets; BATCH_SIZE is used throughout so the loaders agree with the short-file check above
train_dataset = load_dataset(train_files, BATCH_SIZE)
val_dataset = load_dataset(val_files, BATCH_SIZE)
test_dataset = load_dataset(test_files, BATCH_SIZE)

print("训练集为：",train_dataset)

Train files: 695
Validation files: 150
Test files: 150
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead


训练集为： <_PrefetchDataset element_spec=(TensorSpec(shape=(8, 512, 768), dtype=tf.float32, name=None), TensorSpec(shape=(8,), dtype=tf.int32, name=None))>


In [5]:
# Pull a single batch from the training dataset and report the shape of each tensor
for features, labels in train_dataset.take(1):
    for caption, tensor in (("Features shape:", features), ("Labels shape:", labels)):
        print(caption, tensor.shape)

Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Features shape: (8, 512, 768)
Labels shape: (8,)
Loaded data shape: (8, 512, 768) (8,)


In [4]:
# 单一的LSTM模型
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout

def create_lstm_model(input_shape, num_classes, lstm_units=128, dense_units=128, dropout_rate=0.5):
    """Build (without compiling) a single-LSTM classifier.

    Architecture: Input -> LSTM -> Dense(relu) -> Dropout -> Dense(softmax).

    Args:
        input_shape: Shape of one sample, excluding the batch dimension,
            passed straight to ``Input``.
        num_classes: Number of units in the softmax output layer.
        lstm_units: Number of units in the LSTM layer (default 128).
        dense_units: Number of units in the hidden dense layer (default 128).
        dropout_rate: Rate of the dropout layer after the hidden dense layer
            (default 0.5).

    Returns:
        An uncompiled ``Model`` mapping the input layer to the softmax output.
    """
    # Input layer
    inputs = Input(shape=input_shape, name='input_layer')

    # LSTM layer; return_sequences=False keeps only the final output, not one per time step
    sequence_summary = LSTM(lstm_units, return_sequences=False, name='lstm_layer')(inputs)

    # Fully connected layer followed by dropout
    hidden = Dense(dense_units, activation='relu', name='dense_layer')(sequence_summary)
    hidden = Dropout(dropout_rate, name='dropout_layer')(hidden)

    # Output layer
    outputs = Dense(num_classes, activation='softmax', name='output_layer')(hidden)

    return Model(inputs=inputs, outputs=outputs)

# Model dimensions: UNIFORM_LENGTH is the sequence length, FEATURE_DIM the
# feature size of each time step; a binary classification problem is assumed.
num_classes = 2
input_shape = (UNIFORM_LENGTH, FEATURE_DIM)

# Build the model
model = create_lstm_model(input_shape=input_shape, num_classes=num_classes)

# Compile the model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Print an overview of the model
model.summary()



In [5]:
# Train the model built above for 10 epochs, validating on val_dataset
model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10,
)

Epoch 1/10
Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m1s[0m 940ms/step - accuracy: 0.5000 - loss: 0.9974Loading file: bert_vectors\batch_0291.npy
Loaded data shape: (8, 512, 768) (8,)
      2/Unknown [1m1s[0m 69ms/step - accuracy: 0.5938 - loss: 0.8442 Loading file: bert_vectors\batch_0433.npy
Loaded data shape: (8, 512, 768) (8,)
      3/Unknown [1m1s[0m 82ms/step - accuracy: 0.6597 - loss: 0.7335Loading file: bert_vectors\batch_0523.npy
Loaded data shape: (8, 512, 768) (8,)
      4/Unknown [1m1s[0m 78ms/step - accuracy: 0.7057 - loss: 0.6482Loading file: bert_vectors\batch_0159.npy
Loaded data shape: (8, 512, 768) (8,)
      5/Unknown [1m1s[0m 75ms/step - accuracy: 0.7296 - loss: 0.6278Loading file: bert_vectors\batch_0578.npy
Loaded data shape: (8, 512, 768) (8,)
      6/Unknown [1m1s[0m 74ms/step - accuracy: 0.7434 - loss: 0.6242Loading

In [8]:
# Evaluate the model
# Measure loss and accuracy on the validation set (the val_dataset split created earlier).
val_loss, val_accuracy = model.evaluate(val_dataset)
print(
    f"Validation Loss: {val_loss}",
    f"Validation Accuracy: {val_accuracy}",
    sep="\n",
)

Loading file: bert_vectors\batch_0962.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0173.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m0s[0m 116ms/step - accuracy: 1.0000 - loss: 0.0013Loading file: bert_vectors\batch_0709.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0295.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0899.npy
Loaded data shape: (8, 512, 768) (8,)
      4/Unknown [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0096 Loading file: bert_vectors\batch_0921.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0189.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0989.npy
Loaded data shape: (8, 512, 768) (8,)
      7/Unknown [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 0.0114Loading file: bert_vectors\batch_0480.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0593.npy
Loaded data shape: (8, 512

In [9]:
# Test the model
# Measure loss and accuracy on the test set (test_dataset) to check how well
# the model generalizes to data it has not seen.
test_loss, test_accuracy = model.evaluate(test_dataset)
print(
    f"Test Loss: {test_loss}",
    f"Test Accuracy: {test_accuracy}",
    sep="\n",
)


Loading file: bert_vectors\batch_0920.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0525.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.0097Loading file: bert_vectors\batch_0567.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0657.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0633.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0429.npy
      4/Unknown [1m0s[0m 18ms/step - accuracy: 0.9297 - loss: 0.2465Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0857.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0712.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0174.npy
      7/Unknown [1m0s[0m 19ms/step - accuracy: 0.9300 - loss: 0.2284Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0604.npy
Loaded data shape: (8, 512, 

In [None]:
# Model prediction
# Predict on a few data instances to see how the model behaves in practice.
# The original cell referenced an undefined `new_data` and raised NameError.
# Until real new data is available, the features of one batch from the test
# dataset are used as a stand-in; replace this line with your own array that
# matches the model's input shape.
new_data = next(iter(test_dataset))[0]

# Predict the class of each instance (argmax over the softmax outputs)
predictions = model.predict(new_data)
predicted_classes = np.argmax(predictions, axis=1)
print("Predictions:", predicted_classes)


In [10]:
# Save the trained model to disk
# NOTE(review): recent Keras versions may flag the HDF5 (.h5) format as legacy — confirm before changing the path.
model_path = 'trained_lstm_model.h5'
model.save(model_path)
print("Model saved successfully.")

# Load the model back from the same file
loaded_model = tf.keras.models.load_model(model_path)




Model saved successfully.
