CNN-LSTM模型的运行情况 
参数调整之前的基线状态

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf

# Batch size and the fixed per-sample shape of the pre-computed BERT features.
BATCH_SIZE = 8
UNIFORM_LENGTH = 512  # sequence length every sample is assumed to be padded/truncated to
FEATURE_DIM = 768     # feature dimension of the BERT base model
# Lower-case alias kept for code that still refers to `batch_size`;
# derived from BATCH_SIZE so the two values can never drift apart.
batch_size = BATCH_SIZE


# Data loader that reads features and labels together from each file.
def data_generator(file_paths, batch_size, drop_remainder=False, verbose=True):
    """Yield ``(features, labels)`` chunks from a list of ``.npy`` files.

    Each file is expected to hold a pickled dict with the keys ``'features'``
    and ``'labels'``; both values are sliced along their first axis in steps
    of ``batch_size``.

    Args:
        file_paths: Iterable of paths to ``.npy`` files.
        batch_size: Maximum number of samples per yielded chunk (must be >= 1).
        drop_remainder: When True, a trailing chunk shorter than
            ``batch_size`` is skipped instead of yielded. Defaults to False,
            which yields every chunk.
        verbose: When True (default), print the file being loaded and the
            shape of every chunk that is yielded.

    Yields:
        Tuples ``(features_chunk, labels_chunk)`` with matching first
        dimension.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or a file's features
            and labels differ in length.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    for file_path in file_paths:
        if verbose:
            print("Loading file:", file_path)  # debug output
        # NOTE(security): allow_pickle=True unpickles the file content, which can
        # execute arbitrary code. Only load files produced by a trusted pipeline.
        batch_data = np.load(file_path, allow_pickle=True).item()
        features = batch_data['features']
        labels = batch_data['labels']
        if len(features) != len(labels):
            raise ValueError(
                f"{file_path}: features ({len(features)}) and labels "
                f"({len(labels)}) have different lengths"
            )
        # Split the file content into chunks of at most batch_size samples.
        for start in range(0, len(features), batch_size):
            feature_chunk = features[start:start + batch_size]
            label_chunk = labels[start:start + batch_size]
            if drop_remainder and len(feature_chunk) < batch_size:
                # Only the last chunk of a file can be short.
                break
            if verbose:
                # Report the shape of what is actually yielded, not of the whole file.
                print("Loaded data shape:", feature_chunk.shape, label_chunk.shape)  # debug output
            yield feature_chunk, label_chunk


def load_dataset(file_paths, batch_size):
    """Wrap ``data_generator`` in a prefetching ``tf.data.Dataset``.

    Args:
        file_paths: Paths of the ``.npy`` files to stream from.
        batch_size: Number of samples per element; also fixes the leading
            dimension of the declared element shapes.

    Returns:
        A dataset whose elements are ``(features, labels)`` pairs declared as
        ``float32`` of shape ``(batch_size, UNIFORM_LENGTH, FEATURE_DIM)`` and
        ``int32`` of shape ``(batch_size,)``, with prefetching enabled.
    """
    # `output_signature` replaces the deprecated `output_types`/`output_shapes`
    # pair (TensorFlow prints "Use output_signature instead" for the old form).
    output_signature = (
        tf.TensorSpec(shape=(batch_size, UNIFORM_LENGTH, FEATURE_DIM), dtype=tf.float32),
        tf.TensorSpec(shape=(batch_size,), dtype=tf.int32),
    )
    dataset = tf.data.Dataset.from_generator(
        # A fresh generator is created on every iteration of the dataset.
        lambda: data_generator(file_paths, batch_size),
        output_signature=output_signature,
    )
    return dataset.prefetch(tf.data.AUTOTUNE)



  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# Split the pre-computed vector files into train / validation / test sets.
from sklearn.model_selection import train_test_split

vector_dir = 'bert_vectors'

# sorted() gives a deterministic file order, so the seeded splits below are repeatable.
files = [os.path.join(vector_dir, file) for file in sorted(os.listdir(vector_dir)) if file.endswith('.npy')]
if not files:
    # Fail with a clear message instead of an IndexError on files[-1] below.
    raise FileNotFoundError(f"No .npy files found in directory {vector_dir!r}")

# Drop the last file if it holds fewer samples than one full batch.
# Only the last file is inspected; all other files are not checked here.
sample_data = np.load(files[-1], allow_pickle=True).item()
if sample_data['features'].shape[0] < BATCH_SIZE:
    files = files[:-1]

# Proportions of the train, validation and test sets.
train_size = 0.7
val_size = 0.15
test_size = 0.15
if abs(train_size + val_size + test_size - 1.0) > 1e-9:
    raise ValueError("train_size + val_size + test_size must sum to 1")

# First carve off the test files, then split the rest into train and validation.
# The second split is expressed relative to the remaining (train + val) share.
train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)
train_files, val_files = train_test_split(train_files, test_size=val_size / (train_size + val_size), random_state=42)

# File lists for the three splits are now available.
print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(val_files)}")
print(f"Test files: {len(test_files)}")

# Build the datasets; use the same constant as the short-file check above.
train_dataset = load_dataset(train_files, BATCH_SIZE)
val_dataset = load_dataset(val_files, BATCH_SIZE)
test_dataset = load_dataset(test_files, BATCH_SIZE)

print("训练集为：",train_dataset)

Train files: 695
Validation files: 150
Test files: 150
Instructions for updating:
Use output_signature instead


Instructions for updating:
Use output_signature instead
训练集为： <_PrefetchDataset element_spec=(TensorSpec(shape=(8, 512, 768), dtype=tf.float32, name=None), TensorSpec(shape=(8,), dtype=tf.int32, name=None))>


In [3]:
# Pull a single element from the training dataset and show the shapes of its
# feature and label tensors.
first_element_only = train_dataset.take(1)
for features, labels in first_element_only:
    print("Features shape:", features.shape)
    print("Labels shape:", labels.shape)

Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Features shape: (8, 512, 768)
Labels shape: (8,)
Loaded data shape: (8, 512, 768) (8,)


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, LSTM, Dense, Dropout, Flatten, Bidirectional

def create_cnn_lstm_model(sequence_length, vector_dimension, num_classes,
                          kernel_sizes=(3, 4, 5), filters=128, lstm_units=128,
                          dense_units=128, dropout_rate=0.5):
    """Build an (uncompiled) multi-kernel CNN + LSTM classifier.

    The input is fed to one Conv1D + MaxPooling1D branch per kernel size; the
    pooled branch outputs are concatenated, passed through an LSTM, a dense
    layer, dropout, and a softmax output layer.

    Args:
        sequence_length: Number of steps in each input sequence.
        vector_dimension: Feature dimension of each step.
        num_classes: Number of units in the softmax output layer.
        kernel_sizes: Kernel size of each convolution branch. Values must be
            unique because they are used in the layer names
            (``conv_{k}x1`` / ``maxpool_{k}``). Defaults to ``(3, 4, 5)``.
        filters: Number of filters per convolution branch.
        lstm_units: Number of units in the LSTM layer.
        dense_units: Number of units in the hidden dense layer.
        dropout_rate: Dropout rate applied after the hidden dense layer.

    Returns:
        A ``Model`` mapping the input sequence to class probabilities.

    Raises:
        ValueError: If ``kernel_sizes`` is empty or contains duplicates.
    """
    kernel_sizes = tuple(kernel_sizes)
    if not kernel_sizes:
        raise ValueError("kernel_sizes must contain at least one kernel size")
    if len(set(kernel_sizes)) != len(kernel_sizes):
        raise ValueError(f"kernel_sizes must be unique, got {kernel_sizes}")

    # Input layer
    input_layer = Input(shape=(sequence_length, vector_dimension), name="input")

    # One convolution + pooling branch per kernel size, all reading the same input.
    pooled_branches = []
    for kernel_size in kernel_sizes:
        conv = Conv1D(filters=filters, kernel_size=kernel_size, activation='relu',
                      padding='same', name=f"conv_{kernel_size}x1")(input_layer)
        pooled = MaxPooling1D(pool_size=2, name=f"maxpool_{kernel_size}")(conv)
        pooled_branches.append(pooled)

    # Join the branch outputs.
    # NOTE(review): axis=1 is the step axis of the pooled outputs, so the
    # branches are laid end-to-end in sequence rather than stacked along the
    # channel axis (axis=-1). Kept as-is to preserve the trained architecture;
    # confirm this is intended.
    if len(pooled_branches) == 1:
        concatenated = pooled_branches[0]
    else:
        concatenated = tf.keras.layers.concatenate(pooled_branches, axis=1)

    # LSTM layer; only the final state is returned.
    lstm_layer = LSTM(lstm_units, return_sequences=False, name="lstm_layer")(concatenated)
    # Fully connected layer
    dense_layer = Dense(dense_units, activation='relu', name="dense_layer")(lstm_layer)
    # Dropout layer to reduce overfitting
    dropout_layer = Dropout(dropout_rate, name="dropout_layer")(dense_layer)
    # Output layer
    output_layer = Dense(num_classes, activation='softmax', name="output_layer")(dropout_layer)
    # Assemble the model
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Model parameters:
#   sequence_length  - number of steps per input sequence
#   vector_dimension - feature dimension, e.g. the BERT word-vector size
#   num_classes      - number of classes, e.g. positive / negative
sequence_length, vector_dimension, num_classes = 512, 768, 2

# Create the model, then compile it.
model = create_cnn_lstm_model(
    sequence_length=sequence_length,
    vector_dimension=vector_dimension,
    num_classes=num_classes,
)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Print an overview of the model.
model.summary()


In [5]:
# Train the model created above for 10 epochs, validating on val_dataset.
# Keep the returned History object so the per-epoch metrics stay available
# (e.g. via history.history) instead of living only in the log output.
history = model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
)

Epoch 1/10
Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m1s[0m 1s/step - accuracy: 0.8750 - loss: 0.5989Loading file: bert_vectors\batch_0291.npy
Loaded data shape: (8, 512, 768) (8,)
      2/Unknown [1m2s[0m 108ms/step - accuracy: 0.8750 - loss: 0.5416Loading file: bert_vectors\batch_0433.npy
Loaded data shape: (8, 512, 768) (8,)
      3/Unknown [1m2s[0m 105ms/step - accuracy: 0.8889 - loss: 0.4920Loading file: bert_vectors\batch_0523.npy
Loaded data shape: (8, 512, 768) (8,)
      4/Unknown [1m2s[0m 104ms/step - accuracy: 0.9010 - loss: 0.4478Loading file: bert_vectors\batch_0159.npy
Loaded data shape: (8, 512, 768) (8,)
      5/Unknown [1m2s[0m 104ms/step - accuracy: 0.9008 - loss: 0.4498Loading file: bert_vectors\batch_0578.npy
Loaded data shape: (8, 512, 768) (8,)
      6/Unknown [1m2s[0m 103ms/step - accuracy: 0.8965 - loss: 0.4641Loadin

  self.gen.throw(value)


Loading file: bert_vectors\batch_0295.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0899.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0921.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0189.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0989.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0480.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0593.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0879.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0942.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0458.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0356.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0297.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0984.np

<keras.src.callbacks.history.History at 0x225833cc530>

In [6]:
# Evaluate the model on the validation split (val_dataset) and report the
# loss and accuracy computed on that data.
val_metrics = model.evaluate(val_dataset)
val_loss, val_accuracy = val_metrics
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

Loading file: bert_vectors\batch_0962.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0173.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m0s[0m 59ms/step - accuracy: 1.0000 - loss: 0.0022Loading file: bert_vectors\batch_0709.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0295.npy
Loaded data shape: (8, 512, 768) (8,)
      3/Unknown [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0078Loading file: bert_vectors\batch_0899.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0921.npy
Loaded data shape: (8, 512, 768) (8,)
      5/Unknown [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0122Loading file: bert_vectors\batch_0189.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0989.npy
Loaded data shape: (8, 512, 768) (8,)
      7/Unknown [1m0s[0m 35ms/step - accuracy: 1.0000 - loss: 0.0143Loading file: bert_vectors\batch_0480.npy
Loaded data shape: (8, 512, 768) (8,)


In [7]:
# Test the model on the test split (test_dataset) to gauge how it performs
# on data it was not trained on.
test_metrics = model.evaluate(test_dataset)
test_loss, test_accuracy = test_metrics
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")


Loading file: bert_vectors\batch_0920.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0525.npy
Loaded data shape: (8, 512, 768) (8,)
      1/Unknown [1m0s[0m 64ms/step - accuracy: 1.0000 - loss: 0.0086Loading file: bert_vectors\batch_0567.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0657.npy
Loaded data shape: (8, 512, 768) (8,)
      3/Unknown [1m0s[0m 34ms/step - accuracy: 0.9028 - loss: 0.2145Loading file: bert_vectors\batch_0633.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0429.npy
Loaded data shape: (8, 512, 768) (8,)
      5/Unknown [1m0s[0m 35ms/step - accuracy: 0.8967 - loss: 0.2336Loading file: bert_vectors\batch_0857.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0712.npy
Loaded data shape: (8, 512, 768) (8,)
      7/Unknown [1m0s[0m 35ms/step - accuracy: 0.9015 - loss: 0.2232Loading file: bert_vectors\batch_0174.npy
Loaded data shape: (8, 512, 768) (8,)


In [12]:
# Note: the model is saved in the Keras format (.keras extension);
# saving as H5 was observed to hit a bug.
saved_model_path = 'trained_cnn_lstm_model.keras'
model.save(saved_model_path)
print("Model saved successfully in Keras format.")


Model saved successfully in Keras format.


In [13]:
# Load the saved model back from its .keras file.
loaded_model = tf.keras.models.load_model(
    'trained_cnn_lstm_model.keras',
)

  trackable.load_own_variables(weights_store.get(inner_path))
