SVM支持向量机模型
包括模型训练及测评

In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, TFBertModel
from sklearn.preprocessing import LabelEncoder
import os
import tensorflow as tf

# 定义批次大小和统一的向量维度
BATCH_SIZE = 8
UNIFORM_LENGTH = 512  # 假设所有词向量都填充或截断到这个长度
FEATURE_DIM = 768     # BERT基本模型的特征维度
batch_size = 8  


# 2. 修改数据加载器以同时读取特征和标签
def data_generator(file_paths, batch_size):    
    for file_path in file_paths:
        print("Loading file:", file_path)  # 调试输出
        batch_data = np.load(file_path, allow_pickle=True).item()
        features = batch_data['features']
        labels = batch_data['labels']
        # 根据批次大小将数据分块
        for i in range(0, len(features), batch_size):
            print("Loaded data shape:", features.shape, labels.shape)  # 调试输出
            yield features[i:i+batch_size], labels[i:i+batch_size]


def load_dataset(file_paths, batch_size):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_generator(file_paths, batch_size),
        output_types=(tf.float32, tf.int32),
        output_shapes=((batch_size, UNIFORM_LENGTH, FEATURE_DIM), (batch_size,))
    )
    return dataset.prefetch(tf.data.experimental.AUTOTUNE)



  from .autonotebook import tqdm as notebook_tqdm





In [2]:
# 划分数据集
vector_dir = 'bert_vectors'

from sklearn.model_selection import train_test_split
files = [os.path.join(vector_dir, file) for file in sorted(os.listdir(vector_dir)) if file.endswith('.npy')]
# 确保去除数据量不足的最后一个文件
sample_data = np.load(files[-1], allow_pickle=True).item()
if sample_data['features'].shape[0] < BATCH_SIZE:
    files = files[:-1]

# 指定训练集、验证集和测试集的比例
train_size = 0.7
val_size = 0.15
test_size = 0.15  # Note: train_size + val_size + test_size should be 1

# 计算划分的索引
# 划分训练集、验证集、测试集文件列表
train_files, test_files = train_test_split(files, test_size=test_size, random_state=42)
train_files, val_files = train_test_split(train_files, test_size=val_size / (train_size + val_size), random_state=42)

# 现在你有了训练集(train_files)、验证集(val_files)和测试集(test_files)的文件列表
print(f"Train files: {len(train_files)}")
print(f"Validation files: {len(val_files)}")
print(f"Test files: {len(test_files)}")

# 创建数据集
train_dataset = load_dataset(train_files, batch_size)
val_dataset = load_dataset(val_files, batch_size)
test_dataset = load_dataset(test_files, batch_size)

print("训练集为：",train_dataset)

Train files: 695
Validation files: 150
Test files: 150
Instructions for updating:
Use output_signature instead
Instructions for updating:
Use output_signature instead
训练集为： <_PrefetchDataset element_spec=(TensorSpec(shape=(8, 512, 768), dtype=tf.float32, name=None), TensorSpec(shape=(8,), dtype=tf.int32, name=None))>


In [3]:
for features, labels in train_dataset.take(1):
    print("Features shape:", features.shape)
    print("Labels shape:", labels.shape)

Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Features shape: (8, 512, 768)
Labels shape: (8,)
Loaded data shape: (8, 512, 768) (8,)


In [9]:
from sklearn.kernel_approximation import RBFSampler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# 修改提取特征的函数，使用 float32 而不是 float64
def extract_features_labels(dataset, sample_count):
    features_list = []
    labels_list = []
    count = 0
    for features, labels in dataset:
        features_numpy = features.numpy().astype(np.float32)  # 转换为 float32
        labels_numpy = labels.numpy()
        features_list.append(features_numpy)
        labels_list.append(labels_numpy)
        count += len(labels)
        if count >= sample_count:
            break
    features_array = np.concatenate(features_list, axis=0)
    labels_array = np.concatenate(labels_list, axis=0)
    return features_array, labels_array

# 降低样本数量以避免内存问题
train_features, train_labels = extract_features_labels(train_dataset, 500)  # 可根据内存情况调整

# 使用 RBFSampler 进行核近似
rbf_feature = RBFSampler(gamma=1.0, random_state=1, n_components=500)

# 创建 SVM 模型
model = Pipeline([
    ("scaler", StandardScaler()),
    ("rbf_sampler", rbf_feature),
    ("svm", SVC())
])

model.fit(train_features.reshape(-1, UNIFORM_LENGTH * FEATURE_DIM), train_labels)


Loading file: bert_vectors\batch_0640.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0479.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0291.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0433.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0523.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0159.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0578.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0043.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0536.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0407.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0017.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0329.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0796.np

In [10]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

# 提取测试集数据
test_features, test_labels = extract_features_labels(test_dataset, 500)  # 根据内存情况调整样本数量
test_features = test_features.reshape(-1, UNIFORM_LENGTH * FEATURE_DIM)  # 重塑形状以适配模型

# 使用模型进行预测
test_predictions = model.predict(test_features)

# 计算评估指标
accuracy = accuracy_score(test_labels, test_predictions)
precision = precision_score(test_labels, test_predictions, average='macro')  # 可调整为 'micro', 'macro', 'weighted'
recall = recall_score(test_labels, test_predictions, average='macro')
f1 = f1_score(test_labels, test_predictions, average='macro')

# 计算特异性
tn, fp, fn, tp = confusion_matrix(test_labels, test_predictions).ravel()
specificity = tn / (tn + fp)

# 打印评估结果
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Specificity:", specificity)


Loading file: bert_vectors\batch_0920.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0525.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0567.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0657.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0633.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0429.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0857.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0712.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0174.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0604.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0867.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0449.npy
Loaded data shape: (8, 512, 768) (8,)
Loading file: bert_vectors\batch_0846.np

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [11]:
print("Accuracy:", accuracy)

Accuracy: 0.9087301587301587
