In [40]:
# Cell 1: 导入和配置
import torch
import dgl
import os
import pandas as pd
import numpy as np
from dgl.nn import GraphConv
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import time
import logging

# Configuration
# The path is joined from components so it resolves on any OS; on Windows the
# result is the same ".\Data\StreamSpot\processed" string as before.
OUTPUT_DIR = os.path.join(".", "Data", "StreamSpot", "processed")
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.001
NODE_TYPE_DIM = 32  # node-type embedding dimension
HIDDEN_DIM = 64     # hidden size of the GCN and LSTM layers
NUM_LAYERS = 2      # number of GCN layers
LSTM_LAYERS = 1     # number of LSTM layers

# Seed torch and NumPy so random initialisation and shuffling repeat across runs.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Location of the training log
train_log_path = os.path.abspath(os.path.join(OUTPUT_DIR, "train_log.txt"))
def log_to_train(message):
    """Append ``message`` to the training log, prefixed with the current local time.

    The file at ``train_log_path`` is opened in append mode as UTF-8 on every call.
    Logging is best-effort: any exception raised while opening or writing the file
    is caught and reported on stdout instead of being propagated.

    Args:
        message: Text (or any object with a string form) to record on one line.
    """
    try:
        with open(train_log_path, "a", encoding="utf-8") as log_file:
            entry = f"{time.strftime('%Y-%m-%d %H:%M:%S')} - {message}\n"
            log_file.write(entry)
    except Exception as e:
        print(f"写入 train_log.txt 失败: {e}")

# Mark the start of the session in the log, then echo the runtime setup.
log_to_train("训练日志初始化")
for startup_line in (f"设备: {DEVICE}", f"快照目录: {os.path.abspath(OUTPUT_DIR)}"):
    print(startup_line)

设备: cpu
快照目录: C:\Users\cli305\Codes\Jupyter\Provenance Graph Embedding\Prographer on Provenance Graph Embedding\Data\StreamSpot\processed


In [41]:
# Cell 2: generate labels.csv
# Per the StreamSpot dataset description, the 600 graphs come in six scenarios of
# 100 graphs each, and only IDs 300-399 (drive-by-download) are attacks. The
# previous rule (gid >= 300) also marked the benign scenarios 400-599 as attacks.
NUM_GRAPHS = 600
ATTACK_GRAPH_IDS = range(300, 400)

label_file = os.path.join(OUTPUT_DIR, "labels.csv")
if not os.path.exists(label_file):
    labels = pd.DataFrame({
        "graph_id": range(NUM_GRAPHS),
        "label": [1 if gid in ATTACK_GRAPH_IDS else 0 for gid in range(NUM_GRAPHS)],
    })
    labels.to_csv(label_file, index=False)
    log_to_train(f"生成 labels.csv: {label_file}")
    print(f"生成 labels.csv: {label_file}")
else:
    # An existing file is reused unchanged; delete it to regenerate the labels.
    log_to_train(f"labels.csv 已存在: {label_file}")
    print(f"labels.csv 已存在: {label_file}")
print(pd.read_csv(label_file).head())

labels.csv 已存在: .\Data\StreamSpot\processed\labels.csv
   graph_id  label
0         0      0
1         1      0
2         2      0
3         3      0
4         4      0


In [45]:
# Cell 3: 数据加载
class StreamSpotDataset(Dataset):
    """Dataset of serialized graph snapshots paired with per-graph labels.

    Files named ``snapshot_<n>.pt`` are discovered in ``snapshot_dir`` and ordered by
    ``<n>``. Each snapshot is assigned ``graph_id = <n> // graph_id_divisor`` and its
    label is read from the ``graph_id`` / ``label`` columns of ``label_file``.

    NOTE(review): whether ``<n> // 500`` really recovers the originating graph depends
    on how the preprocessing step numbered the snapshots, which is not visible here.
    If the mapping is off, snapshots silently receive the wrong label -- confirm.

    Args:
        snapshot_dir: Directory containing the ``snapshot_<n>.pt`` files.
        label_file: CSV file with ``graph_id`` and ``label`` columns.
        graph_id_divisor: Divisor applied to ``<n>`` to obtain the graph ID.
            Defaults to 500, the previously hard-coded value.

    Raises:
        FileNotFoundError: If ``label_file`` does not exist.
    """

    _PREFIX = "snapshot_"
    _SUFFIX = ".pt"

    def __init__(self, snapshot_dir, label_file, graph_id_divisor=500):
        self.snapshot_dir = snapshot_dir

        # Collect (number, filename) pairs; files whose middle part is not a plain
        # decimal number are skipped with a warning instead of crashing the sort.
        numbered = []
        for name in os.listdir(snapshot_dir):
            if not (name.startswith(self._PREFIX) and name.endswith(self._SUFFIX)):
                continue
            number = self._snapshot_number(name)
            if number is None:
                log_to_train(f"警告: 无法解析快照文件名 {name}, 已跳过")
                print(f"警告: 无法解析快照文件名 {name}, 已跳过")
                continue
            numbered.append((number, name))
        numbered.sort(key=lambda pair: pair[0])
        self.snapshots = [name for _, name in numbered]

        try:
            self.labels = pd.read_csv(label_file)
        except FileNotFoundError as err:
            raise FileNotFoundError(f"标签文件不存在: {label_file}") from err

        self.graph_ids = [number // graph_id_divisor for number, _ in numbered]

        # Build the graph_id -> label table once so __getitem__ is a dict lookup
        # rather than a DataFrame scan; the first row wins for duplicated IDs.
        self._label_by_graph = {}
        for gid, label in zip(self.labels["graph_id"].tolist(), self.labels["label"].values):
            self._label_by_graph.setdefault(gid, label)

        # Warn once per missing graph ID instead of once per snapshot.
        for gid in sorted(set(self.graph_ids).difference(self._label_by_graph)):
            log_to_train(f"警告: graph_id {gid} 在 labels.csv 中不存在")
            print(f"警告: graph_id {gid} 在 labels.csv 中不存在")

    @classmethod
    def _snapshot_number(cls, filename):
        """Return the integer ``<n>`` of ``snapshot_<n>.pt``, or None if it is not a plain number."""
        stem = filename[len(cls._PREFIX):-len(cls._SUFFIX)]
        if stem.isascii() and stem.isdigit():
            return int(stem)
        return None

    def __len__(self):
        """Return the number of discovered snapshot files."""
        return len(self.snapshots)

    def __getitem__(self, idx):
        """Load snapshot ``idx`` and return ``(graph_with_self_loops, label)``.

        Any failure is logged and printed, then re-raised.

        Raises:
            ValueError: If the snapshot's graph ID has no entry in the label file.
        """
        snapshot_path = os.path.join(self.snapshot_dir, self.snapshots[idx])
        try:
            # SECURITY: weights_only=False unpickles arbitrary objects; only load
            # snapshot files from a trusted source.
            graph = torch.load(snapshot_path, weights_only=False)
            # Add self-loops to work around nodes with zero in-degree.
            graph = dgl.add_self_loop(graph)
            graph_id = self.graph_ids[idx]
            if graph_id not in self._label_by_graph:
                raise ValueError(f"graph_id {graph_id} 无对应标签")
            log_to_train(f"加载快照 {snapshot_path}, 节点数: {graph.num_nodes()}, 边数: {graph.num_edges()}")
            return graph, self._label_by_graph[graph_id]
        except Exception as e:
            log_to_train(f"加载快照 {snapshot_path} 失败: {e}")
            print(f"加载快照 {snapshot_path} 失败: {e}")
            raise
    
# Load the dataset
# OUTPUT_DIR comes from the configuration cell; it is intentionally not redefined
# here so that the two cells cannot drift apart.
label_file = os.path.join(OUTPUT_DIR, "labels.csv")
try:
    dataset = StreamSpotDataset(OUTPUT_DIR, label_file)
    # Identity collate: every batch stays a plain list of (graph, label) pairs.
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=lambda x: x)
    log_to_train(f"加载快照: {len(dataset)} 个")
    print(f"加载快照: {len(dataset)} 个")
    sample_graph, _ = dataset[0]  # load the sample once instead of twice
    print(f"样本示例: {sample_graph.num_nodes()} 节点, {sample_graph.num_edges()} 边")
except Exception as e:
    log_to_train(f"数据加载失败: {e}")
    print(f"数据加载失败: {e}")
    # Re-raise: every later cell needs `dataset` and `dataloader`, so continuing
    # would only surface as a confusing NameError further down.
    raise

加载快照: 16625 个
样本示例: 193 节点, 385 边


In [46]:
# Cell 4: 模型定义
class Prographer(nn.Module):
    """Graph classifier: node-type embedding -> GCN stack -> mean pooling -> LSTM -> linear head.

    Args:
        node_type_dim: Size of the node-type embedding vectors.
        hidden_dim: Output size of every GCN layer and hidden size of the LSTM.
        num_layers: Number of stacked ``GraphConv`` layers.
        lstm_layers: Number of stacked LSTM layers.
        num_node_types: Number of rows in the embedding table, so node-type IDs must
            lie in ``[0, num_node_types)``. Defaults to 100, the previously
            hard-coded value.
    """

    def __init__(self, node_type_dim, hidden_dim, num_layers, lstm_layers, num_node_types=100):
        super().__init__()
        self.node_embedding = nn.Embedding(num_node_types, node_type_dim)
        # The first GCN layer consumes the embeddings; later layers consume hidden_dim.
        self.gcn_layers = nn.ModuleList([
            GraphConv(node_type_dim if i == 0 else hidden_dim, hidden_dim)
            for i in range(num_layers)
        ])
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, lstm_layers, batch_first=True)
        self.mlp = nn.Linear(hidden_dim, 2)  # binary classification: normal / anomalous

    def forward(self, graphs):
        """Compute class logits for a list of graphs.

        Args:
            graphs: Sequence of DGL graphs whose ``ndata["type"]`` holds the node-type
                IDs fed to the embedding (assumed to be a 1-D integer tensor --
                TODO confirm against the snapshot files).

        Returns:
            Tensor with one row of 2 logits per input graph.
        """
        batch_graphs = []
        for g in graphs:
            h = self.node_embedding(g.ndata["type"])
            for gcn in self.gcn_layers:
                h = F.relu(gcn(g, h))
            batch_graphs.append(h.mean(0))  # mean pooling over the nodes
        batch_graphs = torch.stack(batch_graphs)
        # NOTE(review): unsqueeze(0) with batch_first=True feeds the LSTM a single
        # sequence whose time steps are the graphs of this batch, so each output
        # depends on the graphs that precede it in the list. If batches are
        # shuffled that order is arbitrary -- confirm this is intended.
        lstm_out, _ = self.lstm(batch_graphs.unsqueeze(0))
        out = self.mlp(lstm_out.squeeze(0))
        return out

# Build the model on the target device together with its loss and optimizer.
model = Prographer(
    node_type_dim=NODE_TYPE_DIM,
    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    lstm_layers=LSTM_LAYERS,
).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

log_to_train("模型初始化完成")
print(f"模型初始化完成: {model}")

模型初始化完成: Prographer(
  (node_embedding): Embedding(100, 32)
  (gcn_layers): ModuleList(
    (0): GraphConv(in=32, out=64, normalization=both, activation=None)
    (1): GraphConv(in=64, out=64, normalization=both, activation=None)
  )
  (lstm): LSTM(64, 64, batch_first=True)
  (mlp): Linear(in_features=64, out_features=2, bias=True)
)


In [None]:
# Cell 5: 训练循环
def train(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return the mean loss of the batches that were optimised.

    Each batch is a list of ``(graph, label)`` pairs. A batch whose loss is NaN or
    infinite is reported and skipped; a batch that raises is reported and skipped as
    well, so one bad batch does not abort the epoch.

    Args:
        model: Module called with a list of graphs; returns one row of logits per graph.
        dataloader: Sized iterable of batches.
        criterion: Loss function applied to ``(logits, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device the graphs and labels are moved to.

    Returns:
        Mean loss over the batches that completed an optimizer step, or NaN when no
        batch completed. Skipped and failed batches are excluded from the mean.
    """
    model.train()
    num_batches = len(dataloader)
    total_loss = 0.0
    completed_batches = 0
    for batch_number, batch in enumerate(dataloader, start=1):
        try:
            graphs, labels = zip(*batch)
            graphs = [g.to(device) for g in graphs]
            labels = torch.tensor(labels, dtype=torch.long).to(device)
            optimizer.zero_grad()
            out = model(graphs)
            loss = criterion(out, labels)
            if not torch.isfinite(loss):
                log_to_train(f"Batch {batch_number}/{num_batches}, 损失异常: {loss.item()}")
                print(f"Batch {batch_number}/{num_batches}, 损失异常: {loss.item()}")
                continue
            loss.backward()
            optimizer.step()
            batch_loss = loss.item()
            total_loss += batch_loss
            completed_batches += 1
            log_to_train(f"Batch {batch_number}/{num_batches}, Loss: {batch_loss:.4f}")
            print(f"Batch {batch_number}/{num_batches}, Loss: {batch_loss:.4f}")
        except Exception as e:
            log_to_train(f"Batch {batch_number}/{num_batches} 失败: {e}")
            print(f"Batch {batch_number}/{num_batches} 失败: {e}")
    # Average over completed batches only, so skipped batches do not deflate the
    # mean and a genuine loss of 0 is not reported as NaN.
    if completed_batches == 0:
        return float('nan')
    return total_loss / completed_batches

# Per-epoch mean losses; the loss-curve plot later in the notebook reads this list.
epoch_losses = []
try:
    for epoch in range(EPOCHS):
        loss = train(model, dataloader, criterion, optimizer, DEVICE)
        epoch_losses.append(loss)
        log_to_train(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}")
        print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {loss:.4f}")
        # Checkpoint after every epoch.
        torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, f"model_epoch_{epoch+1}.pt"))
except Exception as e:
    log_to_train(f"训练失败: {e}")
    print(f"训练失败: {e}")
    # Re-raise so an aborted run is not mistaken for a finished one by later cells.
    raise

Batch 1/520, Loss: 0.7262
Batch 2/520, Loss: 0.7018
Batch 3/520, Loss: 0.6758
Batch 4/520, Loss: 0.6459
Batch 5/520, Loss: 0.6203
Batch 6/520, Loss: 0.5904
Batch 7/520, Loss: 0.5604
Batch 8/520, Loss: 0.5300
Batch 9/520, Loss: 0.4896
Batch 10/520, Loss: 0.4443
Batch 11/520, Loss: 0.4091
Batch 12/520, Loss: 0.3387
Batch 13/520, Loss: 0.3036
Batch 14/520, Loss: 0.2635
Batch 15/520, Loss: 0.2163
Batch 16/520, Loss: 0.1554
Batch 17/520, Loss: 0.1201
Batch 18/520, Loss: 0.1030
Batch 19/520, Loss: 0.0736
Batch 20/520, Loss: 0.0578
Batch 21/520, Loss: 0.0474
Batch 22/520, Loss: 0.0382
Batch 23/520, Loss: 0.0329
Batch 24/520, Loss: 0.0248
Batch 25/520, Loss: 0.0222
Batch 26/520, Loss: 0.0196
Batch 27/520, Loss: 0.0152
Batch 28/520, Loss: 0.0128
Batch 29/520, Loss: 0.0140
Batch 30/520, Loss: 0.0111
Batch 31/520, Loss: 0.0077
Batch 32/520, Loss: 0.0077
Batch 33/520, Loss: 0.0073
Batch 34/520, Loss: 0.0068
Batch 35/520, Loss: 0.0078
Batch 36/520, Loss: 0.0068
Batch 37/520, Loss: 0.0063
Batch 38/5

In [48]:
# Cell 6: 评估
def evaluate(model, dataloader, device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    Each batch is a list of ``(graph, label)`` pairs; the predicted class is the
    argmax of the model's logits. Gradients are disabled during the pass.

    Args:
        model: Module called with a list of graphs; returns one row of logits per graph.
        dataloader: Iterable of batches.
        device: Device the graphs and labels are moved to.

    Returns:
        Fraction of correctly classified samples, or NaN when the dataloader yields
        no samples (instead of raising ZeroDivisionError).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            graphs, labels = zip(*batch)
            graphs = [g.to(device) for g in graphs]
            labels = torch.tensor(labels, dtype=torch.long).to(device)
            out = model(graphs)
            preds = out.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += len(labels)
    if total == 0:
        return float('nan')
    return correct / total

# Score the model and report the result; a failure is logged rather than raised.
# Note: `dataloader` is the same loader the training loop iterates over, so this
# figure is measured on the training data.
try:
    accuracy = evaluate(model, dataloader, DEVICE)
    for report in (log_to_train, print):
        report(f"准确率: {accuracy:.4f}")
except Exception as e:
    for report in (log_to_train, print):
        report(f"评估失败: {e}")

准确率: 1.0000


In [49]:
# Cell 8: 可视化
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc
import numpy as np

# Set the seaborn style
sns.set(style="whitegrid")


def _save_history_curve(history, description, label, ylabel, title, filename, color=None):
    """Plot one per-epoch history as a line chart and save it under OUTPUT_DIR.

    Args:
        history: Sequence of per-epoch values, or None when none was recorded.
        description: Name of the curve used in the log and console messages.
        label: Legend label of the plotted line.
        ylabel: Y-axis label.
        title: Figure title.
        filename: File name of the image written inside OUTPUT_DIR.
        color: Optional line colour; matplotlib's default is used when omitted.

    Returns:
        Path of the saved image, or None when ``history`` is missing or empty.
    """
    if not history:
        # Skip instead of raising when the history was never recorded in this kernel.
        log_to_train(f"跳过{description}: 没有可用的历史数据")
        print(f"跳过{description}: 没有可用的历史数据")
        return None
    line_style = {"label": label, "marker": "o"}
    if color is not None:
        line_style["color"] = color
    plt.figure(figsize=(10, 5))
    plt.plot(range(1, len(history) + 1), history, **line_style)
    plt.xlabel('Epoch')
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plot_path = os.path.join(OUTPUT_DIR, filename)
    plt.savefig(plot_path)
    plt.close()
    log_to_train(f"{description}已保存: {plot_path}")
    print(f"{description}已保存: {plot_path}")
    return plot_path


# Loss curve. The history is looked up by name because it only exists if an
# earlier cell recorded it.
loss_plot_path = _save_history_curve(
    globals().get("epoch_losses"), "损失曲线",
    'Training Loss', 'Loss', 'Training Loss Curve', "loss_curve.png",
)

# Accuracy curve, with the same guarded lookup.
accuracy_plot_path = _save_history_curve(
    globals().get("epoch_accuracies"), "准确率曲线",
    'Accuracy', 'Accuracy', 'Accuracy Curve', "accuracy_curve.png", color='green',
)

# 无监督异常分数分布
def compute_anomaly_scores_with_probs(model, dataloader, device):
    """Collect per-sample logit norms, class-1 probabilities and labels.

    The model is put in eval mode and run without gradients over every batch, where
    a batch is a list of ``(graph, label)`` pairs.

    Args:
        model: Module called with a list of graphs; returns one row of logits per graph.
        dataloader: Iterable of batches.
        device: Device the graphs and labels are moved to.

    Returns:
        Tuple ``(scores, probs, labels)`` of flat lists aligned by sample:
        the L2 norm of each logit row, the softmax probability of class index 1,
        and the label of each sample.
    """
    model.eval()
    norm_scores, anomaly_probs, true_labels = [], [], []
    with torch.no_grad():
        for samples in dataloader:
            sample_graphs, sample_labels = zip(*samples)
            on_device = [graph.to(device) for graph in sample_graphs]
            label_tensor = torch.tensor(sample_labels, dtype=torch.long).to(device)
            logits = model(on_device)
            norm_scores.extend(logits.norm(dim=1).cpu().numpy())
            class_probs = F.softmax(logits, dim=1)
            anomaly_probs.extend(class_probs[:, 1].cpu().numpy())  # probability of the anomalous class
            true_labels.extend(label_tensor.cpu().numpy())
    return norm_scores, anomaly_probs, true_labels

# Score histogram and ROC curve; any failure is logged and printed, not raised.
try:
    scores, probs, labels = compute_anomaly_scores_with_probs(model, dataloader, DEVICE)

    # Histogram of the per-sample scores.
    plt.figure(figsize=(10, 5))
    sns.histplot(scores, bins=30, kde=True, color='blue')
    plt.title('Anomaly Score Distribution')
    plt.ylabel('Frequency')
    plt.xlabel('Anomaly Score')
    anomaly_plot_path = os.path.join(OUTPUT_DIR, "anomaly_score_distribution.png")
    plt.savefig(anomaly_plot_path)
    plt.close()
    for emit in (log_to_train, print):
        emit(f"异常分数分布已保存: {anomaly_plot_path}")

    # ROC curve built from the class-1 probabilities.
    fpr, tpr, _ = roc_curve(labels, probs)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(10, 5))
    plt.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.2f})', color='red')
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    plt.title('ROC Curve')
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc='lower right')
    roc_plot_path = os.path.join(OUTPUT_DIR, "roc_curve.png")
    plt.savefig(roc_plot_path)
    plt.close()
    for emit in (log_to_train, print):
        emit(f"ROC 曲线已保存: {roc_plot_path}")
except Exception as e:
    for emit in (log_to_train, print):
        emit(f"可视化失败: {e}")

ModuleNotFoundError: No module named 'matplotlib'