In [None]:
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "6"
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from torch_geometric.data import Data
from torch_geometric.nn import GATConv
from sklearn.metrics import roc_auc_score
import random

In [None]:
npz_file='phase1_gdata.npz'

print(f"正在从 {npz_file} 加载数据...")
data_npz = np.load(npz_file)

# 转换为PyTorch Tensors
x = torch.tensor(data_npz['x'], dtype=torch.float)

# PyG 要求 edge_index 的形状为 [2, N_edges]
edge_index = torch.tensor(data_npz['edge_index'].T, dtype=torch.long)

y = torch.tensor(data_npz['y'], dtype=torch.long)
train_mask = torch.tensor(data_npz['train_mask'])
test_mask = torch.tensor(data_npz['test_mask'])

#train_mask打乱顺序
random.shuffle(train_mask)

train


val_size = int(len(train_mask) * 0.1)


# 创建PyG的Data对象
# 注意：为了简化，此基线模型暂时忽略了 edge_type 和 edge_timestamp
data = Data(x=x, edge_index=edge_index, y=y_binary)

# --- 划分训练集和验证集 ---
# 我们需要从 train_mask 中分出一个验证集来监控模型性能和防止过拟合
train_indices = torch.where(train_mask)[0]
train_indices = train_indices[torch.randperm(len(train_indices))] # 打乱索引

# 划分 10% 作为验证集
val_size = int(len(train_indices) * 0.1)
val_indices = train_indices[:val_size]
train_indices = train_indices[val_size:]

# 创建新的 mask
data.train_mask = torch.zeros_like(train_mask)
data.val_mask = torch.zeros_like(train_mask)
data.test_mask = test_mask # test_mask 保持不变

data.train_mask[train_indices] = True
data.val_mask[val_indices] = True

print("数据加载和预处理完成。")
print(f"  节点总数: {data.num_nodes}")
print(f"  边总数: {data.num_edges}")
print(f"  节点特征维度: {data.num_node_features}")
print(f"  训练集大小: {data.train_mask.sum().item()}")
print(f"  验证集大小: {data.val_mask.sum().item()}")

正在从 phase1_gdata.npz 加载数据...
数据加载和预处理完成。
  节点总数: 4024623
  边总数: 4927620
  节点特征维度: 17
  训练集大小: 744613
  验证集大小: 82734


tensor([3260479, 2281852, 1803372,  ..., 1607210, 1120048, 2758819])

In [65]:
#展示y数据分布
npz_file='phase1_gdata.npz'

print(f"正在从 {npz_file} 加载数据...")
data_npz = np.load(npz_file)

# 转换为PyTorch Tensors
x = torch.tensor(data_npz['x'], dtype=torch.float)

# PyG 要求 edge_index 的形状为 [2, N_edges]
edge_index = torch.tensor(data_npz['edge_index'].T, dtype=torch.long)

y = torch.tensor(data_npz['y'], dtype=torch.long)

train_mask = torch.tensor(data_npz['train_mask'])
test_mask = torch.tensor(data_npz['test_mask'])
unique, counts = torch.unique(y, return_counts=True)
print("训练集标签分布:")
for u, c in zip(unique.tolist(), counts.tolist()):
    print(f"  类别 {u}: {c} 个样本")


正在从 phase1_gdata.npz 加载数据...
训练集标签分布:
  类别 -100: 354578 个样本
  类别 0: 817579 个样本
  类别 1: 9768 个样本
  类别 2: 1992982 个样本
  类别 3: 849716 个样本


In [28]:
# --- 1. 定义GAT模型 ---
class GATModel(torch.nn.Module):
    """
    一个两层的图注意力网络 (GAT)
    """
    def __init__(self, in_channels, hidden_channels, out_channels, heads=8):
        super().__init__()
        # 第一层GAT：多头注意力
        self.conv1 = GATConv(in_channels, hidden_channels, heads=heads, dropout=0.6)
        # 第二层GAT：单头注意力（或平均），输出最终分类
        self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1, concat=False, dropout=0.6)

    def forward(self, x, edge_index):
        # GAT层通常会从Dropout开始
        x = F.dropout(x, p=0.6, training=self.training)
        # 应用第一层GAT和ELU激活函数
        x = F.elu(self.conv1(x, edge_index))
        x = F.dropout(x, p=0.6, training=self.training)
        # 应用第二层GAT，得到原始logits
        x = self.conv2(x, edge_index)
        return x


# --- 3. 训练和评估函数 ---
def train(model, data, optimizer, criterion):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index)
    
    # 只在训练集上计算损失
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    
    loss.backward()
    optimizer.step()
    return loss.item()

@torch.no_grad()
def evaluate(model, data, mask):
    """在指定的数据集(mask)上评估AUC"""
    model.eval()
    out = model(data.x, data.edge_index)
    
    # 获取 Class 1 的概率
    probs = out[mask].softmax(dim=-1)[:, 1]
    labels = data.y[mask]
    
    # 确保标签中有正负两类，否则AUC无法计算
    if len(torch.unique(labels)) < 2:
        return 0.5 # 无法计算，返回0.5
        
    return roc_auc_score(labels.cpu().numpy(), probs.cpu().numpy())

# --- 4. 主执行流程 ---
def main():
    # 设置设备
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"使用设备: {device}")



    # --- 处理类别不平衡 ---
    # 欺诈检测任务通常类别高度不平衡，Class 1 会很少
    # 我们计算训练集中类别的权重，用于损失函数
    train_labels = data.y[data.train_mask]
    class_counts = torch.bincount(train_labels.long())
    
    # 避免除以零
    class_weights = 1. / class_counts.float()
    class_weights[class_counts == 0] = 0 
    
    class_weights = class_weights / class_weights.sum()
    print(f"类别权重 (0, 1): {class_weights.cpu().numpy()}")

    # 初始化模型、优化器和损失函数
    model = GATModel(
        in_channels=data.num_node_features,
        hidden_channels=16, # 隐藏层维度
        out_channels=2,     # 输出2个类别 (Class 0, Class 1)
        heads=8             # 8个注意力头
    ).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
    criterion = CrossEntropyLoss(weight=class_weights.to(device))

    # 训练循环
    best_val_auc = 0
    best_epoch = 0
    num_epochs = 200 # 训练轮数

    print("\n开始训练...")
    for epoch in range(1, num_epochs + 1):
        train_loss = train(model, data, optimizer, criterion)
        train_auc = evaluate(model, data, data.train_mask)
        val_auc = evaluate(model, data, data.val_mask)

        print(f'Epoch: {epoch:03d}, Loss: {train_loss:.4f}, '
              f'Train AUC: {train_auc:.4f}, Val AUC: {val_auc:.4f}')

        # 保存表现最好的模型
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            best_epoch = epoch
            torch.save(model.state_dict(), 'best_model.pth')
            print(f"  ^-- 新的最佳模型！保存在 'best_model.pth'")

    print("训练完成。")
    print(f"最佳验证集 AUC: {best_val_auc:.4f} (在 epoch {best_epoch})")

    # --- 5. 生成提交文件 ---
    print("\n正在加载最佳模型并生成提交文件...")
    
    # 加载在验证集上表现最好的模型
    model.load_state_dict(torch.load('best_model.pth'))
    model.eval()

    with torch.no_grad():
        out = model(data.x, data.edge_index)
    
    # 获取测试集节点的概率, shape [N_test, 2]
    test_probs = out[data.test_mask].softmax(dim=-1)
    
    # 转换为 numpy 数组
    submission_data = test_probs.cpu().numpy()

    # 检查形状是否符合要求
    expected_shape = (data.test_mask.sum().item(), 2)
    print(f"提交文件形状: {submission_data.shape} (预期: {expected_shape})")

    if submission_data.shape != expected_shape:
        print("警告: 提交文件形状与预期不符！")

    # 保存为 submission.npy
    np.save('submission.npy', submission_data)
    print("提交文件 'submission.npy' 已成功保存！")

if __name__ == "__main__":
    main()

使用设备: cuda


RuntimeError: bincount only supports 1-d non-negative integral inputs.

tensor(0)

In [27]:
for x in data_npz:
    print(x, data_npz[x].shape)

x (4024623, 17)
y (4024623, 1)
edge_index (4927620, 2)
edge_type (4927620, 1)
edge_timestamp (4927620, 1)
train_mask (827347,)
test_mask (354578,)
