获取数据集并分析

In [127]:
import numpy as np
from motifcluster import clustering as mccl
from motifcluster import motifadjacency as mcmo
from motifcluster import utils as mcut
import torch_geometric

In [128]:
import os
import os.path as osp
import torch.nn.functional as F
from torch_geometric.utils import negative_sampling
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.seed import seed_everything
from torch_geometric.utils import train_test_split_edges

# Fix the random seed so the edge split below is reproducible.
seed_everything(123)

# Load the Cora Planetoid dataset with the NormalizeFeatures transform applied.
dataset = Planetoid('dataset','Cora',transform=T.NormalizeFeatures())
data = dataset[0]
# The node-classification masks and labels are left on `data`; they were
# previously cleared here, but that step is no longer used.

print(data)

# Build the motif adjacency matrix (MAM) from the original adjacency matrix.
adj = torch_geometric.utils.to_scipy_sparse_matrix(data.edge_index)
mam = mcmo.build_motif_adjacency_matrix(adj,motif_name='M4',motif_type='func',mam_method='sparse',mam_weight_type='product')
# Add the MAM to the original adjacency matrix; the entries of the sum are
# used as edge weights.
inte_adj = adj + mam
mam_edge_index,mam_edge_weight = torch_geometric.utils.from_scipy_sparse_matrix(inte_adj)
data.edge_index = mam_edge_index
data.edge_weight = mam_edge_weight

# Split the edge set into train / validation / test parts.
# Negative training samples are not generated here (add_negative_train_samples=False).
split = T.RandomLinkSplit(is_undirected=True,add_negative_train_samples=False,num_val=0.05,num_test=0.1,neg_sampling_ratio=1.0)
train_data,val_data,test_data = split(data)
print(f'train_data.edge_label:{train_data.edge_label}\ntrain_data.edge_label_index:{train_data.edge_label_index}')
print(val_data)
print(test_data)

# NOTE: the older train_test_split_edges(data) based split that used to follow
# here has been superseded by RandomLinkSplit above.

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
train_data.edge_label:tensor([1., 1., 1.,  ..., 1., 1., 1.])
train_data.edge_label_index:tensor([[ 136, 1073, 1701,  ...,  284, 1902,   97],
        [ 831, 2163, 1858,  ..., 2225, 1904, 1353]])
Data(x=[2708, 1433], edge_index=[2, 8976], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_weight=[8976], edge_label=[526], edge_label_index=[2, 526])
Data(x=[2708, 1433], edge_index=[2, 9502], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_weight=[9502], edge_label=[1054], edge_label_index=[2, 1054])


构造边预测神经网络

In [129]:
import torch
from torch_geometric.nn import GCNConv
from sklearn.metrics import roc_auc_score

class Net(torch.nn.Module):
    """Two-layer GCN encoder with a dot-product decoder for link prediction.

    Args:
        in_channels: Size of each input node feature vector.
        out_channels: Size of each output node embedding.
        hidden_channels: Width of the hidden GCN layer. Defaults to 128,
            the value that was previously hard-coded.
    """

    def __init__(self,in_channels,out_channels,hidden_channels=128):
        super().__init__()
        self.conv1 = GCNConv(in_channels,hidden_channels)
        self.conv2 = GCNConv(hidden_channels,out_channels)

    def encode(self,x,edge_index,edge_weight=None):
        """Compute node embeddings.

        Args:
            x: Node feature matrix.
            edge_index: Edges used for message passing.
            edge_weight: Optional per-edge weights forwarded to both GCN
                layers. ``None`` (the default) runs the layers without
                explicit edge weights.

        Returns:
            The output of the second GCN layer.
        """
        hidden = self.conv1(x,edge_index,edge_weight).relu()
        return self.conv2(hidden,edge_index,edge_weight)

    def decode(self,z,edge_label_index):
        """Score candidate edges from the embeddings of their end points.

        The score of an edge is the dot product of its two node embeddings
        (element-wise product summed over the last dimension). The result is
        a raw score (logit); no sigmoid is applied here.
        """
        src = z[edge_label_index[0]]
        dst = z[edge_label_index[1]]
        return (src*dst).sum(dim=-1)

    def decode_all(self,z):
        """Inference helper: predict edges between every pair of nodes.

        Returns:
            A ``[2, num_predicted_edges]`` tensor holding the index pairs
            whose dot-product score is strictly positive.
        """
        prob_adj = z @ z.t()
        # Keep only the pairs with a positive score as predicted edges.
        return (prob_adj>0).nonzero(as_tuple=False).t()

边预测神经网络的训练与评估

In [130]:
# 定义单个epoch训练过程
# 每个epoch的训练过程都进行训练集负样本采样，采样到的负样本数量与正样本数量相同
# 不同epoch中采样的样本不同，实现正负样本类别数量平衡，也增加了负样本多样性

# 生成完整训练集的标签

def get_link_labels(pos_edge_index,neg_edge_index):
    """Build the label vector for positive edges followed by negative edges.

    Args:
        pos_edge_index: Index tensor whose second dimension counts the
            positive edges.
        neg_edge_index: Index tensor whose second dimension counts the
            negative edges.

    Returns:
        A 1-D float tensor with one entry per edge: 1.0 for every positive
        edge, then 0.0 for every negative edge.
    """
    n_pos = pos_edge_index.size(1)
    n_neg = neg_edge_index.size(1)
    positive_part = torch.ones(n_pos,dtype=torch.float)
    negative_part = torch.zeros(n_neg,dtype=torch.float)
    return torch.cat([positive_part,negative_part])

# 当RandomLinkSplit的参数add_negative_train_samples=False时
def train(train_data,val_data,model,optimizer,criterion):
    """Run one optimisation step and evaluate on the validation split.

    Fresh negative edges are sampled on every call, as many as there are
    entries in ``train_data.edge_label_index``, so the negatives differ from
    epoch to epoch.

    Args:
        train_data: Training split; its ``x``, ``edge_index``,
            ``edge_weight``, ``edge_label`` and ``edge_label_index`` are read.
        val_data: Validation split passed on to ``eval_link_predictor``.
        model: Model exposing ``encode`` and ``decode``.
        optimizer: Optimizer that is zeroed and stepped once.
        criterion: Callable taking ``(logits, labels)`` and returning the loss.

    Returns:
        A tuple ``(loss, val_auc)``: the loss of this step and the value
        returned by ``eval_link_predictor`` for ``val_data``.
    """
    model.train()
    optimizer.zero_grad()

    pos_pairs = train_data.edge_label_index
    pos_targets = train_data.edge_label

    # Sample node pairs that are not connected in the training graph;
    # negatives are drawn against train_data.edge_index.
    sampled_neg = negative_sampling(
        edge_index=train_data.edge_index,
        num_nodes=train_data.num_nodes,
        num_neg_samples=pos_pairs.size(1),
        method='sparse',
    )
    # Label the sampled pairs with zeros, matching dtype/device of the
    # existing labels.
    neg_targets = pos_targets.new_zeros(sampled_neg.size(1))

    candidate_pairs = torch.cat((pos_pairs,sampled_neg),dim=-1)
    targets = torch.cat((pos_targets,neg_targets),dim=0)

    # Message passing only uses the training split's edges and edge weights.
    embeddings = model.encode(
        train_data.x.float(),
        train_data.edge_index,
        train_data.edge_weight.float(),
    )

    loss = criterion(model.decode(embeddings,candidate_pairs),targets)
    loss.backward()
    optimizer.step()

    return loss,eval_link_predictor(model,val_data)

@torch.no_grad()
def eval_link_predictor(model,val_data):
    """Return the ROC-AUC of ``model`` on the labelled edges of ``val_data``.

    Node embeddings are computed from ``val_data.edge_index`` and
    ``val_data.edge_weight``; the edges in ``val_data.edge_label_index`` are
    then scored, passed through a sigmoid and compared with
    ``val_data.edge_label``. Gradients are disabled by the decorator.
    """
    model.eval()
    features = val_data.x.float()
    weights = val_data.edge_weight.float()
    node_emb = model.encode(features,val_data.edge_index,weights)
    scores = model.decode(node_emb,val_data.edge_label_index)
    probs = torch.sigmoid(scores.view(-1))

    # roc_auc_score is called with CPU copies of the labels and scores.
    return roc_auc_score(val_data.edge_label.cpu(),probs.cpu())

In [132]:
def main():
    """Train the link predictor on the MAM-augmented Cora graph and report AUC.

    Steps: load Cora, add the 'M13' motif adjacency matrix to the original
    adjacency matrix (the summed entries become edge weights), split the
    edges with RandomLinkSplit, train for 100 epochs, then print the test AUC
    of the final model together with the best validation AUC and the test AUC
    measured at that epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    dataset = Planetoid('dataset','Cora',transform=T.NormalizeFeatures())
    data = dataset[0]

    # Build the motif adjacency matrix (MAM) from the original adjacency matrix.
    # NOTE(review): the MAM is computed on the full graph before the split, so
    # the edge weights may carry information from validation/test edges —
    # confirm this is intended.
    adj = torch_geometric.utils.to_scipy_sparse_matrix(data.edge_index)
    mam = mcmo.build_motif_adjacency_matrix(adj,motif_name='M13',motif_type='func',mam_method='sparse',mam_weight_type='product')
    # Add the MAM to the original adjacency matrix and use the sum as edge weights.
    inte_adj = adj + mam
    mam_edge_index,mam_edge_weight = torch_geometric.utils.from_scipy_sparse_matrix(inte_adj)
    data.edge_index = mam_edge_index
    data.edge_weight = mam_edge_weight

    # Split the edge set; negative training edges are sampled per epoch in train().
    split = T.RandomLinkSplit(is_undirected=True,add_negative_train_samples=False,num_val=0.05,num_test=0.1,neg_sampling_ratio=1.0)
    train_data,val_data,test_data = split(data)
    train_data = train_data.to(device)
    val_data = val_data.to(device)
    test_data = test_data.to(device)
    print(train_data)
    print(val_data)
    print(test_data)

    model = Net(dataset.num_features,64).to(device)
    model = model.float()
    optimizer = torch.optim.Adam(params=model.parameters(),lr=0.01)
    criterion = F.binary_cross_entropy_with_logits

    best_val_auc = 0
    test_auc_at_best_val = 0
    for epoch in range(1,101):
        loss,val_auc = train(train_data,val_data,model,optimizer,criterion)
        if val_auc > best_val_auc:
            best_val_auc = val_auc
            # Record the test AUC of the model that is best on validation, so
            # model selection does not depend on the last epoch.
            test_auc_at_best_val = eval_link_predictor(model,test_data)
        print(f'Epoch:{epoch:03d}, Loss:{loss:.4f}, Val:{val_auc:.4f}')


    # Test AUC of the model after the final epoch.
    test_auc = eval_link_predictor(model,test_data)
    print(f'Test::{test_auc:.4f}')
    print(f'Best Val:{best_val_auc:.4f}, Test at best Val:{test_auc_at_best_val:.4f}')

if __name__ == "__main__":
    main()

Data(x=[2708, 1433], edge_index=[2, 82356], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_weight=[82356], edge_label=[41178], edge_label_index=[2, 41178])
Data(x=[2708, 1433], edge_index=[2, 82356], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_weight=[82356], edge_label=[4844], edge_label_index=[2, 4844])
Data(x=[2708, 1433], edge_index=[2, 87200], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708], edge_weight=[87200], edge_label=[9688], edge_label_index=[2, 9688])
Epoch:001, Loss:0.6931, Val:0.7402
Epoch:002, Loss:0.6887, Val:0.7243
Epoch:003, Loss:0.6813, Val:0.7696
Epoch:004, Loss:0.6715, Val:0.7971
Epoch:005, Loss:0.6576, Val:0.8048
Epoch:006, Loss:0.6376, Val:0.8087
Epoch:007, Loss:0.6141, Val:0.8373
Epoch:008, Loss:0.5865, Val:0.8533
Epoch:009, Loss:0.5629, Val:0.8501
Epoch:010, Loss:0.5464, Val:0.8426
Epoch:011, Loss:0.5450, Val:0.8504
Epoch:012, Loss:0.5512, Val:0.8398
Epoch:013, Loss:0.5557, Val:0.8513
Epoch:014, Loss