In [1]:
import math
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import numpy as np
import time
import sys
import argparse
from tqdm import tqdm, trange
import pycparser
from createclone_bcb import createast,creategmndata,createseparategraph
import models
from torch_geometric.data import Data, DataLoader
from torch.utils.tensorboard import SummaryWriter  
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
import numpy as np
from scipy.linalg import fractional_matrix_power

  from .autonotebook import tqdm as notebook_tqdm


# 配置参数设置

In [2]:
parser = argparse.ArgumentParser()
parser.add_argument("--cuda", default=True)
parser.add_argument("--dataset", default='gcj')
parser.add_argument("--graphmode", default='astandnext')
parser.add_argument("--nextsib", default=True)
parser.add_argument("--ifedge", default=True)
parser.add_argument("--whileedge", default=True)
parser.add_argument("--foredge", default=True)
parser.add_argument("--blockedge", default=True)
parser.add_argument("--nexttoken", default=True)
parser.add_argument("--nextuse", default=True)
parser.add_argument("--data_setting", default='11')
parser.add_argument("--batch_size", default=32)
parser.add_argument("--num_layers", default=4)
parser.add_argument("--num_epochs", default=10)
parser.add_argument("--lr", default=0.001)
parser.add_argument("--threshold", default=0.5)
# args = parser.parse_args()
args = parser.parse_known_args()[0]
device=torch.device('cuda:0')

# 数据加载

In [13]:
astdict,vocablen,vocabdict=createast()
treedict=createseparategraph(astdict, vocablen, vocabdict,device,mode=args.graphmode,nextsib=args.nextsib,ifedge=args.ifedge,whileedge=args.whileedge,foredge=args.foredge,blockedge=args.blockedge,nexttoken=args.nexttoken,nextuse=args.nextuse)
traindata,validdata,testdata=creategmndata(args.data_setting,treedict,vocablen,vocabdict,device)

24876 4024 3851 29899 130 109
allnodes  2205239
77535
nextsib  True
ifedge  True
whileedge  True
foredge  /home/dlf/.local/share/jupyter/runtime/kernel-v2-361906wJB3WwDdIhqB.json
blockedge  True
nexttoken True
nextuse  True
9133
train data
valid data
test data


# 参数设置

In [4]:
num_layers=int(args.num_layers)
model=models.GMNnet(vocablen,embedding_dim=100,num_layers=num_layers,device=device).to(device)
# 这儿进行了修改，将原来的Adam改为了AdamW
optimizer = optim.AdamW(model.parameters(), lr=args.lr)
criterion=nn.CosineEmbeddingLoss()
criterion2=nn.MSELoss()
criterion3 = torch.nn.BCEWithLogitsLoss()



In [17]:
# 将当前x矩阵转换为邻接矩阵并进行归一化
def to_adjacen_matrix(nodes,edge_index):
    # 构造邻接矩阵的样式并求出当前邻接矩阵大小
    data = [item[0] for item in nodes]
    min_id = min(data)
    max_id = max(data)
    len_nodes = max_id - min_id + 1
    # 构造邻接矩阵
    A = [[0 for x in range(len_nodes)] for y in range(len_nodes)]
    print(len(A),len(A[0]))
    # 取出边矩阵
    sources,targets = edge_index
    # 对邻接矩阵赋值    
    for index,source in enumerate(sources):
        row = source-min_id
        col = targets[index] - min_id
        A[row][col]+=1

    # 归一化所需要的D矩阵
    matrix_d = [[0 for x in range(len_nodes)] for y in range(len_nodes)]

    # 添加指向自己的边（单位阵）
    for index in range(len(A)):
        A[index][index] += 1
        """
        执行归一化需要的数据
            找到D矩阵，在邻接矩阵两边分别✖️-根号D，
            D = 对邻接矩阵每一行求和并写在对角线上
        """
        matrix_d[index][index] = sum(A[index])

    
    # 下面对这个矩阵进行归一化
    D = np.array(matrix_d)
    D = fractional_matrix_power(D, -0.5)
    A = np.array(A)
    # 
    A = np.dot(D,A)
    A = np.dot(A,D)
    del D

    return A

# x = [[1],[2],[3],[4],[5]]
# edge_index = [[1,1,2,2,3,3,4,4,5,5],[3,2,1,4,1,5,2,5,4,3]]
# res = to_adjacen_matrix(x, edge_index)
# for item in res:
#     print(item)
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


# 参数加载

In [16]:
# 存储一对数据的类
class PairData(Data):
    def __init__(self, edge_index_s=None, x_s=None, edge_index_t=None, x_t=None,edge_attr_s=None,edge_attr_t=None,label=None):
        super().__init__()
        self.edge_index_s = edge_index_s
        self.x_s = x_s
        self.edge_index_t = edge_index_t
        self.x_t = x_t
        self.edge_attr_s = edge_attr_s
        self.edge_attr_t = edge_attr_t
        self.label = label

    def __inc__(self, key, value, *args, **kwargs):
        if key == 'edge_index_s':
            return self.x_s.size(0)
        if key == 'edge_index_t':
            return self.x_t.size(0)
        else:
            return super().__inc__(key, value, *args, **kwargs)

# 所有数据的data列表
data_list = []
for item in traindata:
    total_data,label = item
    x1, x2, edge_index1, edge_index2, edge_attr1, edge_attr2=total_data
    x1 = to_adjacen_matrix(x1, edge_index1)
    x2 = to_adjacen_matrix(x2, edge_index2)
    # 数据处理
    label =  [0] if label == -1 else [1]

    # 创建pairdata
    data = PairData(
        edge_index_s=edge_index1,x_s=x1,edge_attr_s=edge_attr1,
        edge_index_t=edge_index2,x_t=x2,edge_attr_t=edge_attr2,
        label=label
    )
    data_list.append(data)



## Train

In [11]:
loss_list = []
writer = SummaryWriter('log/')
epochs = trange(args.num_epochs, leave=True, desc = "Epoch")
# dataloader
loader = DataLoader(data_list, batch_size=args.batch_size,follow_batch=['x_s', 'x_t'])

for epoch in epochs:# without batching
    print(epoch)
    # batches=create_batches(traindata)
    totalloss=0.0
    main_index=0.0
    
    for index, batch in tqdm(enumerate(loader), total=args.batch_size, desc = "Batches"):
        print(batch.x_s)
        # batch.x_s = to_adjacen_matrix(batch.x_s, batch.edge_index_s)
        # batch.x_t = to_adjacen_matrix(batch.x_t, batch.edge_index_t)

        batch.edge_index_s = torch.tensor(batch.edge_index_s, dtype=torch.long, device=device)
        batch.x_s = torch.tensor(batch.x_s, dtype=torch.long, device=device)
        batch.edge_index_t = torch.tensor(batch.edge_index_t, dtype=torch.long, device=device)
        batch.x_t = torch.tensor(batch.x_t, dtype=torch.long, device=device)
        batch.edge_attr_s = torch.tensor(batch.edge_attr_s, dtype=torch.long, device=device)
        batch.edge_attr_t = torch.tensor(batch.edge_attr_t, dtype=torch.long, device=device)
        batch.label = torch.tensor(batch.label, dtype=torch.long, device=device)

        label = batch.label 
        optimizer.zero_grad()
        batchloss= 0
        logits=model(batch)
        batchloss = criterion3(logits, label)  # -log(sigmoid(1.5))


        # for data,label in batch:
        #     logits=model(data)
        #     batchloss += criterion3(logits, label)  # -log(sigmoid(1.5))

        batchloss.backward(retain_graph=True)
        optimizer.step()
        loss = batchloss.item()
        loss_list.append(loss)
        # writer.add_scalar('loss',loss, epoch*len(batches)+index)
        epochs.set_description("Epoch (Loss=%g)" % round(loss,5))


#     #test(validdata)

#     devresults=test(validdata[:40000])
#     devfile=open('gmnbcbresult/'+args.graphmode+'_dev_epoch_'+str(epoch+1),mode='w')
#     for res in devresults:
#         devfile.write(str(res)+'\n')
#     devfile.close()
#     testresults=test(testdata[:40000])
#     resfile=open('gmnbcbresult/'+args.graphmode+'_epoch_'+str(epoch+1),mode='w')
#     for res in testresults:
#         resfile.write(str(res)+'\n')
#     resfile.close()

#     #torch.save(model,'gmnmodels/gmnbcb'+str(epoch+1))
#     #for start in range(0, len(traindata), args.batch_size):
#         #batch = traindata[start:start+args.batch_size]
#         #epochs.set_description("Epoch (Loss=%g)" % round(loss,5))

# import joblib
# joblib.dump(loss_list,"loss_data.data")

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

0


Batches:   0%|          | 0/32 [00:00<?, ?it/s]
Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

[[[72831], [15388], [42038], [54640], [60997], [25372], [16545], [35431], [60997], [25372], [16545], [55423], [70644], [17620], [25372], [58336], [64996], [43893], [37057], [25372], [58336], [37057], [25372], [57636], [74833], [35431], [17620], [25372], [5718], [64996], [66631], [37057], [25372], [5718], [37057], [25372], [51863], [74833], [55423], [38677], [46387], [66631], [46387], [2844], [34627], [48737], [20966], [40180], [24003], [44420], [41638], [64996], [39017], [69080], [50965], [25260], [64449], [74833], [39017], [42181], [74833], [61698], [74833], [23429], [39017], [62672], [38677], [46387], [43893], [46387], [66631], [75099], [39370], [38677], [46387], [66631], [15641], [38677], [46387], [43893], [15641]], [[72831], [15388], [42038], [38524], [37921], [60997], [25372], [31619], [48788], [70644], [17620], [25372], [69818], [64996], [20266], [46387], [45117], [37057], [25372], [51863], [69080], [30145], [35654], [17620], [25372], [55387], [64996], [56082], [46387], [6065], [




ValueError: expected sequence of length 306 at dim 2 (got 1854)