In [3]:
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import networkx as nx
from torch.utils.data import DataLoader
from torch.autograd import Variable
import sklearn
import heapq
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
import argparse
np.set_printoptions(suppress=True)
# tensorboard
from tensorboardX import SummaryWriter

from src.loss import customized_loss, margin_ranking_loss, custom_loss_trusted_err
from src.dataset import Dataset
from src.layers import GraphConvLayer
from src.utils import generate_neg_sample, load_data
from src.model import Model
import os.path


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"current device is {device}")

parser = argparse.ArgumentParser()


parser.add_argument("--epoch", type=int, default=10, help="epoch to run")
parser.add_argument("--seed", type=int, default=8, help="training set ratio")
parser.add_argument('--hidden', type=int, default=128, help="hidden dimension of entity embeddings")
parser.add_argument('--lr', type=float, default=0.01, help="learning rate")
parser.add_argument('--k', type=float, default=10, help="hit@k")
parser.add_argument('--negsize', type=int, default=10, help="number of negative samples")
parser.add_argument('--negiter', type=int, default=10, help="re-calculate epoch of negative samples")
parser.add_argument('--weight_decay', type=float, default=1e-5, help="weight decay coefficient")
parser.add_argument('--graph_s', type=str, default="data_G1", help="source graph path")
parser.add_argument('--graph_d', type=str, default="data_G2", help="destination graph path")
parser.add_argument('--anoise', type=float, default=0.2, help="anchor noise")
parser.add_argument('--board_path', type=str, default='board', help="tensorboard path")

args,unknow =parser.parse_known_args()
# args = parser.parse_args()
# args = parser.parse_known_args()[0]
# args,unknow =parser.parse_known_args()



current device is cuda


In [4]:
############################
# parameters
epoch = args.epoch
embedding_dim = args.hidden
learning_rate = args.lr
weight_decay = args.weight_decay
neg_samples_size = args.negsize
negiter = args.negiter
graph_path_s = args.graph_s
graph_path_d = args.graph_d
train_seeds_ratio = args.seed * 0.1
k = args.k
anoise = args.anoise
############################
tb_logger = SummaryWriter(args.board_path)

############################
# preprocess
graph1 = graph_path_s
graph2 = graph_path_d
A1, A2, anchor = load_data(graph1=graph1, graph2=graph2, anoise=anoise)
train_size = int(train_seeds_ratio * len(anchor[:, 0]))
test_size = len(anchor[:, 0]) - train_size
# train_set, test_set = torch.utils.data.random_split(anchor, lengths=[train_size, test_size], 
                                                        # generator=torch.Generator().manual_seed(43))
train_set, test_set = torch.utils.data.random_split(anchor, lengths=[train_size, test_size])
train_set = np.array(list(train_set))
test_set = np.array(list(test_set))
batchsize = train_size
train_dataset = Dataset(train_set)
train_loader = DataLoader(dataset=train_dataset, batch_size=batchsize, shuffle=False)
model = Model(Variable(torch.from_numpy(A1).float()), Variable(torch.from_numpy(A2).float()), embedding_dim=embedding_dim)
optimizer = torch.optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, weight_decay=weight_decay)
criterion = nn.TripletMarginLoss(margin=3, p=2)
############################

pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'#total params: {pytorch_total_params}')
print(f"training samples: {train_size}, test samples: {test_size}")
print(f"model architecture:\n {model}")


def predict(output_file, sim_measure="cosine",):
    """
    将预测写入文件
    """
    model.eval()
    Embedding1, Embedding2 = model()
    Embedding1 = Embedding1.detach()
    Embedding2 = Embedding2.detach()

    # step 1: generate sim mat
    if sim_measure == "cosine":
        # similarity_matrix = cosine_similarity(Embedding1, Embedding2)
        similarity_matrix = torch.mm(Embedding1, Embedding2.t()).cpu().numpy()
    else:
        Embedding1 = Embedding1.numpy()
        Embedding2 = Embedding2.numpy()
        similarity_matrix = euclidean_distances(Embedding1, Embedding2)
        similarity_matrix = np.exp(-similarity_matrix)
    
    if (os.path.isfile(output_file)==False):
        os.mknod(output_file)
    
    # step 2: information statistics
    # alignment_hit1 = list()
    file = open(output_file, 'w')
    for idx0, line in enumerate(similarity_matrix):
        idx = np.argmax(line)
        idx = int(idx)
        # alignment_hit1.append(idx)
        file.write(f'{idx0} {idx}\n')


def evaluate(data, k, sim_measure="cosine", phase="test"):
    model.eval()
    Embedding1, Embedding2 = model()
    Embedding1 = Embedding1.detach()
    Embedding2 = Embedding2.detach()
    if phase == "over":
        print(Embedding1)
    # step 1: generate sim mat
    if sim_measure == "cosine":
        # similarity_matrix = cosine_similarity(Embedding1, Embedding2)
        similarity_matrix = torch.mm(Embedding1, Embedding2.t()).cpu().numpy()
    else:
        Embedding1 = Embedding1.numpy()
        Embedding2 = Embedding2.numpy()
        similarity_matrix = euclidean_distances(Embedding1, Embedding2)
        similarity_matrix = np.exp(-similarity_matrix)
    # step 2: information statistics
    alignment_hit1 = list()
    alignment_hitk = list()
    for line in similarity_matrix:
        idx = np.argmax(line)
        idx = int(idx)
        alignment_hit1.append(idx)
        idxs = heapq.nlargest(k, range(len(line)), line.take)
        alignment_hitk.append(idxs)
    # step 3: calculate evaluate score: hit@1 and hit@k
    hit_1_score = 0
    hit_k_score = 0
    for idx in range(len(data)):
        gt = data[idx][1]
        if int(gt) == alignment_hit1[idx]:
            hit_1_score += 1
        if int(gt) in alignment_hitk[idx]:
            hit_k_score += 1
    return similarity_matrix, alignment_hit1, alignment_hitk, hit_1_score, hit_k_score


# begin training
best_E1 = None
best_E2 = None
best_hit_1_score = 0
neg1_left, neg1_right, neg2_left, neg2_right = generate_neg_sample(train_set, neg_samples_size=neg_samples_size)

trusted_pair = np.loadtxt(f'data/trusted_pair.txt', delimiter=' ')
err_pair = np.loadtxt(f'data/err_pair.txt', delimiter=' ') #(90,2)
print(trusted_pair.shape, err_pair.shape)

trusted_left, trusted_right = trusted_pair[:,0], trusted_pair[:,1] # (90,) (90,)
err_left, err_right =  err_pair[:,0], err_pair[:, 1] # (90,) (90,)
                                       
for e in range(epoch):
    model.train()
    if e % negiter == 0:
        neg1_left, neg1_right, neg2_left, neg2_right = generate_neg_sample(train_set, neg_samples_size=neg_samples_size)
    for _, data in enumerate(train_loader):
        a1_align, a2_align = data
        E1, E2 = model()
        optimizer.zero_grad()
        # print("---")
        # print(E1.shape, E2.shape)             #torch.Size([1135, 128]) torch.Size([1135, 128])
        # print(a1_align.shape, a2_align.shape) #torch.Size([363]) torch.Size([363])
        # print(neg1_left.shape, neg1_right.shape)       #(3630,) (3630,)
        # print("-*-")
        
        # loss = customized_loss(E1, E2, a1_align, a2_align, neg1_left, neg1_right, neg2_left, neg2_right, neg_samples_size=neg_samples_size, neg_param=0.3)
        loss = custom_loss_trusted_err(err_left, err_right,
                                       trusted_left, trusted_right,
                                       E1, E2, 
                                       a1_align, a2_align, 
                                       neg1_left, neg1_right, 
                                       neg2_left, neg2_right, 
                                       neg_samples_size=neg_samples_size, 
                                       neg_param=0.3) # neg_param=0.3 没有使用 
        # loss = margin_ranking_loss(criterion, E1, E2, a1_align, a2_align, neg1_left, neg1_right, neg2_left, neg2_right)
        loss.backward()  # print([x.grad for x in optimizer.param_groups[0]['params']])
        optimizer.step()
        sim_mat, alignment_hit1, alignment_hitk, hit_1_score, hit_k_score = evaluate(data=test_set, k=k)

        if hit_1_score > best_hit_1_score:
            best_hit_1_score = hit_1_score
            # todo save model
            print(f"current best Hits@1 count at the {e+1}th epoch: {best_hit_1_score}")

    tb_logger.add_scalar('loss_train', loss.item(), epoch)
    print(f"epoch: {e+1}, loss: {round(loss.item(), 3)}\n")

# final evaluation and test
# ground_truth = np.loadtxt('ground_truth.txt', delimiter=' ')
# ground_truth = np.loadtxt('data/anchor/anchor_0.2_test.txt', delimiter=' ')
# similarity_matrix, alignment_hit1, alignment_hitk, hit_1_score, hit_k_score = evaluate(data=ground_truth, k=k, phase="over")
# print(similarity_matrix)
# print(f"final score: hit@1: total {hit_1_score} and ratio {round(hit_1_score/len(ground_truth), 2)}, hit@{k}: total {hit_k_score} and ratio {round(hit_k_score/len(ground_truth), 2)}")

# 写入文件
predict(f'submit_tmp_{args.graph_s}_{args.graph_d}_{anoise}.txt',)

#total params: 323584
training samples: 363, test samples: 91
model architecture:
 Model(
  (dropout): Dropout(p=0.2, inplace=False)
  (gcnblocks): ModuleList(
    (0): GraphConvLayer(
      (act): ReLU()
    )
    (1): GraphConvLayer(
      (act): ReLU()
    )
  )
)
(90, 2) (90, 2)
epoch: 1, loss: 27735.174

current best Hits@1 count at the 2th epoch: 1
epoch: 2, loss: 32104.65

epoch: 3, loss: 21750.719

epoch: 4, loss: 16277.494

epoch: 5, loss: 17253.939

epoch: 6, loss: 14943.545

epoch: 7, loss: 10047.324

epoch: 8, loss: 7851.937

epoch: 9, loss: 6210.887

epoch: 10, loss: 5608.578

