In [2]:
import argparse
import numpy as np
import networkx as nx
import random
import math
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
L_max = 100
N_walk = 10

In [4]:
data_name = 'dblp'


if data_name == 'dblp':
    data = 'Datasets/DBLP/dblp.edgelist'
elif data_name == 'movie':
    data = 'Datasets/Movies/Movies.txt'
elif data_name == 'Foursquare':
    data =  'Datasets/Foursquare/Foursquare.txt'


# 读取数据集
G = nx.read_edgelist(data)

# JUST

In [19]:
# 生成JUST路径
def get_justwalk(G,node,L_max,m=2):
    walk = [node]
    Q_hist = [node[0]]
    a = 0.5
    Pr_stay = 0
    same_length = 1
    while len(walk) < L_max:
        # 筛选出可以前往的邻居节点
        n_neighbors = list(G.neighbors(node))
        
        # 不存在邻居则直接结束
        if len(n_neighbors) == 0:
            break
        V_stay = [x for x in n_neighbors if x[0] == node[0]]
        # 计算Pr跳转概率
        if len(V_stay) == 0:
            Pr_stay = 0
        elif len(n_neighbors) == len(V_stay):
            Pr_stay = 1
        else:
            Pr_stay = math.pow(a, same_length)

        # (0,1)均匀抽样，决定JUMP or STAY
        r = random.uniform(0, 1)
        if r<=Pr_stay:
            # Stay
            random_node = random.choice(V_stay)
        else:
            # JUMP
            # 首先选取jump类型
            Q_jump = list(set([x[0] for x in n_neighbors if (x[0] not in Q_hist)]))
            if len(Q_jump)>0:
                # 选取JUMP节点
                jump_type = random.choice(Q_jump)
                V_jump = [x for x in n_neighbors if x[0] == jump_type]
                random_node = random.choice(V_jump)
            else:
                Q_jump = list(set([x[0] for x in n_neighbors if (x[0] != node[0])]))
                jump_type = random.choice(Q_jump)
                V_jump = [x for x in n_neighbors if (x[0] == jump_type)]
                random_node = random.choice(V_jump)
        if random_node[0] not in Q_hist:
            Q_hist.append(random_node[0])
            if len(Q_hist)>m:
                Q_hist.pop(0)
        elif random_node[0] == Q_hist[-1]:
            same_length+=1

        # 添加路径节点
        walk.append(random_node)

        if node[0]!=random_node[0]:
            same_length = 1

        node = random_node
    return walk

In [20]:
walks = []
nodes = list(G.nodes())

for node in tqdm(nodes):
    for i in range(N_walk):
        just_walks = get_justwalk(G, node, L_max)
        walks.append(just_walks)

100%|██████████| 2405/2405 [00:26<00:00, 92.45it/s] 


In [21]:
len(walks)

24050

In [22]:
print('Starting training .. ')
model = Word2Vec(walks, size=128, window=10, min_count=0, workers=4)
print('Finished training .. ')
model.wv.save_word2vec_format('Vectorfile/JUST/'+data_name+'.embeddings')

Starting training .. 
Finished training .. 


# JUST without menory

In [23]:
# 将memory设置为1
def get_justwalk_without_menory(G,node,L_max,m=1):
    walk = [node]
    Q_hist = [node[0]]
    a = 0.5
    Pr_stay = 0
    while len(walk) < L_max:
        # 筛选出可以前往的邻居节点
        n_neighbors = list(G.neighbors(node))
        
        # 不存在邻居则直接结束
        if len(n_neighbors) == 0:
            break
        V_stay = [x for x in n_neighbors if x[0] == node[0]]
        # 计算Pr跳转概率
        if len(V_stay) == 0:
            Pr_stay = 0
        elif len(n_neighbors) == len(V_stay):
            Pr_stay = 1
        else:
            Pr_stay = math.pow(a, 1)

        # (0,1)均匀抽样，决定JUMP or STAY
        r = random.uniform(0, 1)
        if r<=Pr_stay:
            # Stay
            random_node = random.choice(V_stay)
        else:
            # JUMP
            # 首先选取jump类型
            Q_jump = list(set([x[0] for x in n_neighbors if (x[0] not in Q_hist)]))
            if len(Q_jump)>0:
                # 选取JUMP节点
                jump_type = random.choice(Q_jump)
                V_jump = [x for x in n_neighbors if x[0] == jump_type]
                random_node = random.choice(V_jump)
            else:
                Q_jump = list(set([x[0] for x in n_neighbors if (x[0] != node[0])]))
                jump_type = random.choice(Q_jump)
                V_jump = [x for x in n_neighbors if (x[0] == jump_type)]
                random_node = random.choice(V_jump)
        if random_node[0] not in Q_hist:
            Q_hist.append(random_node[0])
            if len(Q_hist)>m:
                Q_hist.pop(0)

        # 添加路径节点
        walk.append(random_node)
        node = random_node
    return walk

In [24]:
walks = []
nodes = list(G.nodes())

for node in tqdm(nodes):
    for i in range(N_walk):
        just_walks = get_justwalk_without_menory(G, node, L_max,m=1)
        walks.append(just_walks)

100%|██████████| 2405/2405 [00:22<00:00, 108.28it/s]


In [25]:
print('Starting training .. ')
model = Word2Vec(walks, size=128, window=10, min_count=0, workers=4)
print('Finished training .. ')
model.wv.save_word2vec_format('Vectorfile/JUST_without_memory/'+data_name+'.embeddings')

Starting training .. 
Finished training .. 


#  DeepWalk

In [26]:
# 生成随机路径
def get_randomwalk(node, path_length):
    random_walk = [node]
    for i in range(path_length-1):
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))    
        if len(temp) == 0:
            break

        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
        
    return random_walk

In [27]:
# get list of all nodes from the graph
nodes = list(G.nodes())

DeepWalk = []
for n in tqdm(nodes):
    for i in range(N_walk):
        DeepWalk.append(get_randomwalk(n,L_max))

# count of sequences
len(DeepWalk)

100%|██████████| 2405/2405 [00:09<00:00, 254.49it/s]


24050

In [28]:
print('Starting training .. ')
model = Word2Vec(DeepWalk, size=128, window=10, min_count=0, workers=-1)
print('Finished training .. ')
model.wv.save_word2vec_format('Vectorfile/DeepWalk/'+data_name+'.embeddings')

Starting training .. 
Finished training .. 


# Metapath2Vec

In [33]:
# 获取Metapath路径
def get_metapath(node,path_length,meta):
    meta = meta[:-1]
    metapath = [node]
    for i in range(path_length-1):
        current_type = meta[(i+1)%len(meta)]
        neighbors = list(G.neighbors(node))
        temp = [n for n in neighbors if n[0] ==current_type]
#         temp = list(set(temp) - set(metapath))  
        if len(temp) == 0:
            break
        random_node = random.choice(temp)
        metapath.append(random_node)
        node = random_node
        
    return metapath

In [35]:
# get list of all nodes from the graph
nodes = list(G.nodes())
meta1 = ['amdma','amcma']
meta2 = ['ucpcu','pctcp']
Metapath = []
for n in tqdm([n for n in nodes if n[0] in meta[:][0]]):
    for i in range(N_walk):
        for m in meta:
            Metapath.append(get_metapath(n,L_max,m))
# count of sequences
len(Metapath)

100%|██████████| 204/204 [00:00<00:00, 662.55it/s]


18360

In [65]:
# def Save_list(list1,filename):
#     file2 = open(filename + '.txt', 'w')
#     for i in range(len(list1)):
#         for j in range(len(list1[i])):
#             file2.write(str(list1[i][j]))              
#             file2.write(' ')                         
#         file2.write('\n')                              # 写完一行立马换行
#     file2.close()

# Save_list(Metapath,'Metapath')

In [275]:
print('Starting training .. ')
model = Word2Vec(Metapath, size=128, window=10, min_count=0, workers=4)
print('Finished training .. ')
model.wv.save_word2vec_format('Vectorfile/Metapath2vec/'+data_name+'.embeddings')

Starting training .. 
Finished training .. 
