In [289]:
import dgl
import torch as th
import sys
import os
sys.path.append('./..')
import pandas as pd
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize()
from dgl.data.utils import save_graphs
import pickle
from dgl.data.utils import load_graphs

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [145]:
def read_DBLP_data(loc='./../DBLP', fname_e='dblp_edges.csv', fname_n='dblp_nodes.csv'):
    """Load the DBLP edge/node CSVs and build per-relation edge lists.

    The node file must provide the columns ``type``, ``ID`` and ``synID``;
    the edge file must provide ``n1``, ``n2`` and ``etype``. Every edge
    endpoint is translated from its original ``ID`` to the ``synID`` of the
    node row whose ``type`` equals the first character of the ID and whose
    ``ID`` equals the full value. Each edge type string ``et`` is read as
    ``<source type><destination type>`` (``et[0]`` / ``et[1]``); a reversed
    relation named ``et[::-1]`` is emitted alongside it.

    Parameters
    ----------
    loc : str
        Directory containing both CSV files.
    fname_e : str
        File name of the edge list CSV.
    fname_n : str
        File name of the node table CSV.

    Returns
    -------
    dict
        Maps ``(src_type, edge_type, dst_type)`` to a list of
        ``(src_synID, dst_synID)`` tuples, with one forward and one reversed
        entry per edge type found in the edge file.

    Raises
    ------
    KeyError
        If an edge endpoint has no matching row in the node table.
    """
    df_e = pd.read_csv(os.path.join(loc, fname_e), low_memory=False)
    df_n = pd.read_csv(os.path.join(loc, fname_n), low_memory=False)

    # ----------------------------------------
    # replace the node id by synthetic id
    # ----------------------------------------
    # Build the (type, ID) -> synID table once instead of filtering the whole
    # node frame for every edge endpoint. keep='first' mirrors taking the
    # first matching row when an ID is duplicated.
    first_rows = df_n.drop_duplicates(subset=['type', 'ID'], keep='first')
    syn_lookup = dict(
        zip(zip(first_rows['type'], first_rows['ID']), first_rows['synID'].tolist())
    )

    def replace_id(val):
        # The node type is encoded as the first character of the raw ID.
        try:
            return syn_lookup[(val[0], val)]
        except KeyError:
            raise KeyError(
                'No synID found in {} for node {!r}'.format(fname_n, val)
            ) from None

    df_e = df_e.assign(
        n1=df_e['n1'].map(replace_id),
        n2=df_e['n2'].map(replace_id),
    )

    edge_types = set(df_e['etype'])
    print('Types of edges', edge_types)

    graph_data = {}
    # Sorted so the insertion order of graph_data does not depend on set
    # iteration order.
    for et in sorted(edge_types):
        src_type, dst_type = et[0], et[1]
        subset = df_e.loc[df_e['etype'] == et]
        src_ids = subset['n1'].tolist()
        dst_ids = subset['n2'].tolist()
        graph_data[(src_type, et, dst_type)] = list(zip(src_ids, dst_ids))
        # Reverse relation so walks can traverse the edge in both directions.
        graph_data[(dst_type, et[::-1], src_type)] = list(zip(dst_ids, src_ids))
    return graph_data

In [146]:
# Build the per-relation edge lists from the DBLP CSVs and assemble them
# into a DGL heterograph.
graph_data = read_DBLP_data()
g = dgl.heterograph(graph_data)
# Show the node/edge type names, then the full graph summary.
print('Node types, edge types', g.ntypes, g.etypes)
print('Graph ::', g)

{'PA', 'PC', 'PT'}
Node types, edge types ['A', 'C', 'P', 'T'] ['AP', 'CP', 'PA', 'PC', 'PT', 'TP']
Graph :: Graph(num_nodes={'A': 4057, 'C': 20, 'P': 14328, 'T': 3590},
      num_edges={('A', 'AP', 'P'): 19645, ('C', 'CP', 'P'): 14328, ('P', 'PA', 'A'): 19645, ('P', 'PC', 'C'): 14328, ('P', 'PT', 'T'): 81823, ('T', 'TP', 'P'): 81823},
      metagraph=[('A', 'P', 'AP'), ('P', 'A', 'PA'), ('P', 'C', 'PC'), ('P', 'T', 'PT'), ('C', 'P', 'CP'), ('T', 'P', 'TP')])


In [147]:
# Persist the heterograph so later cells can reload it from SAVE_FILE.
SAVE_FILE = "./dblp_graph_obj.dgl"
save_graphs(SAVE_FILE, g)

In [148]:
# Reload from disk. NOTE: this rebinds `g` to the first value returned by
# load_graphs, which the next cell indexes as g[0]; the second returned value
# is discarded.
g,_ = load_graphs(SAVE_FILE)

In [149]:
# Take the first entry of what was reloaded from SAVE_FILE; all later cells
# use graph_obj.
graph_obj = g[0]

In [150]:
# Print the reloaded graph's type names and summary (presumably to compare
# against the graph that was saved).
print('Node types, edge types', graph_obj.ntypes, graph_obj.etypes)
print('Graph ::', graph_obj)

Node types, edge types ['A', 'C', 'P', 'T'] ['AP', 'CP', 'PA', 'PC', 'PT', 'TP']
Graph :: Graph(num_nodes={'A': 4057, 'C': 20, 'P': 14328, 'T': 3590},
      num_edges={('A', 'AP', 'P'): 19645, ('C', 'CP', 'P'): 14328, ('P', 'PA', 'A'): 19645, ('P', 'PC', 'C'): 14328, ('P', 'PT', 'T'): 81823, ('T', 'TP', 'P'): 81823},
      metagraph=[('A', 'P', 'AP'), ('P', 'A', 'PA'), ('P', 'C', 'PC'), ('P', 'T', 'PT'), ('C', 'P', 'CP'), ('T', 'P', 'TP')])


In [179]:
# Map each integer node-type id (its position in graph_obj.ntypes) to the
# node-type name. Defined here so this cell runs on a fresh kernel.
node_typeID2typename = dict(enumerate(graph_obj.ntypes))
node_typeID2typename

{0: 'A', 1: 'C', 2: 'P', 3: 'T'}

In [155]:
# Metapath to walk for each start node type: the key is the node type whose
# nodes seed the walks, the value is the sequence of edge types to follow.
metapaths = dict(
    C=['CP', 'PC'],
    T=['TP', 'PT'],
)



In [226]:
def get_RW_list(graph_obj, metapaths, prefix=True, num_repeats=10):
    """Sample metapath-guided random walks from every node of the given types.

    Parameters
    ----------
    graph_obj : DGL heterograph
        Graph to walk on; must expose ``ntypes`` and ``nodes(ntype)``.
    metapaths : dict
        Maps a start node type to the list of edge types forming one round
        trip of the metapath.
    prefix : bool
        If True, each node id in a walk is emitted as the string
        ``<node type name><id>`` (e.g. ``'P12'``). If False, raw integer ids
        are returned.
    num_repeats : int
        Number of times each metapath is repeated back to back in one walk.

    Returns
    -------
    list of list
        One walk per start node, concatenated over all start node types.
        DGL pads walks that stop early with ``-1``; that padding is removed,
        so walks may differ in length.
    """
    # Position in graph_obj.ntypes is the integer node-type id DGL reports.
    type_names = graph_obj.ntypes

    RW_list = []
    for ntype, mp in metapaths.items():
        RW_mp = dgl.sampling.random_walk(
            graph_obj,
            metapath=list(mp) * num_repeats,
            nodes=graph_obj.nodes(ntype),
        )
        # RW_mp[0]: node ids per walk; RW_mp[1]: node-type id per walk position.
        print(RW_mp[1])
        traces = RW_mp[0].cpu().numpy().tolist()

        if prefix:
            position_types = [
                type_names[type_id] for type_id in RW_mp[1].cpu().numpy().tolist()
            ]
            for walk in traces:
                # Skip -1 padding so no bogus tokens such as 'C-1' are created.
                RW_list.append([
                    type_name + str(node_id)
                    for type_name, node_id in zip(position_types, walk)
                    if node_id != -1
                ])
        else:
            for walk in traces:
                RW_list.append([node_id for node_id in walk if node_id != -1])

    return RW_list

In [227]:
# Sample the walks for every start node type listed in `metapaths`; the
# result is passed to node2vec() below as its training input.
random_walks =  get_RW_list(graph_obj, metapaths)

tensor([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1])
tensor([3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3])


In [215]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import multiprocessing as mp

# CPU count reported by multiprocessing. NOTE: node2vec() below computes its
# own local cpu_count rather than reading this module-level value.
cpu_count = mp.cpu_count()

In [247]:
class loss_callback(CallbackAny2Vec):
    """Print the loss accumulated during each epoch.

    The value read from the model at each epoch end is treated as a running
    total, so the previous reading is subtracted to get the per-epoch figure.
    """

    def __init__(self):
        # epoch: index of the next epoch to report.
        # loss_to_be_subed: reading taken at the end of the previous epoch.
        self.epoch, self.loss_to_be_subed = 0, 0

    def on_epoch_end(self, model):
        """Print this epoch's loss and remember the reading for the next one."""
        cumulative = model.get_latest_training_loss()
        epoch_loss = cumulative - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_to_be_subed = cumulative
        self.epoch += 1


In [250]:
def node2vec(random_walks, size=128, window=3, negative=10, epochs=250, workers=None):
    """Train a skip-gram Word2Vec model on random walks.

    Parameters
    ----------
    random_walks : iterable of list
        Walks used as the training sentences; each token is one node.
    size : int
        Dimensionality of the learned vectors.
    window : int
        Context window size.
    negative : int
        Number of negative samples.
    epochs : int
        Number of training passes over the walks.
    workers : int or None
        Number of training threads; ``None`` uses every CPU reported by
        ``multiprocessing.cpu_count()``.

    Returns
    -------
    The trained ``Word2Vec`` model. The per-epoch loss is printed through
    ``loss_callback``.
    """
    # Previously the CPU count was computed but never handed to Word2Vec.
    if workers is None:
        workers = mp.cpu_count()

    # NOTE: `size` and `iter` are the gensim 3.x keyword names, kept because
    # the rest of this notebook uses the 3.x API (`wv.vocab`). gensim 4
    # renamed them to `vector_size` and `epochs`.
    model = Word2Vec(
        random_walks,
        size=size,
        window=window,
        negative=negative,
        sg=1,
        min_count=1,
        iter=epochs,
        workers=workers,
        compute_loss=True,
        callbacks=[loss_callback()],
    )

    return model

In [253]:
# Train the embedding model on the sampled walks; the per-epoch loss is
# printed below.
model = node2vec(random_walks)

Loss after epoch 0: 825818.625
Loss after epoch 1: 591851.875
Loss after epoch 2: 432151.875
Loss after epoch 3: 377465.125
Loss after epoch 4: 325690.5
Loss after epoch 5: 306244.25
Loss after epoch 6: 293204.25
Loss after epoch 7: 279971.0
Loss after epoch 8: 263887.25
Loss after epoch 9: 246207.25
Loss after epoch 10: 228091.5
Loss after epoch 11: 187738.0
Loss after epoch 12: 170986.5
Loss after epoch 13: 157336.0
Loss after epoch 14: 144931.5
Loss after epoch 15: 134555.0
Loss after epoch 16: 124860.5
Loss after epoch 17: 115810.5
Loss after epoch 18: 108662.5
Loss after epoch 19: 101135.0
Loss after epoch 20: 95610.0
Loss after epoch 21: 91162.5
Loss after epoch 22: 84446.0
Loss after epoch 23: 79951.0
Loss after epoch 24: 77138.5
Loss after epoch 25: 73615.5
Loss after epoch 26: 71936.0
Loss after epoch 27: 68350.5
Loss after epoch 28: 66689.5
Loss after epoch 29: 65687.5
Loss after epoch 30: 61697.5
Loss after epoch 31: 60767.0
Loss after epoch 32: 58664.0
Loss after epoch 33: 

In [286]:
def get_node_vectors(w2v_model):
    """Group the learned vectors by node type and integer node id.

    Each vocabulary token is split into its first character (the node type)
    and the remainder, which is parsed as an integer id.

    Returns
    -------
    dict
        ``{node_type: {node_id: vector}}`` for every token in the vocabulary.
    """
    keyed_vectors = w2v_model.wv
    grouped = {}
    for token, _ in keyed_vectors.vocab.items():
        node_type, node_id = token[0], int(token[1:])
        # Create the per-type bucket the first time a type is seen.
        grouped.setdefault(node_type, {})[node_id] = keyed_vectors[token]
    return grouped

In [287]:
# Regroup the trained vectors as {node type: {node id: vector}}.
vectors_dict = get_node_vectors(model)

In [290]:
# Save the node2vec vectors: pickle the {node type: {node id: vector}} dict
# to the current working directory using the highest pickle protocol.
with open("node2vec_vectors_dblp.pkl",'wb') as fh:
    pickle.dump(vectors_dict, fh, pickle.HIGHEST_PROTOCOL)