In [1]:
import dgl
import torch as th
import sys
import os
sys.path.append('./..')
import pandas as pd
import numpy as np
from pandarallel import pandarallel
pandarallel.initialize()
from dgl.data.utils import save_graphs
import pickle
from dgl.data.utils import load_graphs

Using backend: pytorch


INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
from torch import FloatTensor as FT
from torch import LongTensor as LT


In [3]:
def read_DBLP_data(loc='./../DBLP', fname_e='dblp_edges.csv', fname_n='dblp_nodes.csv'):
    """Read the DBLP node/edge CSV files and build heterograph edge lists.

    The edge file must provide the columns ``n1``, ``n2`` and ``etype``; the
    node file must provide ``ID``, ``type`` and ``synID``.  Every raw node ID
    found in ``n1``/``n2`` is replaced by the ``synID`` of the first node row
    whose ``ID`` equals it and whose ``type`` equals the first character of
    that ID.

    Parameters
    ----------
    loc : str
        Directory that contains both CSV files.
    fname_e : str
        File name of the edge list.
    fname_n : str
        File name of the node table.

    Returns
    -------
    dict
        Maps ``(src_type, etype, dst_type)`` to a list of ``(src, dst)``
        synthetic-id pairs.  For every edge type ``et`` two entries are
        written: ``(et[0], et, et[1])`` and the reversed relation
        ``(et[1], et[::-1], et[0])`` with each pair swapped.
        NOTE(review): for a palindromic edge type (e.g. 'PP') both keys are
        identical, so the reversed list replaces the forward one.

    Raises
    ------
    KeyError
        If an edge endpoint has no matching row in the node table.
    """
    df_e = pd.read_csv(os.path.join(loc, fname_e), low_memory=False)
    df_n = pd.read_csv(os.path.join(loc, fname_n), low_memory=False)

    # ----------------------------------------
    # replace the node id by synthetic id
    # ----------------------------------------
    # Build the ID -> synID table once instead of filtering the node frame
    # for every single edge endpoint.  Only rows whose `type` agrees with the
    # first character of `ID` are eligible, and the first such row wins.
    type_matches_id = df_n['type'] == df_n['ID'].str[0]
    df_valid = df_n.loc[type_matches_id].drop_duplicates(subset='ID', keep='first')
    id2syn = dict(zip(df_valid['ID'].tolist(), df_valid['synID'].tolist()))

    def to_syn_ids(raw_ids):
        # Translate raw node IDs, failing loudly on an unknown ID.
        try:
            return [id2syn[raw] for raw in raw_ids]
        except KeyError as err:
            raise KeyError(
                'Node ID {!r} has no matching row in {}'.format(err.args[0], fname_n)
            ) from err

    graph_data = {}
    edge_types = set(df_e['etype'])
    print('Types of edges', edge_types)

    # Sorted iteration keeps the key order of the result reproducible.
    for et in sorted(edge_types):
        et_R = et[::-1]
        df_et = df_e.loc[df_e['etype'] == et]
        src = to_syn_ids(df_et['n1'].tolist())
        dst = to_syn_ids(df_et['n2'].tolist())
        graph_data[(et[0], et, et[1])] = list(zip(src, dst))
        graph_data[(et[1], et_R, et[0])] = list(zip(dst, src))
    return graph_data

In [4]:
# Build the typed edge lists from the DBLP CSV files and wrap them in a
# DGL heterograph; `g` is what the next cell saves to disk.
graph_data = read_DBLP_data()
g = dgl.heterograph(graph_data)
# Print the node/edge type names and the graph summary as a sanity check.
print('Node types, edge types', g.ntypes, g.etypes)
print('Graph ::', g)

Types of edges {'PA', 'PC', 'PT'}
Node types, edge types ['A', 'C', 'P', 'T'] ['AP', 'CP', 'PA', 'PC', 'PT', 'TP']
Graph :: Graph(num_nodes={'A': 4057, 'C': 20, 'P': 14328, 'T': 3590},
      num_edges={('A', 'AP', 'P'): 19645, ('C', 'CP', 'P'): 14328, ('P', 'PA', 'A'): 19645, ('P', 'PC', 'C'): 14328, ('P', 'PT', 'T'): 81823, ('T', 'TP', 'P'): 81823},
      metagraph=[('A', 'P', 'AP'), ('P', 'A', 'PA'), ('P', 'C', 'PC'), ('P', 'T', 'PT'), ('C', 'P', 'CP'), ('T', 'P', 'TP')])


In [5]:
# Path of the serialized graph; the load cell below reads it back.
SAVE_FILE = "./dblp_graph_obj.dgl"
# Write the heterograph `g` to SAVE_FILE with dgl.data.utils.save_graphs.
save_graphs(SAVE_FILE, g)

In [6]:
# Reload what was just saved.  load_graphs returns a pair; the second item is
# discarded here.  NOTE(review): this rebinds `g` from a single graph to the
# indexable container that the next cell reads with g[0].
g,_ = load_graphs(SAVE_FILE)

In [7]:
# Take the first loaded graph; `graph_obj` is the graph used by all later cells.
graph_obj = g[0]

In [8]:
# Confirm that the reloaded graph reports the same types and sizes as the
# graph printed before saving.
print('Node types, edge types', graph_obj.ntypes, graph_obj.etypes)
print('Graph ::', graph_obj)

Node types, edge types ['A', 'C', 'P', 'T'] ['AP', 'CP', 'PA', 'PC', 'PT', 'TP']
Graph :: Graph(num_nodes={'A': 4057, 'C': 20, 'P': 14328, 'T': 3590},
      num_edges={('A', 'AP', 'P'): 19645, ('C', 'CP', 'P'): 14328, ('P', 'PA', 'A'): 19645, ('P', 'PC', 'C'): 14328, ('P', 'PT', 'T'): 81823, ('T', 'TP', 'P'): 81823},
      metagraph=[('A', 'P', 'AP'), ('P', 'A', 'PA'), ('P', 'C', 'PC'), ('P', 'T', 'PT'), ('C', 'P', 'CP'), ('T', 'P', 'TP')])


In [70]:
# Metapath (sequence of edge types) to walk from each start node type.
# Edge type 'XY' connects node type X to node type Y, so every path below
# returns to its start type and can be repeated back-to-back by get_RW_list.
# NOTE(review): there is no entry for node type 'A' and no path uses 'PA'/'AP',
# so author nodes never occur in the walks — confirm this is intended.
metapaths = {
    'C' : ['CP', 'PC' ],
    'T' : ['TP', 'PT' ],
    'P' : ['PT', 'TP', 'PC' , 'CP']
}



In [71]:
def get_RW_list(graph_obj, metapaths, prefix=True, num_repeats=10, verbose=True):
    """Sample metapath-guided random walks from every node of each start type.

    Parameters
    ----------
    graph_obj : DGL heterograph
        Graph to walk on; must expose ``ntypes`` and ``nodes(ntype)``.
    metapaths : dict
        Maps a start node type to a list of edge types.  The list is repeated
        ``num_repeats`` times, so it has to end on the type it starts from.
    prefix : bool
        If True every node id is turned into a string token prefixed with the
        name of its node type (e.g. ``'P12'``) so that ids of different node
        types do not collide.  If False the raw integer ids are returned.
    num_repeats : int
        How many times each metapath is concatenated with itself.
    verbose : bool
        Print the node-type id pattern (and its names) of every metapath.

    Returns
    -------
    list of list
        One walk per start node, for all start types, in the iteration order
        of ``metapaths``.
        NOTE(review): walks may contain negative ids (get_node_vectors skips
        ids < 0); presumably DGL pads terminated walks with -1 — confirm.
    """
    # Node-type id -> node-type name, following the order of graph_obj.ntypes.
    node_typeID2typename = dict(enumerate(graph_obj.ntypes))

    RW_list = []
    for ntype, metapath in metapaths.items():
        # random_walk returns (node-id traces, node-type id of each position).
        traces, ntype_ids = dgl.sampling.random_walk(
            graph_obj,
            metapath=metapath * num_repeats,
            nodes=graph_obj.nodes(ntype),
        )
        if verbose:
            print(ntype_ids)
        _random_walks = traces.data.numpy()
        if prefix:
            pattern = [node_typeID2typename[_] for _ in ntype_ids.data.numpy().tolist()]
            if verbose:
                print(pattern)
            # Broadcast the per-position type names over all walks (rows) and
            # concatenate them with the stringified node ids.
            _random_walks = np.char.add(np.array(pattern), _random_walks.astype(str))

        RW_list.extend(_random_walks.tolist())

    return RW_list

In [72]:
# Sample the metapath random walks that serve as the Word2Vec training corpus.
random_walks = get_RW_list(graph_obj, metapaths)

tensor([1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1])
['C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C', 'P', 'C']
tensor([3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3])
['T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T', 'P', 'T']
tensor([2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1,
        2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2, 3, 2, 1, 2])
['P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P', 'T', 'P', 'C', 'P']


True

In [13]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import multiprocessing as mp

# Number of CPUs reported by multiprocessing (node2vec recomputes this locally).
cpu_count = mp.cpu_count()

In [14]:
class loss_callback(CallbackAny2Vec):
    """Training callback that prints the loss contributed by each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total: the total seen at the end of the previous epoch is kept in
    ``loss_to_be_subed`` and subtracted to obtain the per-epoch figure.
    """

    def __init__(self):
        # epoch: index of the epoch about to be reported.
        # loss_to_be_subed: running total recorded at the previous epoch end.
        self.epoch, self.loss_to_be_subed = 0, 0

    def on_epoch_end(self, model):
        """Print the loss accumulated since the previous call and advance the counter."""
        running_total = model.get_latest_training_loss()
        epoch_loss = running_total - self.loss_to_be_subed
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.loss_to_be_subed = running_total
        self.epoch = self.epoch + 1


In [15]:
def node2vec(random_walks, epochs = 100):
    """Train a Word2Vec model on the random-walk corpus.

    Parameters
    ----------
    random_walks : iterable of list of str
        Walks as produced by ``get_RW_list(..., prefix=True)``; each walk is
        one "sentence" and each prefixed node id is one token.
    epochs : int
        Number of passes over the corpus (passed as ``iter``).

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.  The per-epoch loss is printed by ``loss_callback``.

    Notes
    -----
    The keyword names ``size`` and ``iter`` are kept as in the original call.
    NOTE(review): these look like the gensim 3.x spellings — confirm against
    the installed gensim version before upgrading.
    """
    # Use every available core; previously this value was computed but never
    # passed on, so training ran with gensim's default worker count.
    cpu_count = mp.cpu_count()
    model = Word2Vec(
        random_walks,
        size=128,
        window=3,
        negative=10,
        sg=1,
        min_count=1,  # keep every node, even ones that occur in a single walk
        iter=epochs,
        workers=cpu_count,
        compute_loss=True,  # required for loss_callback to read the loss
        callbacks=[loss_callback()],
    )

    return model

In [16]:
# Train the embedding model on the sampled walks (default: 100 epochs).
model = node2vec(random_walks)

Loss after epoch 0: 2959041.25
Loss after epoch 1: 1616846.75
Loss after epoch 2: 1539773.5
Loss after epoch 3: 1494218.5
Loss after epoch 4: 1329412.0
Loss after epoch 5: 1068293.0
Loss after epoch 6: 1019163.0
Loss after epoch 7: 1126825.0
Loss after epoch 8: 1063675.0
Loss after epoch 9: 925662.0
Loss after epoch 10: 1026505.0
Loss after epoch 11: 965177.0
Loss after epoch 12: 907416.0
Loss after epoch 13: 795062.0
Loss after epoch 14: 781654.0
Loss after epoch 15: 731722.0
Loss after epoch 16: 750374.0
Loss after epoch 17: 710116.0
Loss after epoch 18: 652882.0
Loss after epoch 19: 711134.0
Loss after epoch 20: 694964.0
Loss after epoch 21: 627070.0
Loss after epoch 22: 689686.0
Loss after epoch 23: 614478.0
Loss after epoch 24: 646726.0
Loss after epoch 25: 669510.0
Loss after epoch 26: 611840.0
Loss after epoch 27: 662960.0
Loss after epoch 28: 653640.0
Loss after epoch 29: 650250.0
Loss after epoch 30: 567358.0
Loss after epoch 31: 658266.0
Loss after epoch 32: 578638.0
Loss aft

In [17]:
# Persist the trained model to the working directory.
model.save("node2vec.model")

In [75]:
def get_node_vectors(w2v_model):
    """Group the learned vectors by node type and integer node id.

    Every vocabulary token is read as ``<type character><integer id>``.
    Tokens whose id is negative are ignored.

    Returns
    -------
    dict
        ``{node_type: {node_id: vector}}`` where ``vector`` is whatever
        ``w2v_model.wv[token]`` yields for the token.
    """
    vectors_dict = {}
    for token, _ in w2v_model.wv.vocab.items():
        node_type, node_id = token[0], int(token[1:])
        if node_id >= 0:
            # Create the per-type table on first use, then store the vector.
            vectors_dict.setdefault(node_type, {})[node_id] = w2v_model.wv[token]
    return vectors_dict

In [78]:
# Regroup the trained vectors as {node type: {node id: vector}}.
vectors_dict =  get_node_vectors(model)

In [79]:
# Sanity check: number of 'C' nodes that received a vector (the graph summary
# above reports 20 'C' nodes).
len(vectors_dict['C'])

20

In [80]:
# Save the node2vec vectors
with open("node2vec_vectors_dblp.pkl",'wb') as fh:
    pickle.dump(vectors_dict, fh, pickle.HIGHEST_PROTOCOL)

In [81]:
# Display the graph summary again before attaching node features.
graph_obj

Graph(num_nodes={'A': 4057, 'C': 20, 'P': 14328, 'T': 3590},
      num_edges={('A', 'AP', 'P'): 19645, ('C', 'CP', 'P'): 14328, ('P', 'PA', 'A'): 19645, ('P', 'PC', 'C'): 14328, ('P', 'PT', 'T'): 81823, ('T', 'TP', 'P'): 81823},
      metagraph=[('A', 'P', 'AP'), ('P', 'A', 'PA'), ('P', 'C', 'PC'), ('P', 'T', 'PT'), ('C', 'P', 'CP'), ('T', 'P', 'TP')])

In [82]:
# Node ids of type 'P'; the displayed tensor runs from 0 to 14327.
graph_obj.nodes('P')

tensor([    0,     1,     2,  ..., 14325, 14326, 14327])

In [88]:
# ---------------------------------
# Assign the vectors to nodes 
# ---------------------------------
# Only node types present in vectors_dict receive features; the printed shapes
# show three types, so 'A' nodes are left without 'mp2v'/'n2v'.
for _type,vec_dict in vectors_dict.items():
    # Sort by keys
    # (row i of `arr` is then the vector of the i-th smallest node id)
    vec_dict = {k: vec_dict[k] for k in sorted(vec_dict)}
    arr = np.array(list(vec_dict.values()))
    print(arr.shape)
    # NOTE(review): row i lines up with node i only if the ids are exactly
    # 0..N-1 for this node type — confirm every node has a vector.
    graph_obj.nodes[_type].data['mp2v'] = FT(arr)
    # 'n2v' is filled with a 1-D tensor of zeros, one value per node.
    graph_obj.nodes[_type].data['n2v'] = FT(np.zeros(len(vec_dict)))  

(20, 128)
(14328, 128)
(3590, 128)


In [89]:
# Inspect the features now stored on the 'P' nodes ('n2v' and 'mp2v').
graph_obj.nodes['P']

NodeSpace(data={'n2v': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'mp2v': tensor([[-0.2676,  0.0374,  0.7205,  ..., -0.0482,  0.5645, -0.7812],
        [-1.2595,  0.8070,  0.5722,  ..., -0.6487, -0.1616, -0.9677],
        [-0.7353,  0.8474,  0.1744,  ...,  0.0428, -0.2648, -0.4524],
        ...,
        [-0.6150,  0.7447,  0.0518,  ..., -0.0323,  0.8357, -0.2118],
        [ 0.1136,  1.6048,  0.6605,  ..., -0.7495, -0.6699,  0.0742],
        [-1.1061, -0.3933, -0.3096,  ..., -1.1272,  0.2970,  0.7285]])})