In [23]:
import pathlib
import os
import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
# import utils.preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stopwords
from nltk import word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import stellargraph as sg 
from stellargraph import StellarGraph
from stellargraph.data import BiasedRandomWalk
import multiprocessing
from joblib import Parallel,delayed
from gensim.models import Word2Vec
import warnings
import collections
import os


[nltk_data] Downloading package wordnet to /home/ddatta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ddatta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
# Directory holding the pre-processed DBLP node and edge tables.
src_dir = './processed_data/DBLP/'

# One table per node type; the first CSV column is used as the index.
nodes_author_df = pd.read_csv(
    os.path.join(src_dir, 'nodes_author.csv'),
    index_col = 0
)
nodes_paper_df = pd.read_csv(
    os.path.join(src_dir, 'nodes_paper.csv'),
    index_col = 0
)
nodes_term_df = pd.read_csv(
    os.path.join(src_dir, 'nodes_term.csv'),
    index_col = 0
)
nodes_conf_df = pd.read_csv(
    os.path.join(src_dir, 'nodes_conf.csv'),
    index_col = 0
)


# Edge lists are stored per relation file and stacked into a single table.
fpath_list = ['PT_edges.csv','PC_edges.csv','PA_edges.csv']
edge_frames = []
for fpath in fpath_list:
    _df = pd.read_csv( os.path.join(src_dir,fpath), index_col = None )
    edge_frames.append(_df)

# A single concat replaces the repeated DataFrame.append calls: append was
# removed in pandas 2.0 and re-copied the accumulated frame on every iteration.
df_edges = pd.concat(edge_frames, ignore_index = True)



In [25]:

# Build the heterogeneous graph: the first argument maps each node-type label
# to its node table, the second is the combined edge table.
graph_obj = StellarGraph(
    dict(
        author=nodes_author_df,
        paper=nodes_paper_df,
        term=nodes_term_df,
        conf=nodes_conf_df,
    ),
    df_edges,
)

In [26]:
# Show the graph summary as a sanity check on what was loaded.
graph_summary = graph_obj.info()
print('Graph details :: ', graph_summary)


Graph details ::  StellarGraph: Undirected multigraph
 Nodes: 26128, Edges: 119783

 Node types:
  paper: [14328]
    Features: none
    Edge types: paper-default->author, paper-default->conf, paper-default->term
  term: [7723]
    Features: none
    Edge types: term-default->paper
  author: [4057]
    Features: none
    Edge types: author-default->paper
  conf: [20]
    Features: none
    Edge types: conf-default->paper

 Edge types:
    paper-default->term: [85810]
        Weights: all 1 (default)
    paper-default->author: [19645]
        Weights: all 1 (default)
    paper-default->conf: [14328]
        Weights: all 1 (default)


In [27]:
def generate_random_walks(graph_obj , num_walks_per_node, walk_length):
    """Generate biased random walks over ``graph_obj`` in parallel.

    One ``aux_gen_walks`` task is dispatched per requested walk-per-node; each
    task is called with its default ``num_walks=1`` and starts from
    ``graph_obj.nodes()``.

    Parameters
    ----------
    graph_obj : StellarGraph
        Graph to walk on.
    num_walks_per_node : int
        Number of parallel tasks to dispatch (one walk per start node each).
    walk_length : int
        Length passed through to every walk.

    Returns
    -------
    list
        All walks from all tasks, concatenated in task order.
    """
    random_walk_object = BiasedRandomWalk(graph_obj)
    # Size the pool from the machine instead of a hard-coded 8, never spawn
    # more workers than there are tasks, and keep n_jobs >= 1 (joblib rejects 0).
    n_jobs = max(1, min(multiprocessing.cpu_count(), num_walks_per_node))
    res = Parallel(n_jobs=n_jobs)(
        delayed(aux_gen_walks)(graph_obj, walk_length, random_walk_object)
        for _ in range(num_walks_per_node)
    )
    # Flatten the per-task lists of walks into one list.
    return [walk for task_walks in res for walk in task_walks]

def aux_gen_walks(
    graph_obj, 
    walk_length, 
    random_walk_object,
    num_walks = 1,
    p = 0.5, 
    q = 2,
    seed = None
):
    """Run one batch of unweighted biased random walks from every node.

    Parameters
    ----------
    graph_obj : object exposing ``nodes()``
        Graph whose nodes are used as walk start points.
    walk_length : int
        Forwarded to ``run`` as ``length``.
    random_walk_object : object exposing ``run(...)``
        Walker that performs the walks (a ``BiasedRandomWalk`` in this notebook).
    num_walks : int, default 1
        Forwarded to ``run`` as ``n``.
    p, q : float
        Forwarded unchanged to ``run`` as ``p`` and ``q``.
    seed : int or None, default None
        Seed forwarded to ``run``. When None, a fresh seed is drawn.

    Returns
    -------
    Whatever ``random_walk_object.run`` returns (the generated walks).
    """
    if seed is None:
        # Drawing from only 100 values made repeated calls likely to share a
        # seed and so repeat the same walks; use a wide range instead.
        # int(...) keeps the seed a plain Python int.
        seed = int(np.random.randint(0, 2**31 - 1))
    return random_walk_object.run(
        nodes=graph_obj.nodes(),
        length=walk_length,
        n=num_walks,
        p=p, 
        q=q,
        weighted=False,
        seed=seed
    )

# ================================================================= # 
print(" ========================== ")

walk_length = 64
num_walks_per_node = 15
# Embedding size used by the Word2Vec cell. This was a bare name with no
# assignment, which raises NameError on a fresh kernel.
# NOTE(review): 128 is a placeholder value; confirm the intended dimension.
emb_dim = 128


# Walks are cached on disk, keyed by the two walk parameters.
walks_save_file = "random_walks_{}_{}.npy".format(walk_length, num_walks_per_node)
try:
    walks_np_arr = np.load( walks_save_file )
except (OSError, ValueError, EOFError):
    # Cache missing or unreadable: regenerate and store. Catching only
    # load-related errors keeps KeyboardInterrupt and genuine bugs from
    # silently triggering the expensive regeneration.
    walks = generate_random_walks(graph_obj, num_walks_per_node, walk_length)
    walks_np_arr = np.array(walks)
    np.save( walks_save_file, walks_np_arr)
else:
    walks = [list(walk) for walk in walks_np_arr]


print("Number of random walks: {}".format(len(walks)))


Number of random walks: 209024


In [31]:
# Word2Vec consumes sequences of string tokens, so node ids are stringified.
str_walks = [[str(n) for n in walk] for walk in walks]
# `workers` must be a positive thread count. gensim does not treat -1 as
# "all cores": no worker threads start, so the vectors are never trained.
n2v_model = Word2Vec(
    str_walks,
    size=emb_dim,
    window=5,
    min_count=0,
    sg=1,
    workers=multiprocessing.cpu_count(),
    iter=1,
)