In [1]:
## Packages 
import csv
import pandas as pd
import numpy as np
import os
import random
from tqdm import tqdm
import pickle

# For graphs
import networkx as nx 
import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, HinSAGE, link_classification
from stellargraph import globalvar

# For deep learning
from tensorflow import keras 
import tensorflow as tf

'''
config = tf.ConfigProto(intra_op_parallelism_threads=8, inter_op_parallelism_threads=2, allow_soft_placement=True, device_count = {'CPU': 8})

session = tf.Session(config=config)

os.environ["OMP_NUM_THREADS"] = "8"

os.environ["KMP_BLOCKTIME"] = "30"

os.environ["KMP_SETTINGS"] = "1"

os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"
'''

# For processing node texts
from sklearn.feature_extraction import text as fe

# Dimensionality reduction
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Word embeddings
import gensim 
from gensim.models import Word2Vec

# For stemming
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize

In [2]:
# Disabled environment diagnostics: the whole cell is one string literal, so
# none of the commands inside it are executed; the cell just evaluates to the text.
'''
tf.test.gpu_device_name()
#!cat /proc/meminfo
!cat /proc/cpuinfo
'''

'\ntf.test.gpu_device_name()\n#!cat /proc/meminfo\n!cat /proc/cpuinfo\n'

In [3]:
# Locations of the pickled inputs (relative to the notebook's working directory).
small_matrix_path = r"pickles/stemmed_lda_64_matrix.PICKLE"
corpus_path = r"pickles/stemmed_corpus.PICKLE"

# Fail fast with a clear message: continuing without the matrix would only
# surface later as a NameError on `lda_matrix`.
if not os.path.exists(small_matrix_path):
    raise FileNotFoundError(
        "LDA matrix pickle not found: {}".format(small_matrix_path))

# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open(small_matrix_path, 'rb') as f:
    lda_matrix = pickle.load(f)

In [4]:
# Wrap the loaded matrix in a DataFrame: rows are indexed by node id,
# columns are named w_0 .. w_{n2-1}.
n1, n2 = lda_matrix.shape
feature_names = ["w_" + str(col) for col in range(n2)]

# Integer ids 0..n1-1, ordered by their string form (0, 1, 10, 100, ...).
ids = list(range(n1))
ids.sort(key=str)

node_data = pd.DataFrame(data=lda_matrix, index=ids, columns=feature_names)
print(node_data.shape)

(33226, 64)


In [5]:
# Show the node-feature table as plain text (pandas truncates the middle
# rows and columns of a frame this large when printing).
print(node_data)

           w_0       w_1       w_2       w_3       w_4       w_5       w_6  \
0     0.000051  0.000051  0.000051  0.000051  0.000051  0.111335  0.000051   
1     0.000008  0.000008  0.000008  0.000008  0.000008  0.000008  0.000008   
10    0.000011  0.017704  0.000011  0.000011  0.000011  0.000011  0.000011   
100   0.155756  0.000953  0.000001  0.000001  0.000001  0.003889  0.000001   
1000  0.000049  0.000049  0.000049  0.000049  0.000049  0.016454  0.000049   
...        ...       ...       ...       ...       ...       ...       ...   
9995  0.000081  0.000081  0.000081  0.000081  0.000081  0.000081  0.000081   
9996  0.000009  0.000009  0.000009  0.000009  0.000009  0.000009  0.000009   
9997  0.000005  0.000005  0.000005  0.000005  0.000005  0.008869  0.000005   
9998  0.000009  0.000009  0.000009  0.000009  0.000009  0.014683  0.000009   
9999  0.001042  0.001042  0.001042  0.001042  0.001042  0.403546  0.001042   

           w_7       w_8       w_9  ...      w_54      w_55    

## Make Graphs Great Again

Load the labelled training pairs (`Node1 Node2 Link`) and the unlabelled test pairs (`Node1 Node2`).

In [6]:
def _read_space_separated(path, columns):
    """Load a whitespace-delimited text file into a DataFrame of strings.

    Parameters
    ----------
    path : str
        File to read; one example per line, fields separated by whitespace.
    columns : list of str
        Column names; every non-blank line must have exactly this many fields.

    Returns
    -------
    pandas.DataFrame
        One row per non-blank line, in file order. Values are kept as strings.
    """
    with open(path, "r") as f:
        # Blank lines are skipped; the earlier csv.reader + element[0]
        # approach raised IndexError on them.
        rows = [line.split() for line in f if line.strip()]
    return pd.DataFrame(rows, columns=columns)


# Rows stay in file order for both sets.
training = _read_space_separated(r"training.txt", ['Node1', 'Node2', 'Link'])
print("Training examples shape: {}".format(training.shape))

testing = _read_space_separated(r"testing.txt", ['Node1', 'Node2'])
print("Testing examples shape: {}".format(testing.shape))

Training examples shape: (453797, 3)
Testing examples shape: (113450, 2)


In [7]:
# Keep only the positive examples (Link == '1') and write their endpoints
# out as a space-separated edge list without header or index.
is_linked = training['Link'] == '1'
linked_nodes = training.loc[is_linked, ['Node1', 'Node2']]
linked_nodes.to_csv('linked_nodes.txt', sep=' ', index=False, header=False)

In [8]:
# Read the positive edges back and build the NetworkX graph.
edgelist = pd.read_csv("linked_nodes.txt", sep=' ', header=None, names=["source", "target"])
edgelist["label"] = "cites"  # set the edge type
G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
# Add every known node id so that nodes without any edge are still present.
G_all_nx.add_nodes_from(ids)
nx.set_node_attributes(G_all_nx, "paper", "label")  # set the node type

# Fixed seed so the sampled links are the same on every run; without it each
# execution of this cell yields a different train/test split.
split_seed = 42

# Define an edge splitter on the original graph:
edge_splitter_test = EdgeSplitter(G_all_nx)

# Randomly sample a fraction p=0.1 of all positive links,
# and the same number of negative links, and obtain the
# reduced graph G_test with the sampled links removed:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=split_seed)

# Define an edge splitter on the reduced graph G_test:
edge_splitter_train = EdgeSplitter(G_test)

# Randomly sample a fraction p=0.1 of the remaining positive links, and the
# same number of negative links, from G_test, and obtain the reduced graph
# G_train with the sampled links removed:
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=True, seed=split_seed)

# Wrap both reduced graphs as StellarGraph objects carrying the node features.
G_test = sg.StellarGraph(G_test, node_features=node_data[feature_names])
G_train = sg.StellarGraph(G_train, node_features=node_data[feature_names])

Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Removed 4000 edges
Removed 5000 edges
Removed 6000 edges
Removed 7000 edges
Removed 8000 edges
Removed 9000 edges
Removed 10000 edges
Removed 11000 edges
Removed 12000 edges
Removed 13000 edges
Removed 14000 edges
Removed 15000 edges
Removed 16000 edges
Removed 17000 edges
Removed 18000 edges
Removed 19000 edges
Removed 20000 edges
Removed 21000 edges
Removed 22000 edges
Removed 23000 edges
Removed 24000 edges
Removed 25000 edges
Removed 26000 edges
Removed 27000 edges
Removed 28000 edges
Sampled 1000 negative examples
Sampled 2000 negative examples
Sampled 3000 negative examples
Sampled 4000 negative examples
Sampled 5000 negative examples
Sampled 6000 negative examples
Sampled 7000 negative examples
Sampled 8000 negative examples
Sampled 9000 negative examples
Sampled 10000 negative examples
Sampled 11000 negative examples
Sampled 12000 negative examples
Sampled 13000 negative examples
Sampled 14000 negative examples
Sampled 15

In [9]:
# Print the summary of each reduced graph, test graph first.
for graph in (G_test, G_train):
    print(graph.info())

StellarGraph: Undirected multigraph
 Nodes: 33226, Edges: 255261

 Node types:
  paper: [33226]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [255261]

StellarGraph: Undirected multigraph
 Nodes: 33226, Edges: 229735

 Node types:
  paper: [33226]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [229735]



In [24]:
# Training hyperparameters.
batch_size = 20
epochs = 5

# One entry per GraphSAGE layer: neighbours sampled per hop and hidden size.
num_samples = [200, 200]
layer_sizes = [200, 200]
assert len(layer_sizes) == len(num_samples)

# Link generators over the two reduced graphs; only the training flow is shuffled.
train_link_generator = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_gen = train_link_generator.flow(
    edge_ids_train, edge_labels_train, shuffle=True)

test_link_generator = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
test_gen = test_link_generator.flow(edge_ids_test, edge_labels_test)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=train_gen,
    bias=True,
    dropout=0.2,
)

In [25]:
# Build the GraphSAGE network; build() exposes its input and output tensors
# so they can be wired into a link-prediction head.
x_inp, x_out = graphsage.build()

# Combine the two node embeddings with the 'ip' method into one score per link.
# NOTE(review): a "relu" output is not bounded to [0, 1] while the loss below is
# binary cross-entropy -- consider output_act="sigmoid"; confirm before changing,
# since it alters the trained model.
prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
        # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [26]:
# Print the layer-by-layer summary (output shapes, parameter counts) of the model.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_26 (InputLayer)           [(None, 200, 64)]    0                                            
__________________________________________________________________________________________________
input_27 (InputLayer)           [(None, 40000, 64)]  0                                            
__________________________________________________________________________________________________
input_29 (InputLayer)           [(None, 200, 64)]    0                                            
__________________________________________________________________________________________________
input_30 (InputLayer)           [(None, 40000, 64)]  0                                            
____________________________________________________________________________________________

In [None]:
# Disabled: baseline metrics of the untrained model on both generators.
#init_train_metrics = model.evaluate_generator(train_gen)
#init_test_metrics = model.evaluate_generator(test_gen)
#
#print("\nTrain Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_train_metrics):
#    print("\t{}: {:0.4f}".format(name, val))
#
#print("\nTest Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_test_metrics):
#    print("\t{}: {:0.4f}".format(name, val))

# Train on the training flow, validating on the test flow after every epoch;
# the returned History object is kept in `history`.
fit_options = dict(epochs=epochs, validation_data=test_gen, verbose=1)
history = model.fit_generator(train_gen, **fit_options)

Epoch 1/5


 193/2553 [=>............................] - ETA: 79:19:50 - loss: 0.7471 - acc: 0.450 - ETA: 41:19:28 - loss: 0.6982 - acc: 0.525 - ETA: 28:17:32 - loss: 0.6251 - acc: 0.616 - ETA: 21:42:57 - loss: 0.6265 - acc: 0.650 - ETA: 17:59:44 - loss: 0.6235 - acc: 0.660 - ETA: 15:50:48 - loss: 0.6271 - acc: 0.641 - ETA: 14:26:39 - loss: 0.6237 - acc: 0.642 - ETA: 13:34:12 - loss: 0.6298 - acc: 0.631 - ETA: 12:52:37 - loss: 0.6265 - acc: 0.644 - ETA: 12:22:07 - loss: 0.6320 - acc: 0.635 - ETA: 11:44:15 - loss: 0.6304 - acc: 0.636 - ETA: 11:09:09 - loss: 0.6306 - acc: 0.641 - ETA: 10:41:50 - loss: 0.6216 - acc: 0.657 - ETA: 10:22:31 - loss: 0.6152 - acc: 0.660 - ETA: 10:08:37 - loss: 0.6163 - acc: 0.670 - ETA: 10:01:26 - loss: 0.6164 - acc: 0.675 - ETA: 9:57:47 - loss: 0.6104 - acc: 0.685 - ETA: 9:49:40 - loss: 0.6085 - acc: 0.68 - ETA: 9:38:34 - loss: 0.6092 - acc: 0.68 - ETA: 9:41:11 - loss: 0.6094 - acc: 0.68 - ETA: 9:34:12 - loss: 0.6070 - acc: 0.69 - ETA: 9:29:48 - loss: 0.6006 - acc: 0.69 



















Epoch 2/5


 195/2553 [=>............................] - ETA: 6:03:52 - loss: 0.2300 - acc: 0.90 - ETA: 5:25:17 - loss: 0.4021 - acc: 0.80 - ETA: 5:30:02 - loss: 0.3952 - acc: 0.81 - ETA: 5:39:40 - loss: 0.3990 - acc: 0.80 - ETA: 5:33:14 - loss: 0.3746 - acc: 0.82 - ETA: 5:34:03 - loss: 0.3791 - acc: 0.83 - ETA: 5:26:23 - loss: 0.3728 - acc: 0.82 - ETA: 5:30:34 - loss: 0.3867 - acc: 0.80 - ETA: 5:39:42 - loss: 0.4802 - acc: 0.78 - ETA: 5:34:50 - loss: 0.4775 - acc: 0.78 - ETA: 5:35:24 - loss: 0.4831 - acc: 0.77 - ETA: 5:28:26 - loss: 0.4868 - acc: 0.77 - ETA: 5:25:30 - loss: 0.4758 - acc: 0.77 - ETA: 5:21:23 - loss: 0.4623 - acc: 0.78 - ETA: 5:26:04 - loss: 0.4582 - acc: 0.79 - ETA: 5:25:40 - loss: 0.4594 - acc: 0.78 - ETA: 5:21:39 - loss: 0.4529 - acc: 0.78 - ETA: 5:15:39 - loss: 0.4464 - acc: 0.79 - ETA: 5:12:21 - loss: 0.4458 - acc: 0.79 - ETA: 5:09:15 - loss: 0.4467 - acc: 0.79 - ETA: 5:04:30 - loss: 0.4542 - acc: 0.79 - ETA: 5:00:00 - loss: 0.4603 - acc: 0.77 - ETA: 4:52:24 - loss: 0.4601 - a

 585/2553 [=====>........................] - ETA: 3:14:30 - loss: 0.4489 - acc: 0.79 - ETA: 3:14:23 - loss: 0.4485 - acc: 0.79 - ETA: 3:14:16 - loss: 0.4482 - acc: 0.79 - ETA: 3:14:10 - loss: 0.4483 - acc: 0.79 - ETA: 3:14:03 - loss: 0.4484 - acc: 0.79 - ETA: 3:13:57 - loss: 0.4482 - acc: 0.79 - ETA: 3:13:50 - loss: 0.4479 - acc: 0.79 - ETA: 3:13:42 - loss: 0.4478 - acc: 0.79 - ETA: 3:13:34 - loss: 0.4475 - acc: 0.79 - ETA: 3:13:27 - loss: 0.4472 - acc: 0.79 - ETA: 3:13:20 - loss: 0.4472 - acc: 0.79 - ETA: 3:13:12 - loss: 0.4477 - acc: 0.79 - ETA: 3:13:05 - loss: 0.4473 - acc: 0.79 - ETA: 3:12:59 - loss: 0.4471 - acc: 0.79 - ETA: 3:12:52 - loss: 0.4471 - acc: 0.79 - ETA: 3:12:45 - loss: 0.4492 - acc: 0.79 - ETA: 3:12:39 - loss: 0.4492 - acc: 0.79 - ETA: 3:12:32 - loss: 0.4490 - acc: 0.79 - ETA: 3:12:27 - loss: 0.4489 - acc: 0.79 - ETA: 3:12:21 - loss: 0.4487 - acc: 0.79 - ETA: 3:12:15 - loss: 0.4482 - acc: 0.79 - ETA: 3:12:08 - loss: 0.4481 - acc: 0.79 - ETA: 3:12:02 - loss: 0.4496 - a



















