In [1]:
## Packages 

# For graphs
import networkx as nx 
import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, HinSAGE, link_classification
from stellargraph import globalvar

# For DL
from tensorflow import keras 

import pandas as pd
import numpy as np
import os
import random

from tqdm import tqdm # progess bar
import pickle

# For processing node texts
import spacy
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction import text as fe

# For stemming
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize



In [2]:
# Load the unstemmed corpus and the matching ids from their pickles.
# NOTE(review): pickle.load can execute arbitrary code while loading; only
# open pickle files that were produced by this project.
corpus_path = r"pickles/corpus.PICKLE"
ids_path = r"pickles/IDs.PICKLE"

# The `with` block closes each file on exit, so no explicit close() is needed
with open(corpus_path, 'rb') as f:
    corpus = pickle.load(f)
with open(ids_path, 'rb') as f:
    ids = pickle.load(f)

# Table with one row per entry: its id and its text
node_info = pd.DataFrame({'ID': ids, 'Corpus': corpus})
# Same table, indexed by id
node_info_ID = node_info.set_index(['ID'])

In [3]:
def calculate_languages_ratios_from_tokens(tokens, languages=None):
    """Estimate how strongly each candidate language is present in a token list.

    For every candidate language the function counts how many distinct
    lower-cased tokens are stopwords of that language, then divides the
    counts by their total so that the result sums to 1.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single text.
    languages : iterable of str, optional
        Languages to score; the result follows this order. When omitted, the
        languages that have both an NLTK stopword list and a Snowball stemmer
        are used, in the iteration order of that set intersection.

    Returns
    -------
    numpy.ndarray
        One share per candidate language. All zeros when no token is a
        stopword of any candidate language.
    """
    if languages is None:
        # Supported languages as intersection
        languages = set(stopwords.fileids()) & set(SnowballStemmer.languages)

    # Distinct lower-cased tokens; built once rather than once per language
    words_set = {word.lower() for word in tokens}

    # Number of distinct stopwords of each language found among the tokens
    stopword_hits = [
        len(words_set.intersection(stopwords.words(language)))
        for language in languages
    ]

    total_hits = sum(stopword_hits)
    if total_hits == 0:
        # No stopword found at all: return zeros instead of dividing by zero
        return np.zeros(len(stopword_hits))

    return np.array(stopword_hits) / total_hits

In [4]:
## Stem corpus based on frequent languages

# Defining path
stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"

# The stemmed corpus is cached on disk and only computed when the pickle is
# missing. When the cache exists, `stemmed_corpus` is NOT defined by this cell.
# Delete the pickle to recompute after changing the logic below.
if not os.path.exists(stemmed_corpus_path):
    # Supported languages as intersection.
    # NOTE(review): entries of `ratio` are mapped back to this list by position,
    # which relies on calculate_languages_ratios_from_tokens enumerating the
    # same intersection in the same order -- confirm, or make the order explicit.
    supported_languages = list(set(stopwords.fileids()) & set(SnowballStemmer.languages))
    stemmed_corpus = []

    # Stopword sets and stemmers are created once per language and reused for
    # every text instead of being rebuilt inside the loop
    stopwords_by_lang = {}
    stemmer_by_lang = {}

    # For each text in corpus
    for text in tqdm(node_info['Corpus'].values, position=0, leave=True):

        # Identify tokens
        tokens = word_tokenize(text)

        # Compute language ratios
        ratio = calculate_languages_ratios_from_tokens(tokens)

        # Note most frequent languages in langs:
        #   1. every language with a share of at least 25 %,
        #   2. otherwise every language with a share above 10 %,
        #   3. otherwise the single language with the largest share
        #      (np.argmax returns index 0 when all shares are zero).
        if np.any(ratio >= 0.25):
            indices = np.where(ratio >= 0.25)[0]
        elif np.any(ratio > 0.10):
            indices = np.where(ratio > 0.10)[0]
        else:
            indices = [np.argmax(ratio)]
        langs = [supported_languages[j] for j in indices]

        # For each frequent language stem word if not a stopword
        # and if it consists of alphabet letters. The languages are applied
        # one after another, each pass working on the output of the previous one.
        for lang in langs:
            if lang not in stemmer_by_lang:
                stopwords_by_lang[lang] = set(stopwords.words(lang))
                stemmer_by_lang[lang] = SnowballStemmer(lang)
            lang_stopwords = stopwords_by_lang[lang]
            stemmer = stemmer_by_lang[lang]
            # Compare in lower case so that capitalised stopwords are dropped
            # too, matching the lower-casing used for language detection
            tokens = [stemmer.stem(word) for word in tokens
                      if (word.lower() not in lang_stopwords) and word.isalpha()]
        stemmed_corpus.append(' '.join(tokens))

    # Dump pickle (the `with` block closes the file)
    with open(stemmed_corpus_path, 'wb') as f:
        pickle.dump(stemmed_corpus, f)

In [5]:
## Build the word-count matrix of the stemmed corpus, with the vocabulary
## capped at 5000 terms (max_features)

# Defining paths
small_matrix_path = r"pickles/small_word_matrix.PICKLE"
# Note: this rebinds `corpus_path` (and `corpus` below) to the stemmed corpus
corpus_path = r"pickles/stemmed_corpus.PICKLE"

if os.path.exists(small_matrix_path):
    # Reuse the cached matrix
    with open(small_matrix_path, 'rb') as f:
        word_matrix = pickle.load(f)
else:
    # Read the stemmed corpus; the file is closed before the slow vectorisation
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    # Get vectorizer from feature extraction package
    vectorizer = fe.CountVectorizer(max_features = 5000, strip_accents = 'unicode')

    # Vectorize corpus
    word_matrix = vectorizer.fit_transform(tqdm(corpus))

    # Dump pickle
    with open(small_matrix_path, 'wb') as g:
        pickle.dump(word_matrix, g)

In [6]:
# Path of the cached word-count matrix
small_matrix_path = r"pickles/small_word_matrix.PICKLE"

# Open pickle and store word matrix (the `with` block closes the file)
with open(small_matrix_path, 'rb') as f:
    small_matrix = pickle.load(f)

# Creating ids and feature names: rows are numbered 0..n1-1, columns w_0..w_{n2-1}.
# Note: this rebinds `ids`, replacing the ids loaded from IDs.PICKLE.
# NOTE(review): presumably the node ids in the edge list equal these row
# positions -- verify before relying on the feature/node alignment.
n1, n2 = small_matrix.shape
ids = range(n1)
feature_names = ["w_{}".format(ii) for ii in range(n2)]

# Storing in dataframe (toarray() converts the loaded matrix to a dense array)
node_data = pd.DataFrame(data=small_matrix.toarray(), index=ids, columns=feature_names)

In [7]:
# Show the word-count features; pandas abbreviates the middle rows and columns
word_features = node_data[feature_names]
print(word_features)

       w_0  w_1  w_2  w_3  w_4  w_5  w_6  w_7  w_8  w_9  ...  w_4990  w_4991  \
0        0    0    0    4    0    0    0    0    0    0  ...       0       0   
1        0    0    0    0    0    0    0    0    0    0  ...       0       0   
2        0    0    0    0    0    0    0    0    0    0  ...       0       0   
3        0    0    0    0  176    6    0    0    0    0  ...       0       0   
4        0    0    0    0    0    0    0    0    0    0  ...       0       0   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...     ...     ...   
33221    0    0    0    0    0    0    1    0    0    0  ...       0       0   
33222    0    0    0    0    0    0    0    0    0    0  ...       0       0   
33223    0    0    0    0    0    1    0    0    0    0  ...       0       0   
33224    0    0    0    0    0    0    0    0    0    0  ...       0       0   
33225    0    0    0    0    0    0    0    0    0    0  ...       0       0   

       w_4992  w_4993  w_4994  w_4995  

In [8]:
# Build a NetworkX graph from the edge list file: every edge gets the
# attribute label="cites" and every node the attribute label="paper"
edgelist = pd.read_csv("linked_nodes.txt", sep=' ', header=None, names=["source", "target"])
edgelist["label"] = "cites"
G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
nx.set_node_attributes(G_all_nx, "paper", "label")

# Disabled: StellarGraph over the full, unsplit graph
#G_all = sg.StellarGraph(G_all_nx, node_features=node_data[feature_names])

# Both splits use the same settings: sample a fraction p=0.1 of the positive
# links plus the same number of negative links, and return the reduced graph
# with the sampled links removed.
# NOTE(review): no seed is passed, so the sampled links differ between runs.
split_options = dict(p=0.1, method="global", keep_connected=True)

# Test split: taken from the original graph
edge_splitter_test = EdgeSplitter(G_all_nx)
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(**split_options)

# Train split: taken from the already reduced graph G_test
edge_splitter_train = EdgeSplitter(G_test)
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(**split_options)

# Wrap both reduced graphs as StellarGraph objects carrying the word features
G_test = sg.StellarGraph(G_test, node_features=node_data[feature_names])
G_train = sg.StellarGraph(G_train, node_features=node_data[feature_names])

Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Removed 4000 edges
Removed 5000 edges
Removed 6000 edges
Removed 7000 edges
Removed 8000 edges
Removed 9000 edges
Removed 10000 edges
Removed 11000 edges
Removed 12000 edges
Removed 13000 edges
Removed 14000 edges
Removed 15000 edges
Removed 16000 edges
Removed 17000 edges
Removed 18000 edges
Removed 19000 edges
Removed 20000 edges
Removed 21000 edges
Removed 22000 edges
Removed 23000 edges
Removed 24000 edges
Removed 25000 edges
Removed 26000 edges
Sampled 1000 negative examples
Sampled 2000 negative examples
Sampled 3000 negative examples
Sampled 4000 negative examples
Sampled 5000 negative examples
Sampled 6000 negative examples
Sampled 7000 negative examples
Sampled 8000 negative examples
Sampled 9000 negative examples
Sampled 10000 negative examples
Sampled 11000 negative examples
Sampled 12000 negative examples
Sampled 13000 negative examples
Sampled 14000 negative examples
Sampled 15000 negative examples
Sampled 16000 nega

In [9]:
# Print the summary of both reduced graphs: first the test graph, then the train graph
for reduced_graph in (G_test, G_train):
    print(reduced_graph.info())

StellarGraph: Undirected multigraph
 Nodes: 33119, Edges: 242704

 Node types:
  paper: [33119]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [242704]

StellarGraph: Undirected multigraph
 Nodes: 33119, Edges: 218434

 Node types:
  paper: [33119]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [218434]



In [10]:
# Training settings
batch_size = 20
epochs = 20
# One entry per GraphSAGE layer (checked against layer_sizes below)
num_samples = [10, 5]

# Batches of training links, drawn from the train graph and shuffled
train_link_generator = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
train_gen = train_link_generator.flow(edge_ids_train, edge_labels_train, shuffle=True)

# Batches of held-out links, drawn from the test graph
test_link_generator = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
test_gen = test_link_generator.flow(edge_ids_test, edge_labels_test)

# Layer sizes of the GraphSAGE model, one per entry of num_samples
layer_sizes = [20, 20]
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes,
    generator=train_gen,
    bias=True,
    dropout=0.3,
)





Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [11]:
# graphsage.build() returns the input and output tensors of the GraphSAGE model
x_inp, x_out = graphsage.build()

# Link prediction head: edge_embedding_method='ip' combines the two node
# embeddings into an edge embedding, which yields a single output value.
# NOTE(review): the output activation is ReLU while the loss is binary
# cross-entropy -- confirm this pairing is intended.
link_head = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method='ip')
prediction = link_head(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)

# Adam optimiser with learning rate 1e-3; accuracy is tracked during training
adam_optimizer = keras.optimizers.Adam(lr=1e-3)
model.compile(
    optimizer=adam_optimizer,
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [13]:
# Disabled: metrics of the initial (untrained) model on both link sets
#init_train_metrics = model.evaluate_generator(train_gen)
#init_test_metrics = model.evaluate_generator(test_gen)
#
#print("\nTrain Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_train_metrics):
#    print("\t{}: {:0.4f}".format(name, val))
#
#print("\nTest Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_test_metrics):
#    print("\t{}: {:0.4f}".format(name, val))

# Train on the training links for `epochs` epochs, validating on the test
# links after each epoch; verbose=2 prints one summary line per epoch
fit_options = dict(epochs=epochs, validation_data=test_gen, verbose=2)
history = model.fit_generator(train_gen, **fit_options)

Epoch 1/20
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
2427/2427 - 344s - loss: 0.7332 - acc: 0.5841 - val_loss: 0.7043 - val_acc: 0.5951
Epoch 2/20
2427/2427 - 327s - loss: 0.6722 - acc: 0.6215 - val_loss: 0.6830 - val_acc: 0.6097
Epoch 3/20


KeyboardInterrupt: 