In [1]:
## Packages 

import csv
import pandas as pd
import numpy as np
import os
import random

#from tqdm import tqdm # progess bar
import pickle

# For graphs
import networkx as nx 
import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import GraphSAGELinkGenerator
from stellargraph.layer import GraphSAGE, HinSAGE, link_classification
from stellargraph import globalvar

# For DL
from tensorflow import keras 
######
import tensorflow as tf

config = tf.ConfigProto(intra_op_parallelism_threads=8, inter_op_parallelism_threads=2, allow_soft_placement=True, device_count = {'CPU': 8})

session = tf.Session(config=config)

os.environ["OMP_NUM_THREADS"] = "8"

os.environ["KMP_BLOCKTIME"] = "30"

os.environ["KMP_SETTINGS"] = "1"

os.environ["KMP_AFFINITY"]= "granularity=fine,verbose,compact,1,0"

#####




# For processing node texts
#import spacy
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction import text as fe

# Dimensionality reduction
from sklearn.decomposition import NMF, LatentDirichletAllocation

# Word embeddings
#import gensim 
#from gensim.models import Word2Vec

# For stemming
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk import word_tokenize



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
tf.test.gpu_device_name()
#!cat /proc/meminfo
!cat /proc/cpuinfo

In [None]:
# Load corpus and ids from pickles
corpus_path = r"pickles/corpus.PICKLE" 
ids_path = r"pickles/IDs.PICKLE"
with open(corpus_path, 'rb') as f:
    corpus = pickle.load(f)
f.close()
with open(ids_path, 'rb') as f:
    ids = pickle.load(f)
f.close()

# Save in dataframe
node_info = pd.DataFrame({'ID': ids, 'Corpus': corpus})
node_info_ID = node_info.set_index(['ID'])

In [None]:
# For each set of tokens calculate ratio of each language present
def calculate_languages_ratios_from_tokens(tokens):
    languages_ratios = []
    
    # Lower words in set of tokens
    words = [word.lower() for word in tokens]
    
    # Supported languages as intersection
    supported_languages = set(stopwords.fileids()) & set(SnowballStemmer.languages)
    
    # For each language, identify ratio in set of tokens
    for language in supported_languages:
        stopwords_set = set(stopwords.words(language))
        words_set = set(words)
        common_elements = words_set.intersection(stopwords_set)
        languages_ratios.append(len(common_elements))
        
    # Set to zero if ratio is zero
    if sum(languages_ratios) == 0:
        return np.zeros(len(languages_ratios))
    
    return np.array(languages_ratios)/sum(languages_ratios)

In [None]:
## Stem corpus based on frequent languages

# Defining path
stemmed_corpus_path = r"pickles/stemmed_corpus.PICKLE"


if os.path.exists(stemmed_corpus_path):
    pass
else:
    # Supported languages as intersection
    supported_languages = list(set(stopwords.fileids()) & set(SnowballStemmer.languages))
    stemmed_corpus = []

    # For each text in corpus
    for text in tqdm(node_info['Corpus'].values, position=0, leave=True):

        # Identify tokens
        tokens = word_tokenize(text)

        # Compute language ratios
        ratio = calculate_languages_ratios_from_tokens(tokens)

        # Note most frequent languages in langs
        if np.sum(ratio == 0):
            pass
        if np.any(ratio>=0.25):
            indices = np.where(ratio >= 0.25)[0]
            langs = [supported_languages[j] for j in indices]
        elif np.all(ratio<0.25) and np.any(ratio>0.10):
            indices = np.where(ratio > 0.10)[0]
            langs = [supported_languages[j] for j in indices]
        else:
            langs = [supported_languages[np.argmax(ratio)]]

        # For each frequent language stem word if not a stopword 
        # and if it consists of alphabet letters
        for lang in langs:
            lang_stopwords = stopwords.words(lang)
            stemmer = SnowballStemmer(lang)
            tokens = [stemmer.stem(word) for word in tokens if (word not in lang_stopwords) and word.isalpha()]
        stemmed_corpus.append(' '.join(tokens))

    # Dump pickle
    with open(stemmed_corpus_path, '+wb') as f:
        pickle.dump(stemmed_corpus, f)
    f.close()

In [None]:
## Generating smaller dictionary with stemmed words, no stopwords and frequency > 20

# Defining paths
small_matrix_path = r"pickles/small_word_matrix.PICKLE"
corpus_path = r"pickles/stemmed_corpus.PICKLE"

if os.path.exists(small_matrix_path):
    with open(small_matrix_path, 'rb') as f:
        word_matrix = pickle.load(f)
    f.close()
else:
    with open(corpus_path, 'rb') as f:
        stemmed_corpus = pickle.load(f)
        
        # Get vectorizer from feature extraction package
        vectorizer = fe.CountVectorizer(min_df = 20, max_df = 0.9, strip_accents = 'unicode')
        
        # Vectorize corpus
        word_matrix = vectorizer.fit_transform(tqdm(stemmed_corpus))
        
        # Dump pickle
        with open(small_matrix_path, '+wb') as g:
            pickle.dump(word_matrix, g)
        g.close()
    f.close

In [None]:
corpus_matrix_path = r"pickles/corpus_tfidf_matrix.PICKLE"
if os.path.exists(corpus_matrix_path):
    with open(corpus_matrix_path, 'rb') as f:
        corpus_tfidf_matrix = pickle.load(f)
    f.close()
else:
    with open(corpus_path, 'rb') as g:
        stemmed_corpus = pickle.load(g)
        vectorizer3 = fe.TfidfVectorizer(min_df = 20, max_df = 0.9, strip_accents = 'unicode')
        corpus_tfidf_matrix = vectorizer3.fit_transform(tqdm(stemmed_corpus))
        with open(corpus_matrix_path, '+wb') as f:
            pickle.dump(corpus_tfidf_matrix, f)
        f.close()
    g.close()

In [None]:
n_features = corpus_tfidf_matrix.shape[1]
n_components = 100

nmf_frobenius_path = r"pickles/nmf_frobenius_matrix.PICKLE"
if os.path.exists(nmf_frobenius_path):
    with open(nmf_frobenius_path, 'rb') as f:
        nmf_frobenius = pickle.load(f)
    f.close()
else:
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (corpus_tfidf_matrix.shape[0], n_features))

    nmf_frobenius = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5).fit(corpus_tfidf_matrix)
    with open(nmf_frobenius_path, '+wb') as f:
        pickle.dump(nmf_frobenius, f)
    f.close()

In [None]:
small_reduced_matrix_path = r"pickles/nmf_frobenius_matrix_realized.PICKLE"
if os.path.exists(small_reduced_matrix_path):
    with open(small_reduced_matrix_path, 'rb') as f:
        small_reduced_matrix = pickle.load(f)
    f.close()
else:
    with open(small_reduced_matrix_path, '+wb') as f:
        small_reduced_matrix = nmf_frobenius.fit_transform(corpus_tfidf_matrix)
        pickle.dump(small_reduced_matrix, f)
    f.close()

In [2]:
lda_path = r"pickles/lda_matrix.PICKLE"


if os.path.exists(lda_path):
    with open(lda_path, 'rb') as f:
        lda_matrix = pickle.load(f)
    f.close()
else:
    n_features = word_matrix.shape[1]
    n_components = 100
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (word_matrix.shape[0], n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(word_matrix)
    lda_matrix = lda.fit_transform(word_matrix)
    with open(lda_path, '+wb') as f:
        pickle.dump(lda_matrix, f)
    f.close()

In [3]:
# Storing in dataframe
n1, n2 = lda_matrix.shape
# Creating feature names
feature_names = ["w_{}".format(ii) for ii in range(n2)]
ids = sorted(range(n1), key=str)
node_data = pd.DataFrame(data=lda_matrix, index=ids, columns=feature_names)
print(n1,n2)

33226 500


In [4]:
print(node_data)

               w_0           w_1           w_2           w_3           w_4  \
0     5.797101e-06  5.797101e-06  5.797101e-06  5.797101e-06  5.797101e-06   
1     9.319664e-07  9.319664e-07  4.437605e-02  9.319664e-07  9.319664e-07   
10    1.323627e-06  1.323627e-06  1.816354e-06  1.323627e-06  1.323627e-06   
100   1.316553e-04  2.465207e-02  1.498062e-02  1.635072e-03  1.238620e-07   
1000  6.006006e-06  6.006006e-06  6.104773e-06  6.006006e-06  6.006006e-06   
...            ...           ...           ...           ...           ...   
9995  9.950249e-06  9.950249e-06  9.950249e-06  9.950249e-06  9.950249e-06   
9996  1.071237e-06  1.656704e-03  1.071237e-06  1.474441e-03  1.071237e-06   
9997  6.009615e-07  6.009615e-07  6.009615e-07  1.773803e-03  6.009615e-07   
9998  1.164144e-06  1.164144e-06  5.824716e-02  1.164144e-06  1.164144e-06   
9999  1.052632e-04  1.052632e-04  1.052632e-04  1.052632e-04  1.052632e-04   

               w_5           w_6           w_7           w_8   

## Make graphs great again

Time to get training and test data

In [5]:
# Read training
with open(r"training.txt", "r") as f:
    reader = csv.reader(f)
    training  = list(reader)
# in order of training examples
training = [element[0].split(" ") for element in training]
training = pd.DataFrame(training, columns=['Node1', 'Node2', 'Link'])
print("Training examples shape: {}".format(training.shape))

# Read testing
with open(r"testing.txt", "r") as f:
    reader = csv.reader(f)
    testing  = list(reader)
# in order of testing examples
testing = [element[0].split(" ") for element in testing]
testing = pd.DataFrame(testing, columns=['Node1', 'Node2'])
print("Testing examples shape: {}".format(testing.shape))

Training examples shape: (453797, 3)
Testing examples shape: (113450, 2)


In [6]:
linked_nodes = training.loc[training['Link']=='1']
linked_nodes = linked_nodes[['Node1', 'Node2']]
linked_nodes.to_csv('linked_nodes.txt', sep=' ', index=False, header=False)

In [7]:
# Read edges and create NetworkX graph
edgelist = pd.read_csv("linked_nodes.txt", sep=' ', header=None, names=["source", "target"])
edgelist["label"] = "cites"  # set the edge type
G_all_nx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
G_all_nx.add_nodes_from(ids)
nx.set_node_attributes(G_all_nx, "paper", "label")

# Initialize Stellargraph with node features of text
#G_all = sg.StellarGraph(G_all_nx, node_features=node_data[feature_names])

# Define an edge splitter on the original graph G:
edge_splitter_test = EdgeSplitter(G_all_nx)

# Randomly sample a fraction p=0.1 of all positive links, 
# and same number of negative links, from G, and obtain the
# reduced graph G_test with the sampled links removed:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=True)

# Define an edge splitter on the reduced graph G_test:
edge_splitter_train = EdgeSplitter(G_test)

# Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
# reduced graph G_train with the sampled links removed:
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=True)

G_test = sg.StellarGraph(G_test, node_features=node_data[feature_names])
G_train = sg.StellarGraph(G_train, node_features=node_data[feature_names])

Removed 1000 edges
Removed 2000 edges
Removed 3000 edges
Removed 4000 edges
Removed 5000 edges
Removed 6000 edges
Removed 7000 edges
Removed 8000 edges
Removed 9000 edges
Removed 10000 edges
Removed 11000 edges
Removed 12000 edges
Removed 13000 edges
Removed 14000 edges
Removed 15000 edges
Removed 16000 edges
Removed 17000 edges
Removed 18000 edges
Removed 19000 edges
Removed 20000 edges
Removed 21000 edges
Removed 22000 edges
Removed 23000 edges
Removed 24000 edges
Removed 25000 edges
Removed 26000 edges
Removed 27000 edges
Removed 28000 edges
Sampled 1000 negative examples
Sampled 2000 negative examples
Sampled 3000 negative examples
Sampled 4000 negative examples
Sampled 5000 negative examples
Sampled 6000 negative examples
Sampled 7000 negative examples
Sampled 8000 negative examples
Sampled 9000 negative examples
Sampled 10000 negative examples
Sampled 11000 negative examples
Sampled 12000 negative examples
Sampled 13000 negative examples
Sampled 14000 negative examples
Sampled 15

In [8]:
print(G_test.info())
print(G_train.info())

StellarGraph: Undirected multigraph
 Nodes: 33226, Edges: 255261

 Node types:
  paper: [33226]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [255261]

StellarGraph: Undirected multigraph
 Nodes: 33226, Edges: 229735

 Node types:
  paper: [33226]
    Edge types: paper-cites->paper

 Edge types:
    paper-cites->paper: [229735]



In [9]:
batch_size = 200
epochs = 100
num_samples = [30, 20]

train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples).flow(
    edge_ids_train, edge_labels_train, shuffle=True)
test_gen = GraphSAGELinkGenerator(G_test,  batch_size, num_samples).flow(
    edge_ids_test, edge_labels_test)

layer_sizes = [20, 20]
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
        layer_sizes=layer_sizes, generator=train_gen, bias=True, dropout=0.3)





Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [10]:
# Build the model and expose input and output sockets of graphsage model for link prediction via graphsage.build() method
x_inp, x_out = graphsage.build()

prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss=keras.losses.binary_crossentropy,
        metrics=["acc"],)

link_classification: using 'ip' method to combine node embeddings into edge embeddings


In [11]:
#init_train_metrics = model.evaluate_generator(train_gen)
#init_test_metrics = model.evaluate_generator(test_gen)
#
#print("\nTrain Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_train_metrics):
#    print("\t{}: {:0.4f}".format(name, val))
#
#print("\nTest Set Metrics of the initial (untrained) model:")
#for name, val in zip(model.metrics_names, init_test_metrics):
#    print("\t{}: {:0.4f}".format(name, val))
    
history = model.fit_generator(
    train_gen,
    epochs=epochs,
    validation_data=test_gen,
    verbose=2
)

Epoch 1/100
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


MemoryError: Unable to allocate array with shape (120000, 500) and data type float64