In [1]:
# install StellarGraph if running on Google Colab
# The check looks for the 'google.colab' module among the modules that are
# already loaded; when it is absent this cell does nothing.  The installed
# version is pinned to 1.0.0rc1, the same version the next cell validates.
import sys
if 'google.colab' in sys.modules:
  %pip install -q stellargraph[demos]==1.0.0rc1

In [2]:
# verify that we're using the correct version of StellarGraph for this notebook
import stellargraph as sg

try:
    sg.utils.validate_notebook_version("1.0.0rc1")
except AttributeError:
    # NOTE(review): presumably raised when the installed StellarGraph does not
    # provide `validate_notebook_version` at all -- confirm.  The error is
    # re-raised as a ValueError that reports the installed version; `from None`
    # hides the original AttributeError from the traceback.
    raise ValueError(
        f"This notebook requires StellarGraph version 1.0.0rc1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

<div class="alert alert-block alert-danger">This notebook is designed for an older StellarGraph version 1.0.0rc1 and may not function correctly with the newer installed version 1.1.0. Please see: <a href="https://github.com/stellargraph/stellargraph/issues/1172">https://github.com/stellargraph/stellargraph/issues/1172</a>.</div>

  """


In [3]:
import networkx as nx
import pandas as pd
import numpy as np
import os
import random

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import DirectedGraphSAGELinkGenerator
from stellargraph.layer import DirectedGraphSAGE, link_classification
from stellargraph.data import UniformRandomWalk
from stellargraph.data import UnsupervisedSampler
from sklearn.model_selection import train_test_split

from tensorflow import keras
import tensorflow as tf
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import accuracy_score

from stellargraph import globalvar

from stellargraph import datasets
from IPython.display import display, HTML

# Print the name of the GPU device TensorFlow reports, if there is one.
gpu_name = tf.test.gpu_device_name()
if gpu_name:
    print('Default GPU Device:{}'.format(gpu_name))

Default GPU Device:/device:GPU:0


### Loading the TF network data

In [4]:
# Location and delimiter of the TF network edge list.
FILE_NAME = 'GM12878_tf2tf.csv'
SEP = ','

# The CSV is read from the data directory one level above the notebook.
edge_list_path = f"../data/{FILE_NAME}"
df = pd.read_csv(edge_list_path, sep=SEP)

# Show the loaded table as the cell output.
df

Unnamed: 0,cell_type,source_id,source,target_id,target,weight,type
0,GM12878,0,ATF3,2,BHLHE40,88.392564,TSS
1,GM12878,0,ATF3,24,MAX,315.000000,TSS
2,GM12878,0,ATF3,29,MXI1,98.760086,TSS
3,GM12878,0,ATF3,32,NFE2,161.798194,TSS
4,GM12878,0,ATF3,58,TBP,144.000000,TSS
...,...,...,...,...,...,...,...
2191,GM12878,67,ZNF143,64,ZBED1,1545.632722,TSS
2192,GM12878,67,ZNF143,65,ZBTB33,1000.000000,TSS
2193,GM12878,67,ZNF143,66,ZBTB40,1000.000000,TSS
2194,GM12878,67,ZNF143,67,ZNF143,1000.000000,TSS


In [5]:
# Print basic info
# The edge list is loaded with networkx's default graph class, so the summary
# below describes an undirected simple graph; its edge count can therefore be
# lower than the number of rows in `df`.
nx_graph = nx.from_pandas_edgelist(
    df[['source', 'target', 'weight']], 'source', 'target', edge_attr='weight'
)

# `nx.info` was deprecated in NetworkX 2.7 and removed in 3.0, so the same
# summary lines are printed explicitly to keep the cell working on any version.
n_nodes = nx_graph.number_of_nodes()
n_edges = nx_graph.number_of_edges()
print(f"Name: {nx_graph.name}")
print(f"Type: {type(nx_graph).__name__}")
print(f"Number of nodes: {n_nodes}")
print(f"Number of edges: {n_edges}")
if n_nodes > 0:
    # Guarded so that an empty edge list does not divide by zero.
    average_degree = sum(degree for _, degree in nx_graph.degree()) / n_nodes
    print(f"Average degree: {average_degree:8.4f}")

Name: 
Type: Graph
Number of nodes: 69
Number of edges: 1643
Average degree:  47.6232


**Use a one-hot encoding of the node names as node features**

In [6]:
# Node feature table; its first CSV column becomes the index (the node IDs).
feature_df = pd.read_csv('../data/features/onehot_names.csv', index_col=0)

# Experiment with unweighted graph: only the edge endpoints are passed.
# Weighted variant, kept for reference:
#   G = sg.StellarDiGraph(edges=df[['source', 'target', 'weight']], nodes=feature_df)
edge_endpoints = df[['source', 'target']]
G = sg.StellarDiGraph(nodes=feature_df, edges=edge_endpoints)

# Summarise node/edge counts, feature sizes and weights.
print(G.info())

StellarDiGraph: Directed multigraph
 Nodes: 69, Edges: 2196

 Node types:
  default: [69]
    Features: float32 vector, length 69
    Edge types: default-default->default

 Edge types:
    default-default->default: [2196]
        Weights: all 1 (default)
        Features: none


In [11]:
# Look up the feature vector stored for a single node, as a sanity check of
# the feature table attached to the graph.  (A stray `dir(G)` call whose
# result was discarded has been dropped from this cell.)
G.node_features(['ATF3'])

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.]], dtype=float32)

In [147]:
# Show the node IDs held by the graph as the cell output.
G.nodes()

Index(['ATF3', 'BCLAF1', 'BHLHE40', 'CBX5', 'CEBPB', 'CEBPZ', 'CHD1', 'CHD2',
       'CTCF', 'E2F4', 'EGR1', 'ELF1', 'ELK1', 'EP300', 'ETS1', 'ETV6', 'EZH2',
       'FOS', 'GABPA', 'HDGF', 'IKZF1', 'JUNB', 'JUND', 'MAFK', 'MAX', 'MAZ',
       'MEF2A', 'MLLT1', 'MTA2', 'MXI1', 'MYC', 'NBN', 'NFE2', 'NFYA', 'NFYB',
       'NR2C2', 'NRF1', 'PML', 'POLR2A', 'POLR2AphosphoS2', 'POLR2AphosphoS5',
       'POLR3G', 'RAD21', 'RCOR1', 'REST', 'RFX5', 'SIN3A', 'SIX5', 'SMAD5',
       'SMC3', 'SP1', 'SPI1', 'SRF', 'STAT5A', 'SUZ12', 'TAF1', 'TARDBP',
       'TBL1XR1', 'TBP', 'UBTF', 'USF1', 'USF2', 'YBX1', 'YY1', 'ZBED1',
       'ZBTB33', 'ZBTB40', 'ZNF143', 'ZNF274'],
      dtype='object')

**1. Specify the other optional parameter values: root nodes, the number of walks to take per node, the length of each walk, and random seed.**

In [148]:
# Root nodes handed to the sampler: every node of the graph.
nodes = list(G.nodes())
# Passed to the random walker as `n` (walks to take per root node).
number_of_walks = 200
# Passed to the random walker as the length of each walk.
length = 2

# Seed shared by the random walker and the link generator created below.
SEED = 0

**2. Create the UnsupervisedSampler instance with the relevant parameters passed to it.**

In [149]:
# Biased random walker over G; `p` and `q` are its bias parameters.
biased_walker = sg.data.BiasedRandomWalk(
    G,
    n=number_of_walks,
    length=length,
    p=2,
    q=0.5,
    seed=SEED,
)

# The sampler uses the walker above, starting from the chosen root nodes.
unsupervised_samples = UnsupervisedSampler(G, nodes=nodes, walker=biased_walker)

In [163]:
# Left disabled on purpose: uncomment to call the sampler directly and inspect
# what it returns before wiring it into the generator.
# unsupervised_samples.run(2)

**3. Create a node pair generator:**

Next, create the node pair generator for sampling and streaming the training data to the model. The node pair generator essentially "maps" pairs of nodes `(target, context)` to the input of GraphSAGE: it takes either minibatches of node pairs or an `UnsupervisedSampler` instance that generates the minibatches of node pairs on demand. The generator samples 2-hop subgraphs around the `(target, context)` head nodes extracted from those pairs and feeds them, together with the corresponding binary labels indicating whether each pair is a positive or a negative sample, to the input layer of the node pair classifier with a GraphSAGE node encoder, for SGD updates of the model parameters.

Specify:
1. The minibatch size (number of node pairs per minibatch).
2. The number of epochs for training the model.
3. The sizes of 1- and 2-hop neighbor samples for GraphSAGE:

Note that the length of the `in_samples` and `out_samples` lists defines the number of layers/iterations in the GraphSAGE encoder. In this example, we are defining a 2-layer GraphSAGE encoder.

In [164]:
# Number of node pairs per minibatch.
batch_size = 10
# Number of training epochs passed to `model.fit`.
epochs = 20
# Neighbour sample sizes per hop, passed to the generator as `in_samples`
# and `out_samples`; two entries each, i.e. a 2-layer encoder.
in_samples = [20, 10]
out_samples = [20, 10]

In the following we show the working of node pair generator with the UnsupervisedSampler, which will generate samples on demand.

In [165]:
# Link generator for the directed graph, seeded for repeatable neighbour sampling.
generator = DirectedGraphSAGELinkGenerator(
    G,
    batch_size,
    in_samples=in_samples,
    out_samples=out_samples,
    seed=SEED,
)
# Training data is drawn on demand from the unsupervised sampler.
train_gen = generator.flow(unsupervised_samples)


Build the model: a 2-layer GraphSAGE encoder acting as node representation learner, with a link classification layer on concatenated (`src`, `dst`) node embeddings.

GraphSAGE part of the model, with hidden layer sizes of 32 for both GraphSAGE layers, a bias term, and a dropout rate of 0.3. (Any rate with 0 < dropout < 1 enables dropout; set `dropout=0` to switch it off.)
**Note that the length of the `layer_sizes` list must be equal to the length of the `in_samples` and `out_samples` lists, as the number of sample sizes defines the number of hops (layers) in the GraphSAGE encoder**.

In [173]:
# One entry per GraphSAGE layer; must match the number of sampled hops.
layer_sizes = [32, 32]

# Encoder configured from the link generator, with dropout and L2-normalised outputs.
graphsage = DirectedGraphSAGE(
    layer_sizes=layer_sizes,
    generator=generator,
    bias=True,
    dropout=0.3,
    normalize="l2",
    aggregator=sg.layer.MeanAggregator,
)

In [174]:
# Build the model and expose input and output sockets of graphsage, for node pair inputs:
# `x_inp` holds the encoder's input tensors and `x_out` its output tensors;
# later cells slice both to isolate the encoder for one side of each pair.
x_inp, x_out = graphsage.in_out_tensors()

In [175]:
# Link classification head: combines the two node embeddings of a pair with
# the 'concat' method and outputs one sigmoid score per pair.
edge_classifier = link_classification(
    output_dim=1,
    output_act="sigmoid",
    edge_embedding_method="concat",
)
prediction = edge_classifier(x_out)

link_classification: using 'concat' method to combine node embeddings into edge embeddings


In [176]:
# End-to-end node pair classifier: GraphSAGE inputs -> link prediction score.
model = keras.Model(inputs=x_inp, outputs=prediction)

# Binary cross-entropy on the positive/negative pair labels.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases no longer accept.
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-4),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

**4. Train the model.**

In [177]:
# Training options collected in one place so they are easy to tweak.
fit_options = dict(
    epochs=epochs,
    verbose=1,
    use_multiprocessing=False,
    workers=4,
    shuffle=True,
)

# Train on the on-demand node pair stream; `history` keeps the Keras training record.
history = model.fit(train_gen, **fit_options)

  ...
    to  
  ['...']
Train for 2760 steps
Epoch 1/20
Epoch 2/20
  14/2760 [..............................] - ETA: 9:17 - loss: 0.6883 - binary_accuracy: 0.5462

KeyboardInterrupt: 

### Extracting node embeddings
Now that the node pair classifier is trained, we can use its node encoder as a node embedding evaluator. Below we evaluate node embeddings as activations of the output of the GraphSAGE layer stack, and visualise them in 2D. (No node labels are loaded in this notebook, so the points are not coloured by label.)

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from stellargraph.mapper import GraphSAGENodeGenerator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

**Building a new node-based model**

The `(src, dst)` node pair classifier `model` has two identical node encoders: one for source nodes in the node pairs, the other for destination nodes in the node pairs passed to the model. We can use either of the two identical encoders to evaluate node embeddings. Below we create an embedding model by defining a new Keras model with `x_inp_src` (every other element of `x_inp`, starting with the first, i.e. `x_inp[0::2]`) and `x_out_src` (the 1st element in `x_out`) as input and output, respectively. Note that this model's weights are the same as those of the corresponding node encoder in the previously trained node pair classifier.

In [None]:
# Keep only the source-side encoder: its inputs are every other tensor of
# `x_inp` (starting at index 0) and its output is the first tensor of `x_out`.
x_inp_src, x_out_src = x_inp[::2], x_out[0]

# Same layers as the trained classifier, so the weights carry over.
embedding_model = keras.Model(outputs=x_out_src, inputs=x_inp_src)

In [None]:
# Debug aid: list the input tensors selected for the source-side encoder.
print(x_inp_src)

We also need a node generator to feed graph nodes to `embedding_model`. We want to evaluate node embeddings for all nodes in the graph:

In [None]:
# Re-display the edge list loaded at the top of the notebook.
df

In [None]:
# Embed every node that has a row in the feature table.
node_ids = feature_df.index

# The encoder is a DirectedGraphSAGE built from `in_samples`/`out_samples`, so
# the node generator must be the directed one with the same sample sizes.
# (`num_samples` is never defined in this notebook.)  The generator is reached
# through `sg.mapper`, so no extra import is needed; the seed makes the
# neighbour sampling repeatable.
node_gen = sg.mapper.DirectedGraphSAGENodeGenerator(
    G, batch_size, in_samples, out_samples, seed=SEED
).flow(node_ids)

In [None]:
# Run the embedding model over the node generator, with 4 workers and a
# progress display.
# NOTE(review): assumes the rows of the result follow the order of `node_ids`,
# which later cells use as the index -- confirm.
node_embeddings = embedding_model.predict(node_gen, workers=4, verbose=1)

#### Visualize the node embeddings 
Next we visualize the node embeddings in 2D using t-SNE. The nodes are not colored by class here, because no node labels are loaded in this notebook.

In [None]:
# Reduce the node embeddings to two dimensions for plotting.  Points carry no
# class label because no node labels are loaded in this notebook.

# Bound outside the branch: the plotting cell reads `transform.__name__` for
# its title, so the name must exist whichever branch runs.
transform = TSNE  # PCA can be used instead

X = node_embeddings
if X.shape[1] > 2:
    # Seeded so the 2D layout is the same on every run.
    trans = transform(n_components=2, random_state=SEED)
    emb_transformed = pd.DataFrame(trans.fit_transform(X), index=node_ids)
else:
    # Embeddings already have at most two columns: use them as they are.  The
    # columns are already the integers 0, 1, ..., so no renaming is needed.
    emb_transformed = pd.DataFrame(X, index=node_ids)

In [None]:
# Scatter plot of the 2D embeddings, one point per node.
alpha = 0.7

fig, ax = plt.subplots(figsize=(7, 7))
# No colour data (`c=`) is supplied, so no `cmap` is passed either: matplotlib
# ignores a colormap given without colour data and emits a warning.
ax.scatter(
    emb_transformed[0],
    emb_transformed[1],
    alpha=alpha,
)
ax.set(aspect="equal", xlabel="$X_1$", ylabel="$X_2$")
# The title names the data actually plotted (the GM12878 TF network).
plt.title(
    "{} visualization of GraphSAGE embeddings for the GM12878 TF network".format(
        transform.__name__
    )
)
plt.show()

In [None]:
# Preview the first rows of the 2D coordinates, indexed by node ID.
emb_transformed.head()