In [1]:
import stellargraph as sg

print('Version: ' + sg.__version__)
try:
    sg.utils.validate_notebook_version("1.2.1")
except AttributeError:
    raise ValueError(
        f"This notebook requires StellarGraph version 1.2.1, but a different version {sg.__version__} is installed.  Please see <https://github.com/stellargraph/stellargraph/issues/1172>."
    ) from None

from neomodel import config, db

config.DATABASE_URL = 'neo4j://neo4j:l5IKrx07DGYdclK@151.106.35.64:7687'

from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import pairwise_distances
from sklearn import preprocessing

import numpy as np

from stellargraph.data import BiasedRandomWalk
from stellargraph import StellarGraph, IndexedArray

from gensim.models import Word2Vec

import warnings
import collections
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt

import pandas as pd

walk_length = 100  # maximum length of a random walk to use throughout this notebook


def jaccard_weights(graph, _subjects, edges):
    sources = graph.node_features(edges.source)
    targets = graph.node_features(edges.target)

    intersection = np.logical_and(sources, targets)
    union = np.logical_or(sources, targets)

    return intersection.sum(axis=1) / union.sum(axis=1)



Version: 1.2.1


In [2]:
results, meta = db.cypher_query(
    "MATCH(e:Entity {name:'Mona Lisa'})-[rel]->(e2:Entity)-[rel2]->(e3:Entity) return e.name,type(rel),e2.name,type(rel2),e3.name")


In [None]:
nodes, sources, targets, edge_types = [], [], [], []
for row in results:
    nodes.append(row[0]) if row[0] and row[0] not in nodes else None
    nodes.append(row[2]) if row[2] and row[2] not in nodes else None
    nodes.append(row[4]) if row[4] and row[4] not in nodes else None
    if row[0] and row[2] and row[0] not in sources and row[2] not in targets:
        sources.append(row[0])
        targets.append(row[2])
        edge_types.append(row[1])
    if row[2] and row[4] and row[2] not in sources and row[4] not in targets:
        sources.append(row[2])
        targets.append(row[4])
        edge_types.append(row[3])

nodes_ = IndexedArray(index=nodes)
edges = pd.DataFrame({
    'source': sources,
    'target': targets,
    'type': edge_types
}
)

G, subjects = StellarGraph(nodes_, edges, edge_type_column="type"), pd.Series(nodes)



In [2]:
query = """
MATCH(e:Entity{name: 'Mona Lisa'})-[rel]->(e2:Entity)-[rele2ins:`https://www.wikidata.org/wiki/Property:P31`]->(e2ins:Entity) 
WITH e,rel,e2,collect(e2ins) as e2InsNodes  UNWIND e2InsNodes as e2ins WITH * WHERE e2ins.wikidata_id in ['Q5','Q6256','Q515'] 
MATCH(e2)-[rel2]->(e3:Entity)-[rele3ins:`https://www.wikidata.org/wiki/Property:P31`]->(e3ins:Entity)
WITH e,rel,rel2,e2,e3,e2ins,collect(e3ins) as e3InsNodes  UNWIND e3InsNodes as e3ins WITH * WHERE e3ins.wikidata_id in ['Q5','Q6256','Q515']
MATCH(e3)-[rel3]->(e4:Entity)-[rele4ins:`https://www.wikidata.org/wiki/Property:P31`]->(e4ins:Entity)
WITH e,rel,rel2,rel3,e2,e3,e4,e2ins,e3ins,collect(e4ins) as e4InsNodes  UNWIND e4InsNodes as e4ins WITH * WHERE e4ins.wikidata_id in ['Q5','Q6256','Q515']
return  distinct e.name,type(rel),e2.name,e2ins.name,type(rel2),e3.name,e3ins.name,type(rel3),e4.name,e4ins.name

"""
results, meta = db.cypher_query(query
    )


In [3]:
nodes, edges, sources, targets, edge_types, subjects = {}, [],[], [], [], []
for row in results:
    node_group = []
    if row[3] in nodes:
        node_group = nodes[row[3]]
    else:
        nodes[row[3]] = node_group
    node_group.append(row[2]) if row[2] and row[2] not in node_group else None

    node_group = []
    if row[6] in nodes:
        node_group = nodes[row[6]]
    else:
        nodes[row[6]] = node_group
    node_group.append(row[5]) if row[5] and row[5] not in node_group else None

    node_group = []
    if row[9] in nodes:
        node_group = nodes[row[9]]
    else:
        nodes[row[9]] = node_group
    node_group.append(row[8]) if row[8] and row[8] not in node_group else None

    if row[2] and row[5] and not any(e['source'] == row[2] and e['target'] == row[5] for e in edges):
        edges.append({'source': row[2], 'target': row[5]})
        edge_types.append(row[4])

    if row[5] and row[8]  and not any(e['source'] == row[5] and e['target'] == row[8] for e in edges):
        edges.append({'source': row[5], 'target': row[8]})
        edge_types.append(row[7])

    if row[2] and row[3]:
        subjects.append({'node': row[2], 'subject': row[3]}) if not any(s['node'] == row[2] for s in subjects) else None
    if row[5] and row[6]:
        subjects.append({'node': row[5], 'subject': row[6]}) if not any(s['node'] == row[5] for s in subjects) else None
    if row[8] and row[9]:
        subjects.append({'node': row[8], 'subject': row[9]}) if not any(s['node'] == row[8] for s in subjects) else None

for node_key in nodes.keys():
    for node in nodes[node_key]:
        s = [s for s in subjects if s['node'] == node]
        if len(s) == 0:
            subjects.append({'node': node, 'subject': 'None'})

subjects_sorted = []
for node_key in nodes.keys():
    for node in nodes[node_key]:
        s = [s for s in subjects if s['node'] == node][0]
        subjects_sorted.append(s)

nodes_ = {}
for node_key in nodes:
    nodes_[node_key] = IndexedArray(index=nodes[node_key])

edges = pd.DataFrame({
    'source': [e['source'] for e in edges],
    'target': [e['target'] for e in edges],
    'type': edge_types
}
)

G,subjects = StellarGraph(nodes_, edges, edge_type_column="type"), pd.Series([s['subject'] for s in subjects_sorted])


In [4]:
print(G.info())

StellarGraph: Undirected multigraph
 Nodes: 282, Edges: 667

 Node types:
  Human: [202]
    Features: none
    Edge types: Human-http://dbpedia.org/property/after->Human, Human-http://dbpedia.org/property/before->Human, Human-http://dbpedia.org/property/canonizedBy->Human, Human-http://dbpedia.org/property/caption->Country, Human-http://dbpedia.org/property/constituency->City, ... (28 more)
  Country: [65]
    Features: none
    Edge types: Country-http://dbpedia.org/property/caption->Human, Country-http://dbpedia.org/property/deathPlace->Human, Country-http://dbpedia.org/property/establishedEvent->Human, Country-http://dbpedia.org/property/label->Country, Country-http://dbpedia.org/property/largestCity->City, ... (13 more)
  City: [15]
    Features: none
    Edge types: City-http://dbpedia.org/property/constituency->Human, City-http://dbpedia.org/property/largestCity->Country, City-https://www.wikidata.org/wiki/Property:P19->Human, City-https://www.wikidata.org/wiki/Property:P36->Cou

In [5]:
_, weights = G.edges(include_edge_weight=True)

wt, cnt = np.unique(weights, return_counts=True)

plt.figure(figsize=(10, 8))
plt.bar(wt, cnt, width=0.005, color="b")
plt.title("Edge weights histogram")
plt.ylabel("Count")
plt.xlabel("edge weights")
plt.xticks(np.linspace(0, 1, 10))

plt.show()

In [6]:
rw = BiasedRandomWalk(G)
walks = rw.run(
    nodes=G.nodes(),  # root nodes
    length=walk_length,  # maximum length of a random walk
    n=10,  # number of random walks per root node
    p=0.5,  # Defines (unormalised) probability, 1/p, of returning to source node
    q=2.0,  # Defines (unormalised) probability, 1/q, for moving away from source node
    weighted=False,  # for weighted random walks
    seed=42,  # random seed fixed for reproducibility
)
print("Number of random walks: {}".format(len(walks)))

Number of random walks: 2820


In [7]:
model = Word2Vec(
    walks,  window=5, min_count=0, sg=1, workers=1
)

In [8]:
# The embedding vectors can be retrieved from model.wv using the node ID as key.
# E.g., for node id '19231', the embedding vector is retrieved as
emb = model.wv["Italy"]
emb.shape

(100,)

In [26]:
# Retrieve node embeddings and corresponding subjects
node_ids = model.wv.index_to_key  # list of node IDs
node_indexes = [index for index,n in enumerate(node_ids)]
node_embeddings = (
    model.wv.vectors
)  # numpy.ndarray of size number of nodes times embeddings dimensionality
# the gensim ordering may not match the StellarGraph one, so rearrange
node_targets = subjects.loc[node_indexes].astype("category")
# node_targets = subjects.astype("category")

In [27]:
node_targets

0      Human
1      Human
2      Human
3      Human
4      Human
       ...  
277     City
278     City
279     City
280     City
281     City
Length: 282, dtype: category
Categories (3, object): ['City', 'Country', 'Human']

In [11]:
# Apply t-SNE transformation on node embeddings
tsne = TSNE(n_components=2, random_state=42)
node_embeddings_2d = tsne.fit_transform(node_embeddings)

In [12]:
# draw the points
alpha = 0.7

plt.figure(figsize=(10, 8))
plt.scatter(
    node_embeddings_2d[:, 0],
    node_embeddings_2d[:, 1],
    c=node_targets.cat.codes,
    cmap="jet",
    alpha=0.7,
)
plt.show()

In [28]:
# X will hold the 128-dimensional input features
X = node_embeddings
# y holds the corresponding target values
y = np.array(node_targets)

In [29]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=None, random_state=42
)
print(
    "Array shapes:\n X_train = {}\n y_train = {}\n X_test = {}\n y_test = {}".format(
        X_train.shape, y_train.shape, X_test.shape, y_test.shape
    )
)

Array shapes:
 X_train = (211, 100)
 y_train = (211,)
 X_test = (71, 100)
 y_test = (71,)


In [30]:
y_train

array(['Country', 'Human', 'Human', 'Human', 'Human', 'Human', 'Human',
       'Human', 'Human', 'Human', 'Country', 'Human', 'Human', 'Country',
       'Human', 'Human', 'Human', 'Country', 'Human', 'Human', 'Country',
       'Human', 'City', 'Human', 'Human', 'Human', 'Human', 'Country',
       'Human', 'Human', 'Human', 'Country', 'Human', 'Country', 'Human',
       'Human', 'Human', 'Human', 'Human', 'Human', 'Human', 'Human',
       'Human', 'Human', 'City', 'Human', 'Human', 'Human', 'Human',
       'Human', 'Country', 'Human', 'City', 'Human', 'Human', 'Human',
       'Country', 'Country', 'Country', 'Human', 'Human', 'Human', 'City',
       'Human', 'Human', 'Human', 'Human', 'Human', 'Human', 'Human',
       'Country', 'Human', 'Human', 'Human', 'Human', 'Human', 'Human',
       'Country', 'Human', 'Human', 'Human', 'City', 'Human', 'Human',
       'Human', 'Country', 'Human', 'Country', 'Country', 'Human',
       'Human', 'Human', 'Human', 'Country', 'Country', 'Country',
   

In [22]:
clf = LogisticRegressionCV(
    Cs=2,
    cv=2,
    tol=0.001,
    max_iter=1000,
    scoring="accuracy",
    verbose=False,
    multi_class="ovr",
    random_state=5434,
)
clf.fit(X_train, y_train)

TypeError: _repr_mimebundle_() takes 1 positional argument but 3 were given

In [None]:

print("Predicting...")
y_pred = clf.predict(X_test)

print("Accuracy %.2f" % accuracy_score(y_test, y_pred))
print(classification_report(y_test,y_pred))