In [4]:
import sys
sys.version

'3.7.4 (default, Oct 15 2019, 22:29:14) \n[GCC 7.4.0]'

In [72]:
import random

import neo4j
import py2neo
print(neo4j.__version__)
print(py2neo.__version__)

1.7.6
4.3.0


In [373]:
from neo4j import GraphDatabase

# Bolt endpoint of the Neo4j instance this notebook talks to.
NEO4J_URI = "bolt://localhost:7687"

# Driver handle that the cells below open their sessions from.
# auth=None: no credentials are supplied when connecting.
gdb = GraphDatabase.driver(
    uri=NEO4J_URI,
    auth=None,
)

## Fake citations


In [313]:
# Collect the internal node ids (ID(n)) of at most 10000 article nodes.
# The query has no ORDER BY, so which nodes are returned, and in what order,
# is left to the database.
q = "MATCH (n:article) RETURN ID(n) LIMIT 10000"

with gdb.session() as session:
    rows = session.run(q).values()
    # each row holds a single column: the node id
    article_ids = [row[0] for row in rows]

article_ids[:5]

[12, 13, 14, 15, 16]

In [285]:
len(article_ids)

10000

### Create cited_by relationships

In [329]:
# https://neo4j.com/docs/driver-manual/1.7/sessions-transactions/#driver-transactions-transaction-functions

# Both statements create a directed relationship (a)-[:cited_by]->(b) between
# two article nodes and return the pair; they differ only in how the two
# endpoints are looked up.

# Endpoints matched on the `article` property.
q_add_citation_rel = "\n".join([
    "MATCH (a:article),(b:article)",
    "WHERE a.article = $id_a AND b.article = $id_b",
    "CREATE (a)-[r:cited_by]->(b)",
    "RETURN a, b",
])

# Endpoints matched on the internal node id, ID(n).
q_add_citation_rel_id = "\n".join([
    "MATCH (a:article),(b:article)",
    "WHERE ID(a) = $id_a AND ID(b) = $id_b",
    "CREATE (a)-[r:cited_by]->(b)",
    "RETURN a, b",
])



def add_citation_rel(driver, id_a, id_b):
    """Create one cited_by relationship inside a write transaction.

    Opens a session on ``driver`` and hands ``create_citation_rel`` to
    ``session.write_transaction`` together with the two ids.

    :param driver: object exposing ``session()`` (here the Neo4j driver)
    :param id_a: id of the node the relationship starts from
    :param id_b: id of the node the relationship points to
    :return: whatever ``write_transaction`` returns for the unit of work
    """
    with driver.session() as db_session:
        # write_transaction invokes the unit of work with a transaction object
        outcome = db_session.write_transaction(create_citation_rel, id_a, id_b)
    return outcome

# Unit of work executed inside the write transaction
def create_citation_rel(tx, id_a, id_b):
    """Run the id-based citation statement on transaction ``tx``.

    :param tx: transaction object exposing ``run(query, **params)``
    :param id_a: bound to ``$id_a`` in ``q_add_citation_rel_id``
    :param id_b: bound to ``$id_b`` in ``q_add_citation_rel_id``
    :return: the value returned by ``tx.run``
    """
    query_params = {"id_a": id_a, "id_b": id_b}
    return tx.run(q_add_citation_rel_id, **query_params)

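Add `cited_by` relationships: 500 sampled articles, each linked to up to three randomly chosen articles (at most 1,500 relationships in total)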

In [325]:
# Seed the global RNG, then draw 500 distinct article ids without replacement.
random.seed(42)
article_id_sample_citing = random.sample(article_ids, k=500)

In [330]:
# This cell does not reseed the RNG; it continues from whatever state the
# global `random` generator is in when the cell runs.
# For each of the 500 sampled articles, draw three random articles and link
# them with a cited_by relationship. A draw that hits the article itself is
# skipped, so an article may end up with fewer than three relationships.

for article in article_id_sample_citing:
    # sample from `article_ids`, the full id list fetched earlier
    for citation in random.sample(article_ids, 3):
        if article != citation: # they can't cite themselves
            add_citation_rel(gdb, id_a=article, id_b=citation)
        

### Make articles `UNIQUE`

In [331]:
# Uniqueness constraint: no two article nodes may share the same `article` value.
q_unique_article = "CREATE CONSTRAINT ON (n:article) ASSERT n.article IS UNIQUE"

with gdb.session() as session:
    session.run(q_unique_article)

### Delete citations

In [288]:
# Cleanup: delete every cited_by relationship between two article nodes.
# The nodes themselves are left in place.
q_delete_citations = "MATCH p=(:article)-[r:cited_by]->(:article) DELETE r "

with gdb.session() as session:
    session.run(q_delete_citations)

### Query citations

In [334]:
# Fetch up to five cited_by relationships and print one per line.
q_sample_citations = "MATCH p=(:article)-[r:cited_by]->(:article) RETURN r LIMIT 5"

with gdb.session() as session:
    out = session.run(q_sample_citations).values()

for row in out:
    print(row)

[<Relationship id=19920006 nodes=(<Node id=18 labels=set() properties={}>, <Node id=379 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=19920005 nodes=(<Node id=18 labels=set() properties={}>, <Node id=162 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=19920004 nodes=(<Node id=18 labels=set() properties={}>, <Node id=350 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=19919497 nodes=(<Node id=21 labels=set() properties={}>, <Node id=259 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=19919672 nodes=(<Node id=47 labels=set() properties={}>, <Node id=391 labels=set() properties={}>) type='cited_by' properties={}>]


## Fake topics and keywords

In [340]:
fake.sentence?

Signature: fake.sentence(nb_words=6, variable_nb_words=True, ext_word_list=None)
Docstring:
Generate a random sentence
:example 'Lorem ipsum dolor sit amet.'

:param nb_words: around how many words the sentence should contain
:param variable_nb_words: set to false if you want exactly ``nb``
    words returned, otherwise the result may include a number of words
    of ``nb`` +/-40% (with a minimum of 1)
:param ext_word_list: a list of words you would like to have instead of
    'Lorem ipsum'.

:rtype: str
File:      ~/.pyenv/versions/3.7.4/envs/od/lib/python3.7/site-packages/faker/providers/lorem/__init__.py
Type:      method


In [341]:
from faker import Faker
from faker.providers import lorem

fake = Faker()
fake.seed_instance(42)  # seed this Faker instance's generator
fake.add_provider(lorem)

# 30 fake topics as (id, sentence) pairs, with ids running from 1 to 30
N_FAKE_TOPICS = 30
fake_topics = [
    (ix, fake.sentence(nb_words=3, variable_nb_words=True, ext_word_list=None))
    for ix in range(1, N_FAKE_TOPICS + 1)
]

In [365]:
# Up to 100 distinct fake keywords as (id, word) pairs, with ids starting at 1.
# 100 words are drawn and de-duplicated through a set, so fewer than 100 may remain.
# The unique words are sorted before ids are assigned: iterating the raw set
# would follow Python's per-process string-hash order, which would make the
# id -> keyword mapping change between kernel restarts.
unique_words = sorted({fake.word() for _ in range(100)})
fake_keywords = list(enumerate(unique_words, start=1))
len(fake_keywords)

97

http://www.jexp.de/blog/html/create_random_data.html

In [367]:
# One keyword node per (id, keyword) pair in fake_keywords.
q_create_keyword = "CREATE (:keyword {id:$id, keyword:$keyword})"

with gdb.session() as session:
    for keyword_id, keyword_text in fake_keywords:
        session.run(q_create_keyword, id=keyword_id, keyword=keyword_text)

In [366]:
# Cleanup: delete every keyword node. The statement already matches all
# keyword nodes, so it is run once rather than once per entry of fake_keywords.
# This is a plain DELETE (not DETACH DELETE): Neo4j rejects it while any
# keyword node still has relationships attached.
with gdb.session() as session:
    session.run("MATCH (n:keyword) DELETE n ")

In [368]:
# Uniqueness constraint: no two keyword nodes may share the same `keyword` value.
q_unique_keyword = "CREATE CONSTRAINT ON (n:keyword) ASSERT n.keyword IS UNIQUE"

with gdb.session() as session:
    session.run(q_unique_keyword)

### Assign 5 keywords to 1000 articles, at random

In [372]:
# Attach 5 randomly chosen keywords to each of 1000 randomly chosen articles.
random.seed(42)

# `article_ids` holds internal node ids (it was collected with RETURN ID(n)),
# so the article endpoint is matched with ID(a) rather than on the `article`
# property. Keywords are matched on their `keyword` text.
q_add_keywords = """MATCH (a:article),(b:keyword)
WHERE ID(a) = $article AND b.keyword = $keyword
CREATE (a)-[r:has_keyword]->(b)
RETURN a, b"""

with gdb.session() as session:
    for article in random.sample(article_ids, 1000):
        # fake_keywords holds (id, keyword) pairs; only the keyword text is used
        for _, keyword in random.sample(fake_keywords, 5):
            session.run(q_add_keywords, article=article, keyword=keyword)

In [371]:
# Cleanup: delete every has_keyword relationship from an article to a keyword.
# The article and keyword nodes themselves are left in place.
q_delete_has_keyword = "MATCH p=(:article)-[r:has_keyword]->(:keyword) delete r"

with gdb.session() as session:
    session.run(q_delete_has_keyword)