In [389]:
import sys
sys.version

'3.7.4 (default, Oct 15 2019, 22:29:14) \n[GCC 7.4.0]'

In [390]:
import neo4j
import py2neo
print(neo4j.__version__)
print(py2neo.__version__)

1.7.6
4.3.0


In [408]:
from neo4j import GraphDatabase

# instantiate driver
NEO4J_URI="bolt://localhost:7687"
gdb = GraphDatabase.driver(uri=NEO4J_URI, auth=None)

## A.2 Loading Data


We will read data from dblp.uni-trier.de. From the XML's description of data in https://dblp.org/faq/16154937.html, the following elements are represented

> - article – An article from a journal or magazine.
> - inproceedings – A paper in a conference or workshop proceedings.
> - proceedings – The proceedings volume of a conference or workshop.
> - book – An authored monograph or an edited collection of articles.
> - incollection – A part or chapter in a monograph.
> - phdthesis – A PhD thesis.
> - mastersthesis – A Master's thesis. There are only very few Master's theses in dblp.
> - www – A web page. There are only very few web pages in dblp. See also the notes on person records.

We will rely on the script provided in https://github.com/ThomHurks/dblp-to-csv, and we will be removing some of the elements by editing from the `dtd` file. In particular we will be removing

- book
- incollection
- phdthesis
- masterthesis
- www

The script is then executed as

```bash
#!/bin/bash
./XMLToCSV.py --annotate --neo4j dblp-raw/dblp.xml dblp-raw/dblp_slim.dtd output_slim/output.csv --relations author:authored_by journal:published_in publisher:published_by school:submitted_at editor:edited_by cite:has_citation
```

and the `neo4j-admin import` command is

```bash
#!/bin/bash
neo4j-admin import --mode=csv --database=dblp_slim.db --delimiter ";" --array-delimiter "|" --id-type INTEGER --nodes:inproceedings "output_slim/output_inproceedings_header.csv,output_slim/output_inproceedings.csv" --nodes:article "output_slim/output_article_header.csv,output_slim/output_article.csv" --nodes:proceedings "output_slim/output_proceedings_header.csv,output_slim/output_proceedings.csv" --nodes:editor "output_slim/output_editor.csv" --relationships:edited_by "output_slim/output_editor_edited_by.csv" --nodes:publisher "output_slim/output_publisher.csv" --relationships:published_by "output_slim/output_publisher_published_by.csv" --nodes:journal "output_slim/output_journal.csv" --relationships:published_in "output_slim/output_journal_published_in.csv" --nodes:author "output_slim/output_author.csv" --relationships:authored_by "output_slim/output_author_authored_by.csv" --nodes:cite "output_slim/output_cite.csv" --relationships:has_citation "output_slim/output_cite_has_citation.csv"
```



By modifying the `dtd` file, we obtain a smaller graph
- node count from 9,985,270 to 7,338, 701
- relationship count from 19,917,751 to 17,079,387

Finally, running the scripts, we get something like this

![schema1](images/graph.png)

## Missing nodes and relationships

We are then missing the following nodes

- topics
- keywords
- journals
- volumes

and the following relationships

- topic -> has -> keywords
- article -> cited_by -> article
- author -> reviews -> article

## Faking citations

citations are hard to parse from xml data, so we will be randomly linking articles between them using the `cited_in` relationship

creating a relationship

```cypher
MATCH (a:article),(b:article)
WHERE ID(a) = 12 AND ID(b) = 13
CREATE (a)-[r:cited_by]->(b)
RETURN type(r)
```

deleting a relationship

```cypher
MATCH p=(:article)-[r:cited_by]->(:article) delete r
```

query relationships

```
MATCH p=(:article)-[r:cited_by]->(:article) RETURN p LIMIT 25
```

### Fake citations


Fetch existing articles IDs

In [410]:
q = "MATCH (n:article) RETURN ID(n) LIMIT 10000"

with gdb.session() as session:
    article_ids = [v[0] for v in session.run(q).values()]
    
article_ids[:5]

[2508965, 2508966, 2508967, 2508968, 2508969]

Optional, add proceedings and journals as citable elements

In [406]:
len(article_ids)

10000

#### Delete citations

In [415]:
# delete existing citations before inserting new ones
with gdb.session() as session:
    session.run("MATCH p=(:article)-[r:cited_by]->(:article) DELETE r ")

#### Create `cited_by` relationships

In [407]:
# https://neo4j.com/docs/driver-manual/1.7/sessions-transactions/#driver-transactions-transaction-functions

q_add_citation_rel_id = """MATCH (a:article),(b:article)
WHERE ID(a) = $id_a AND ID(b) = $id_b
CREATE (a)-[r:cited_by]->(b)
RETURN a, b"""


def add_citation_rel(driver, id_a, id_b):
    with driver.session() as session:
        # Caller for transactional unit of work
        return session.write_transaction(create_citation_rel, id_a, id_b)

# Simple implementation of the unit of work
def create_citation_rel(tx, id_a, id_b):
    return tx.run(q_add_citation_rel_id, id_a=id_a, id_b = id_b)

Add 500 relationships of type `cited_by`

In [416]:
random.seed(42)
# pick a sample of 500 papers, and make them be cited by other three papers at random
created = []

for article in random.sample(article_ids, 500):
    for citation in random.sample(article_ids, 3):
        if article != citation: # they can't cite themselves
            created.append((article, citation))
            add_citation_rel(gdb, id_a=article, id_b=citation)
        else:
            print(article, citation)
        

2542972 2542972


In [417]:
created[:5]

[(2527960, 2560941),
 (2527960, 2536901),
 (2527960, 2552365),
 (2509374, 2556357),
 (2509374, 2553167)]

#### Make articles `UNIQUE`

In [398]:
# no repeated articles
with gdb.session() as session:
    session.run("CREATE CONSTRAINT ON (n:article) ASSERT n.article IS UNIQUE")

#### Query citations

In [429]:
with gdb.session() as session:
    out = session.run("MATCH p=(:article)-[r:cited_by]->(:article) RETURN r LIMIT 5").values()
    for elem in out:
        print(elem)


[<Relationship id=17081502 nodes=(<Node id=2508971 labels=set() properties={}>, <Node id=2548988 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=17081501 nodes=(<Node id=2508971 labels=set() properties={}>, <Node id=2557826 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=17081500 nodes=(<Node id=2508971 labels=set() properties={}>, <Node id=2555890 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=17080549 nodes=(<Node id=2508974 labels=set() properties={}>, <Node id=2557687 labels=set() properties={}>) type='cited_by' properties={}>]
[<Relationship id=17080548 nodes=(<Node id=2508974 labels=set() properties={}>, <Node id=2545595 labels=set() properties={}>) type='cited_by' properties={}>]


### Fake keywords
It's not trivial to parse keywords and topics from data, so we will fake some topics and random keywords using the `faker` library, as explained in http://www.jexp.de/blog/html/create_random_data.html

In [419]:
from faker import Faker
from faker.providers import lorem

fake = Faker()
fake.seed_instance(42)
fake.add_provider(lorem)

In [420]:
# +-100 fake, non repeated keywords
fake_keywords = [(ix, word) for ix, word in enumerate(list({fake.sentence(nb_words=3) for _ in range(100)}), start=1)]
len(fake_keywords)

100

In [421]:
fake_keywords[:10]

[(1, 'Behavior benefit.'),
 (2, 'Nation strong.'),
 (3, 'Campaign little.'),
 (4, 'Last everything.'),
 (5, 'Better present music.'),
 (6, 'Water beat magazine.'),
 (7, 'War real.'),
 (8, 'Lead upon.'),
 (9, 'Article finish anyone.'),
 (10, 'Page southern role.')]

In [423]:
# delete keywords
with gdb.session() as session:
    for ix, keyword in fake_keywords:
        session.run("MATCH (n:keyword) DELETE n")

In [424]:
q_create_keyword = "CREATE (:keyword {id:$id, keyword:$keyword})"

with gdb.session() as session:
    for ix, keyword in fake_keywords:
        session.run(q_create_keyword, id=ix, keyword=keyword)

In [425]:
# create UNIQUE constraint
with gdb.session() as session:
    session.run("CREATE CONSTRAINT ON (n:keyword) ASSERT n.keyword IS UNIQUE")

#### Assign 5 keywords to 1000 articles, at random

In [439]:
# delete relationships
with gdb.session() as session:
    session.run("MATCH p=(:article)-[r:has_keyword]->(:keyword) delete r")

In [440]:
# assign 5 keywords randomly to 1000 articles
random.seed(42)

q_add_keywords = """MATCH (a:article),(b:keyword)
WHERE ID(a) = $article_id AND b.keyword = $keyword
CREATE (a)-[r:has_keyword]->(b)
RETURN a, b"""

created_kw_rel = []

with gdb.session() as session:
    for article_id in random.sample(article_ids, 1000):
        for ix, keyword in random.sample(fake_keywords, 5):
            created_kw_rel.append((article_id, keyword))
            session.run(q_add_keywords, article_id=article_id, keyword=keyword)

In [441]:
created_kw_rel[:5]

[(2527960, 'Necessary into act.'),
 (2527960, 'You available defense.'),
 (2527960, 'Prove reduce.'),
 (2527960, 'Coach magazine.'),
 (2527960, 'Economy traditional anything.')]

In [432]:
with gdb.session() as session:
    out = session.run("MATCH p=(:article)-[r:has_keyword]->(:keyword) RETURN r LIMIT 5").values()
    for elem in out:
        print(elem)

## A.3 Evolving the graph 