# Part D

In [1]:
import sys
import random
from pprint import pprint as pp
random.seed(42)
sys.version

'3.7.4 (default, Oct 15 2019, 22:29:14) \n[GCC 7.4.0]'

In [2]:
import neo4j
import py2neo
print(neo4j.__version__)
print(py2neo.__version__)

1.7.6
4.3.0


In [3]:
%load_ext cypher
# https://ipython-cypher.readthedocs.io/en/latest/

In [5]:
from neo4j import GraphDatabase
from py2neo import Graph

# instantiate drivers
NEO4J_URI="bolt://localhost:7687"
gdb = GraphDatabase.driver(uri=NEO4J_URI, auth=None)
graph = Graph(NEO4J_URI)

In [6]:
graph.run("CALL algo.list();").data()[0]

{'name': 'algo.allShortestPaths.stream',
 'description': "CALL algo.allShortestPaths.stream(weightProperty:String{nodeQuery:'labelName', relationshipQuery:'relationshipName', defaultValue:1.0, concurrency:4}) YIELD sourceNodeId, targetNodeId, distance - yields a stream of {sourceNodeId, targetNodeId, distance}",
 'signature': 'algo.allShortestPaths.stream(propertyName :: STRING?, config = {} :: MAP?) :: (sourceNodeId :: INTEGER?, targetNodeId :: INTEGER?, distance :: FLOAT?)',
 'type': 'procedure'}

### Define research communities

The first thing to do is to find/define the research communities. A community is
defined by a set of keywords. Assume that the database community is defined through
the following keywords: 

> data management, indexing, data modeling, big data, data
processing, data storage and data querying.

Since we don't have these keywords in the database, we will have to create them

In [83]:
dbms_kw = ['data management',
     'indexing',
     'data modeling',
     'big data',
     'data processing',
     'data storage',
     'data querying']

In [84]:
%%cypher
MATCH (kw:Keyword)
    WHERE kw.name in ['data management',
     'indexing',
     'data modeling',
     'big data',
     'data processing',
     'data storage',
     'data querying']
DETACH DELETE kw
RETURN count(kw)

1 rows affected.


count(kw)
0


In [85]:
%%cypher
UNWIND ['data management',
     'indexing',
     'data modeling',
     'big data',
     'data processing',
     'data storage',
     'data querying'] as dbkw
MERGE (kw:Keyword {name: dbkw, fake: true})
RETURN kw

7 nodes created.
14 properties set.
7 labels added.


kw
"{'name': 'data management', 'fake': True}"
"{'name': 'indexing', 'fake': True}"
"{'name': 'data modeling', 'fake': True}"
"{'name': 'big data', 'fake': True}"
"{'name': 'data processing', 'fake': True}"
"{'name': 'data storage', 'fake': True}"
"{'name': 'data querying', 'fake': True}"


#### Assign papers to these keywords randomly

In [86]:
article_ids = graph.run("MATCH (a:Article) RETURN a.id").data()
article_ids[:5]

[{'a.id': '8fb9c95bf34a0f28dc05819cb4aada0cb94fe555'},
 {'a.id': '5cfdb256b6ae968374469bd36702ed341cfe9485'},
 {'a.id': '0fbe46932967ec0db80b18e70fa199fb652313ea'},
 {'a.id': 'c4062742b4e0d13cfa0e992fdf2cebf2eb71c415'},
 {'a.id': 'f218ce53248d756db61726985f73e6e8c109b3e2'}]

In [87]:
for aid in article_ids:
    for kw in random.sample(dbms_kw, random.randint(0, len(dbms_kw))):
        graph.run(
            """MATCH (a:Article), (kw:Keyword)
            WHERE a.id = $aid AND kw.name = $kwname
            MERGE (a)-[:CONTAINS]->(kw)
            ON CREATE SET kw.name = $kwname, kw.fake = true
            ON MATCH SET kw.name = $kwname, kw.fake = true""",
            aid = aid["a.id"],
            kwname = kw
        )

Next, we need to find the conferences and journals related to the database community
(i.e., are specific to the field of databases). Assume that if 90% of the papers published
in a conference/journal contain one of the keywords of the database community we
consider that conference/journal as related to that community.

#### Create a research community

In [88]:
graph.run("MATCH (rc:ResearchCommunity) DETACH DELETE rc").stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 0
labels_removed: 0
nodes_created: 0
nodes_deleted: 1
properties_set: 0
relationships_created: 0
relationships_deleted: 182

In [89]:
graph.run("MERGE (rc:ResearchCommunity {name: 'databases', fake: true}) RETURN rc").stats()

constraints_added: 0
constraints_removed: 0
contained_updates: True
indexes_added: 0
indexes_removed: 0
labels_added: 1
labels_removed: 0
nodes_created: 1
nodes_deleted: 0
properties_set: 2
relationships_created: 0
relationships_deleted: 0

In [90]:
%%cypher
MATCH (kw:Keyword)
WHERE kw.name IN ['data management',
     'indexing',
     'data modeling',
     'big data',
     'data processing',
     'data storage',
     'data querying']
return kw

7 rows affected.


kw
"{'name': 'data querying', 'fake': True}"
"{'name': 'indexing', 'fake': True}"
"{'name': 'data management', 'fake': True}"
"{'name': 'data modeling', 'fake': True}"
"{'name': 'big data', 'fake': True}"
"{'name': 'data processing', 'fake': True}"
"{'name': 'data storage', 'fake': True}"


In [91]:
%%cypher
MATCH (kw:Keyword)
WHERE kw.name IN ['data management',
     'indexing',
     'data modeling',
     'big data',
     'data processing',
     'data storage',
     'data querying']
WITH kw
MATCH (rc:ResearchCommunity)
WHERE rc.name = "databases"
MERGE p=(rc)-[r:CONTAINS]->(kw)
RETURN p

7 relationships created.


p
"[{'name': 'databases', 'fake': True}, {}, {'name': 'data querying', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'indexing', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'data management', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'data modeling', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'big data', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'data processing', 'fake': True}]"
"[{'name': 'databases', 'fake': True}, {}, {'name': 'data storage', 'fake': True}]"


get the count of all the articles published in a journal

In [92]:
%%cypher
MATCH (a:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume OR instance:Edition)
RETURN publication.name AS publication_name, LABELS(publication) AS publication_type, COUNT(a) AS total_articles
ORDER BY total_articles DESC
LIMIT 5

5 rows affected.


publication_name,publication_type,total_articles
Dr. Gregory Arnold,['Conference'],93
Lance Rose,['Conference'],90
Stephanie Hoffman,['Conference'],85
Laurie Carter,['Conference'],80
Rebecca Lane,['Conference'],79


In [93]:
%%cypher
MATCH (:ResearchCommunity {name: 'databases'})-[:CONTAINS]->(:Keyword)<-[:CONTAINS]-(article:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume OR instance:Edition)
WITH publication, COUNT(DISTINCT article) AS related_articles
RETURN 
    LABELS(publication) AS publication_type, 
    publication.name AS publication_name,
    related_articles
ORDER BY related_articles DESC
LIMIT 5

5 rows affected.


publication_type,publication_name,related_articles
['Conference'],Heather Allen,22
['Conference'],William Mcmillan,22
['Conference'],Richard Harper,21
['Conference'],Jason Bell,20
['Conference'],Mckenzie Green,20


We join both queries

In [94]:
MATCH (p:paper) -[:published_in]-> (v) -[:of]-> (j)
WHERE (v:volume or v:edition) and (j:journal or j:conference)
WITH j, count(p) as total_papers
MATCH (:community {name: 'database'})<-[:composes]-(:keyword) <-[:related_to]- (p:paper) -[:published_in]-> (v) -[:of]-> (j)
WHERE (v:volume or v:edition) and (j:journal or j:conference)
WITH j, total_papers, count(distinct p) as related_papers
WHERE toFloat(related_papers) / toFloat(total_papers) >= 0.9
MATCH (c:community {name:'database'})
CREATE (j) -[:related_to]-> (c)
RETURN j;

SyntaxError: invalid syntax (<ipython-input-94-2e137eef1989>, line 1)

In [97]:
%%cypher
MATCH (article:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume OR instance:Edition)
WITH instance, count(article) AS total_articles
MATCH (:ResearchCommunity {name: 'databases'})-[:CONTAINS]->(:Keyword)<-[:CONTAINS]-(article:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume or instance:Edition)
WITH publication, total_articles, COUNT(DISTINCT article) as related_articles
RETURN publication, related_articles, total_articles, toFloat(related_articles)/toFloat(total_articles) >= 0.9 AS is_relevant_for_community
ORDER BY total_articles DESC
LIMIT 5

5 rows affected.


publication,related_articles,total_articles,is_relevant_for_community
"{'review_policy_min_count': 3, 'name': 'Dr. Gregory Arnold', 'id': 80}",5,42,False
"{'review_policy_min_count': 3, 'name': 'Christopher Patterson', 'id': 26}",5,42,False
"{'review_policy_min_count': 3, 'name': 'Barbara Flores', 'id': 96}",5,42,False
"{'review_policy_min_count': 3, 'name': 'Steven Shaw', 'id': 22}",5,42,False
"{'review_policy_min_count': 3, 'name': 'Kelly Hubbard', 'id': 25}",5,42,False


In [98]:
%%cypher
MATCH (article:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume OR instance:Edition)

WITH instance, count(article) AS total_articles
MATCH (:ResearchCommunity {name: 'databases'})-[:CONTAINS]->(:Keyword)<-[:CONTAINS]-(article:Article)-[:PUBLISHED_IN]->(instance)-[:OF]->(publication)
WHERE (publication:Journal OR publication:Conference) AND (instance:Volume or instance:Edition)

WITH publication, total_articles, COUNT(DISTINCT article) as related_articles
WHERE toFloat(related_articles)/toFloat(total_articles) >= 0.9

MATCH (rc:ResearchCommunity {name: 'databases'})
MERGE p=(publication)-[:RELEVANT_TO]->(rc)

180 relationships created.


Next, we want to identify the top papers of these conferences/journals. We need to
find the papers with the highest page rank provided the number of citations from the
papers of the same community (papers in the conferences/journals of the database
community). As a result we would obtain (highlight), say, the top-100 papers of the
conferences of the database community.


In [102]:
%%cypher
CALL algo.pageRank.stream(
    'MATCH (c:ResearchCommunity {name: "databases"})<-[:RELEVANT_TO]-(publication)<-[:OF]-(instance)<-[:PUBLISHED_IN]-(article:Article) WHERE (instance:Volume OR instance:Edition) AND (publication:Journal OR publication:Conference) RETURN article',
    'CITED_BY', {iterations:20, dampingFactor:0.85})
YIELD nodeId, score
MATCH (a:Article) WHERE ID(a) = nodeId
WITH a, score
ORDER by score DESC
RETURN a.title, score
LIMIT 5

5 rows affected.


a.title,score
"""Bump"": using a mobile app to enhance learning in simulation scenarios.",1.3380366005469115
Linear Hypopigmentation After Triamcinolone Injection: A Rare Complication of a Common Procedure,1.2711560409981757
Oleanolic acid suppresses the proliferation of lung carcinoma cells by miR-122/Cyclin G1/MEF2D axis,1.2702508100308478
Acetic acid as a sclerosing agent for renal cysts: Comparison with ethanol in follow-up results,1.2594134361017493
Restoration of blue scratches in digital image sequences,1.2585701291449365


In [107]:
%%cypher
CALL algo.pageRank.stream(
    'MATCH (c:ResearchCommunity {name: "databases"})<-[:RELEVANT_TO]-(publication)<-[:OF]-(instance)<-[:PUBLISHED_IN]-(article:Article) WHERE (instance:Volume OR instance:Edition) AND (publication:Journal OR publication:Conference) RETURN article',
    'CITED_BY', {iterations:20, dampingFactor:0.85})
YIELD nodeId, score
MATCH (a:Article) WHERE ID(a) = nodeId
WITH a, score
ORDER by score DESC
WITH a, score LIMIT 100
MATCH (rc:ResearchCommunity {name: "databases"})
MERGE p=(a)-[:RECOMMENDED_TO]->(rc)
RETURN p, score;

100 rows affected.


p,score
"[{'doi_url': 'https://doi.org/10.1097/SIH.0b013e31825e8bcf', 'year': 2012, 'id': 'c104539460e4e4eac8b630ac81ef04d45a683286', 'title': '""Bump"": using a mobile app to enhance learning in simulation scenarios.', 'doi': '10.1097/SIH.0b013e31825e8bcf'}, {}, {'name': 'databases', 'fake': True}]",1.3380366005469115
"[{'doi_url': 'https://doi.org/10.1007/s00266-005-0131-z', 'year': 2005, 'id': '6d246d9ffb02ada8a3e3b4dc11cd14bd36468290', 'title': 'Linear Hypopigmentation After Triamcinolone Injection: A Rare Complication of a Common Procedure', 'doi': '10.1007/s00266-005-0131-z'}, {}, {'name': 'databases', 'fake': True}]",1.2711560409981757
"[{'doi_url': 'https://doi.org/10.1007/s11010-014-2228-7', 'year': 2014, 'id': '665c0f7c26ba4c5b99fea43ac85509200a76dfe8', 'title': 'Oleanolic acid suppresses the proliferation of lung carcinoma cells by miR-122/Cyclin G1/MEF2D axis', 'doi': '10.1007/s11010-014-2228-7'}, {}, {'name': 'databases', 'fake': True}]",1.2702508100308478
"[{'doi_url': 'https://doi.org/10.1007/s002700010039', 'year': 2000, 'id': '7f332daa1c7f6556c2094af68c512fa532349ec1', 'title': 'Acetic acid as a sclerosing agent for renal cysts: Comparison with ethanol in follow-up results', 'doi': '10.1007/s002700010039'}, {}, {'name': 'databases', 'fake': True}]",1.2594134361017493
"[{'doi_url': 'https://doi.org/10.1016/j.imavis.2006.04.013', 'year': 2008, 'id': '4a2a536bc9ed1d032a41a6c0507cfc0dd714bab6', 'title': 'Restoration of blue scratches in digital image sequences', 'doi': '10.1016/j.imavis.2006.04.013'}, {}, {'name': 'databases', 'fake': True}]",1.2585701291449365
"[{'doi_url': 'https://doi.org/10.1109/ICASSP.2007.366746', 'year': 2007, 'id': 'a17699a73ac3c6dc8bcec10ecc420857d88eb632', 'title': 'Signal Processing Model and Receiver Algorithms for a Higher Rate Multi-User TR-UWB System', 'doi': '10.1109/ICASSP.2007.366746'}, {}, {'name': 'databases', 'fake': True}]",1.230960787693039
"[{'doi_url': 'https://doi.org/10.1111/idh.12112', 'year': 2015, 'id': '7ec7fd995b8d81c5ccf58806569391efb520a5d2', 'title': 'Efficacy of dental floss impregnated with chlorhexidine on reduction of supragingival biofilm: a randomized controlled trial.', 'doi': '10.1111/idh.12112'}, {}, {'name': 'databases', 'fake': True}]",1.2180380919948224
"[{'doi_url': 'https://doi.org/10.1021/acsami.7b00092', 'year': 2017, 'id': 'a2dc6402747f633f5e25c4e37b9debe5efe0dc26', 'title': 'Enabling Inkjet Printed Graphene for Ion Selective Electrodes with Postprint Thermal Annealing.', 'doi': '10.1021/acsami.7b00092'}, {}, {'name': 'databases', 'fake': True}]",1.212353684892878
"[{'doi_url': 'https://doi.org/10.1016/j.engappai.2008.04.018', 'year': 2009, 'id': '2c86deb0e77a7fb9853472ec39749d209adb9440', 'title': 'Automatic semantic indexing of medical images using a web ontology language for case-based image retrieval', 'doi': '10.1016/j.engappai.2008.04.018'}, {}, {'name': 'databases', 'fake': True}]",1.1766117617487908
"[{'doi_url': 'https://doi.org/10.1159/000470896', 'year': 2017, 'id': '65d67e1d3f74b749dfdaa8debf902e337c92dbd0', 'title': 'Use of an Osteoblast Overload Damage Model to Probe the Effect of Icariin on the Proliferation, Differentiation and Mineralization of MC3T3-E1 Cells through the Wnt/β-Catenin Signalling Pathway.', 'doi': '10.1159/000470896'}, {}, {'name': 'databases', 'fake': True}]",1.1721596704795958


Finally, an author of any of these top-100 papers is automatically considered a potential
good match to review database papers. In addition, we want to identify gurus, i.e.,
very reputated authors that would be able to review for top conferences. We identify
gurus as those authors that are authors of, at least, two papers among the top-100
identified.


All the papers recommended to the research community are those in the top 100 of the PageRank algorithm. Who are the authors of these articles?

In [114]:
%%cypher
MATCH (a:Author)-[:AUTHORS]->(b:Article)-[:RECOMMENDED_TO]->(rc:ResearchCommunity {name: 'databases'})
RETURN a.name as author, b.title as article
LIMIT 5

5 rows affected.


author,article
P. M. Boland,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass
S. D. Goldstein,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass
D. M. Kastenberg,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass
B. J. O’Hara,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass
R. S. Dhillon,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass


Assign these authors to the research community, as potential reviewers

In [115]:
%%cypher
MATCH (a:Author)-[:AUTHORS]->(b:Article)-[:RECOMMENDED_TO]->(rc:ResearchCommunity {name: 'databases'})
WITH a, rc
MERGE (a)-[:REVIEWER_CANDIDATE]->(rc)

491 relationships created.


Now, check how many articles these authors authored

In [118]:
%%cypher
MATCH (a:Author)-[:AUTHORS]->(b:Article)-[:RECOMMENDED_TO]->(rc:ResearchCommunity {name: 'databases'})
RETURN a.name, b.title, count(b) as n_articles
ORDER BY n_articles DESC
LIMIT 5

5 rows affected.


a.name,b.title,n_articles
S. D. Goldstein,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass,1
D. M. Kastenberg,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass,1
B. J. O’Hara,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass,1
R. S. Dhillon,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass,1
P. M. Boland,Adenocarcinoma of the Prostate Presenting as an Obstructing Rectal Mass,1


In [120]:
%%cypher
MATCH (a:Author)-[:AUTHORS]->(b:Article)-[:RECOMMENDED_TO]->(rc:ResearchCommunity {name: 'databases'})
WITH a, count(b) as n_articles, rc
WHERE n_articles >= 2
MERGE (a)-[:GURU_OF]->(rc)

0 rows affected.


It seems there are no gurus, since no one has authored more than 2 articles