In [32]:
from neo4j import GraphDatabase
import pandas as pd

### **Database connection**

Here, the connection settings to link this script with Neo4j are setup

In [33]:
# Connection settings
uri = "neo4j://localhost:7687"
pass = "pass"
driver = GraphDatabase.driver(uri, auth=("neo4j", ))

### **Reccomender**

*Query 1*

Here we retrieve the articles with keywords associated to the DB community

In [34]:
def find_articles_by_keywords(tx):
    answer = tx.run("""\

        MATCH (a:Article)-[:talks_about]->(k:Keyword) 
        WHERE k.word IN ["data management", "indexing", "data modeling", "big data", "data processing", "data storage", "data querying"] 
        RETURN DISTINCT a
        
        """)
    articles = []
    for article in answer:
        articles.append(article)
    return articles


def get_database_community_articles():
    with driver.session() as session:
        articles = session.read_transaction(find_articles_by_keywords)
        articles_list = []

    for i in range(len(articles)):
        article_title = articles[i].get("a").get("title")
        articles_list.append(article_title)

    return pd.DataFrame(data=articles_list, columns=["article"])

In [35]:

database_community_articles = get_database_community_articles()
database_community_articles

Unnamed: 0,article
0,Predictive Non-equilibrium Social Science
1,Machine Friendly Machine Learning: Interpretat...
2,From depth image to semantic scene synthesis t...
3,Quantum Information Processing in Cavity-QED.
4,Distributed Reconstruction of Nonlinear Networ...
...,...
959,Haraka - Efficient Short-Input Hashing for Pos...
960,Sensor-based extraction of physical property m...
961,Learning a Weighted Classifier for Conditional...
962,Mitiq: A software package for error mitigation...


Create a node for the database community and relate the keywords to that node

In [36]:
# Create node of database community
def create_community(tx, community):
    tx.run("MERGE (n:Community {name: $community})", community=community)

with driver.session() as session:
    session.write_transaction(create_community, "database community")

In [37]:
# Create edges from keywords to the database community
def create_feature_of(tx, keywords, community):
    for i in range(len(keywords)):
        tx.run("MATCH (n: Keyword {word: $kname}), (m: Community {name: $cname}) MERGE (n) - [:feature_of] -> (m) ", kname=keywords[i], cname=community)

keywords = ["data management", "indexing", "data modeling", "big data", "data processing", "data storage", "data querying"]
community = "database community"

with driver.session() as session:
    session.write_transaction(create_feature_of, keywords, community)

*Query 2*

Here we retrieve the conferences/journals from the DB community

In [38]:
def find_journal_conf_by_keywords(tx):
    answer = tx.run("""\

    MATCH (conference_journal)<-[:belongs_to]-()<-[:published_in]-(a:Article)-[:talks_about]->(:Keyword)-[:feature_of]->(database_community:Community {name: "database community"})
    WITH DISTINCT conference_journal, a
    WITH conference_journal, count(a) AS community_articles
    MATCH (conference_journal)<-[:belongs_to]-()<-[:published_in]-(a:Article)
    WITH conference_journal, community_articles, count(a) AS total_articles
    WHERE community_articles/total_articles >=0.9
    RETURN conference_journal.name as conference_journal
        
    """)
    conferences_journals = []
    for conference_journal in answer:
        conferences_journals.append(conference_journal)
    return conferences_journals


def get_database_community_conferences_journals():
    with driver.session() as session:
        conferences_journals = session.read_transaction(find_journal_conf_by_keywords)
        conferences_jorunals_list = []

    for i in range(len(conferences_journals)):
        conference_journal_title = conferences_journals[i].get("conference_journal")
        conferences_jorunals_list.append(conference_journal_title)

    return conferences_jorunals_list


# Create edges from the obtained conferences/journals to the database community node
def create_is_part_of(tx, conferences_journals, community):
    # Create edges from conferences
    for i in range(len(conferences_journals)):
        tx.run("""\
            
            MATCH (conference:Conference {name: $name}), (community: Community {name: $community}) 
            CREATE (conference) - [:is_part_of] -> (community) 
            
            """, 
            community=community,
            name=conferences_journals[i]
            )

    # Create edges from journals
    for i in range(len(conferences_journals)):
        tx.run("""\
            
            MATCH (journal:Journal {name: $name}), (community: Community {name: $community}) 
            CREATE (journal) - [:is_part_of] -> (community) 
            
            """, 
            community=community,
            name=conferences_journals[i]
            )

In [39]:
# Get the conferences or journals belonging to the database community
conferences_journals = get_database_community_conferences_journals()
conference_journals_df = pd.DataFrame(data=conferences_journals, columns=["Conferences/Journals DB Community"])
# Add the corresponding edges to the graph between conferences/journals and the database community node
community = "database community"
with driver.session() as session:
    session.write_transaction(create_is_part_of, conferences_journals, community)
#Show results
conference_journals_df

Unnamed: 0,Conferences/Journals DB Community
0,Int. J. Spatial Data Infrastructures Res.
1,Int. J. Digit. Libr.
2,Des. Autom. Embed. Syst.
3,Trans. Data Priv.
4,J. Inf. Syst.
...,...
58,Proc. Natl. Acad. Sci. USA
59,AGIT Journal Angew. Geoinformatik
60,Int. J. Intell. Robotics Appl.
61,J. Log. Comput.


*Query 3*

In [40]:
def create_graph_pagerank(tx, community):
    answer = tx.run("""\

    CALL gds.graph.create.cypher(
    'DBCommunityGraph',
    'MATCH (art:Article)-[:talks_about]->(:Keyword)-[:feature_of]->(com:Community{name:"database community"}) MATCH (art)-[:published_in]->()-[:belongs_to]->()-[:is_part_of]->(com) RETURN DISTINCT id(art) AS id, labels(art) AS labels',
    'MATCH (art1:Article)-[:talks_about]->(:Keyword)-[:feature_of]->(com:Community{name:"database community"}) MATCH (art1)-[:published_in]->()-[:belongs_to]->()-[:is_part_of]->(com) MATCH (art2:Article)-[:talks_about]->(:Keyword)-[:feature_of]->(com:Community{name:"database community"}) MATCH (art2)-[:published_in]->()-[:belongs_to]->()-[:is_part_of]->(com) WHERE art1.article_id<art2.article_id MATCH (art1)-[:cited_by]-(art2) RETURN id(art1) AS source, id(art2) AS target'
    )
    """, 
    community=community
    )
    return answer

In [41]:
community = "database community"
with driver.session() as session:
    answer = session.write_transaction(create_graph_pagerank, community)
answer

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.create.cypher`: Caused by: java.lang.IllegalArgumentException: A graph with name 'DBCommunityGraph' already exists.}

Apply the page rank algorithm over the graph created

In [None]:
def find_top_articles_from_community(tx):
    answer = tx.run("""\

    // applying page rank
    CALL gds.pageRank.stream('DBCommunityGraph')
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId).title AS name, score
    ORDER BY score DESC
    WITH COLLECT({name:name, score:score}) as articles, COLLECT(score)[49] AS LastPlaceCitation
    WITH articles, LastPlaceCitation, reduce(draws=0, i in range(49,size(articles)-1) | CASE WHEN articles[i].score = LastPlaceCitation THEN draws+1 ELSE draws END) AS LastPlace
    WITH LastPlace, articles[0..(49+LastPlace)] AS articles
    UNWIND articles as Article_Scores
    RETURN Article_Scores.name AS name, Article_Scores.score AS score

    """)
    articles = []
    for article in answer:
        articles.append(article)
    return articles

In [None]:
with driver.session() as session:
    articles = session.read_transaction(find_top_articles_from_community)

In [None]:
# Create table with answer
cols=["article","score"]
articles_df = pd.DataFrame(columns=cols)
for i in range(len(articles)):
    data = [
        articles[i].get("name"),
        articles[i].get("score")
        ]
    new_df = pd.DataFrame([data], columns=cols)
    articles_df = pd.concat([articles_df, new_df])
articles_df

Unnamed: 0,article,score
0,Grpah DB new era 6,0.667662
0,Grpah DB new era 5,0.544848
0,Notes on Apple 4 Fonts.,0.470875
0,Grpah DB new era 4,0.414527
0,Automation in construction.,0.385875
...,...,...
0,A Complete Axiomatization for Blocks World.,0.15
0,Foreword.,0.15
0,MayaArch3D - webbasierte archäologische 3D-Vis...,0.15
0,Coverage optimization of visual sensor network...,0.15


Write the score as a property in the above article nodes

In [None]:
def write_rank(tx):
    answer = tx.run("""\

    CALL gds.pageRank.write('DBCommunityGraph',{writeProperty:"pageRankInDBComm"})

    """)
    return answer

In [None]:
with driver.session() as session:
    answer = session.write_transaction(write_rank)
answer

<neo4j.work.result.Result at 0x7fe7a8ed9c70>

*Query 4*

With the previously obtained top articles, find the corresponding authors (who will be the potential reviewers for the community)

In [None]:
top_articles = articles_df['article'].tolist()

In [None]:
def find_authors(tx, top_articles):
    answer = tx.run("""

    MATCH (author:Author)<-[:written_by]-(article:Article)
    WHERE article.title IN $top_articles
    RETURN DISTINCT author.author_id as author_id, author.name as author_name

    """, 
    top_articles=top_articles
    )
    
    authors = []
    for author in answer:
        authors.append(author)
    return authors

In [None]:
with driver.session() as session:
    authors = session.read_transaction(find_authors, top_articles)

In [None]:
# Create table with answer
cols=["author id", "author name"]
authors_df = pd.DataFrame(columns=cols)
for i in range(len(authors)):
    data = [
        authors[i].get("author_id"),
        authors[i].get("author_name")
        ]
    new_df = pd.DataFrame([data], columns=cols)
    authors_df = pd.concat([authors_df, new_df])
authors_df

Unnamed: 0,author id,author name
0,10086759,Thomas Wrede
0,10097024,Simon Geletta
0,10097023,Lendie Follett
0,10097025,Marcia Laugerman
0,9085809,Weiwei Xing
...,...,...
0,9063196,Xuan Yang
0,9644301,Guoliang Chen 0005
0,9343960,Terry Caelli
0,8974693,Horst Bunke


Now, from this list get the author gurus

In [44]:
def find_author_gurus(tx, top_articles):
    answer = tx.run("""

    MATCH (author:Author)<-[:written_by]-(article:Article)
    WHERE article.title IN $top_articles
    WITH author, count(article) AS articles_amount ORDER BY articles_amount DESC
    WHERE articles_amount >= 2
    RETURN author.name AS author, articles_amount
    
    """, 
    top_articles=top_articles
    )
    
    authors = []
    for author in answer:
        authors.append(author)
    return authors

In [45]:
with driver.session() as session:
    authors = session.read_transaction(find_author_gurus, top_articles)

In [46]:
# Create table with answer
cols=["author guru", "articles"]
authors_df = pd.DataFrame(columns=cols)
for i in range(len(authors)):
    data = [
        authors[i].get("author"),
        authors[i].get("articles_amount")
        ]
    new_df = pd.DataFrame([data], columns=cols)
    authors_df = pd.concat([authors_df, new_df])
authors_df

Unnamed: 0,author guru,articles
0,Chu-Sing Yang,4
0,Tien-Wen Sung 0001,4
0,Dana E. Hart,2


Assert this information into the graph by setting a property on the authors that are gurus

In [47]:
names = authors_df['author guru'].tolist()

# put a property guruOfDB
def set_property_guruOfDB(tx, names):
    answer = tx.run("""\

    MATCH (a:Author)
    WHERE a.name IN $names
    SET a.GuruOfDB=1
    
    """, 
    names=names
    )

with driver.session() as session:
    articles = session.write_transaction(set_property_guruOfDB, names)