In [1]:
from neo4j import GraphDatabase
import pandas as pd

### **Database connection**

Here, the connection settings to link this script with Neo4j are setup

In [9]:
# Connection settings
uri = "neo4j://localhost:7687"

In [3]:
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, parameters=None, db=None):
        
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session() 
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response

In [10]:
pass = "pass"
conn=Neo4jConnection(uri=uri, user="neo4j", pass)

### **TASK C algorithms**

**Louvain algorithm**: Communiy detection. Detect the communities of authors defined by the conferences. I.e find if there are authors that are related by the fact that they often go to the same conferences.


In [13]:
createAuthorToAuthorGraph= """
    CALL gds.graph.create.cypher(
    'sameConferenceAuthorsGraph',
    'MATCH (n) WHERE n:Author 
    RETURN id(n) AS id, labels(n) AS labels',
    'MATCH (auth1:Author)<-[:written_by]-(:Article)-[:published_in]-> 
    (:Proceeding) -[:belongs_to]->(conf:Conference)<-[:belongs_to]-(:Proceeding)<-[:published_in]-(:Article)-[:written_by]->(auth2:Author)
    WHERE id(auth1) < id(auth2)
    WITH auth1, auth2
    RETURN id(auth1) AS source, id(auth2) AS target')
"""

conn.query(createAuthorToAuthorGraph)

[<Record nodeQuery='MATCH (n) WHERE n:Author \n    RETURN id(n) AS id, labels(n) AS labels' relationshipQuery='MATCH (auth1:Author)<-[:written_by]-(:Article)-[:published_in]-> \n    (:Proceeding) -[:belongs_to]->(conf:Conference)<-[:belongs_to]-(:Proceeding)<-[:published_in]-(:Article)-[:written_by]->(auth2:Author)\n    WHERE id(auth1) < id(auth2)\n    WITH auth1, auth2\n    RETURN id(auth1) AS source, id(auth2) AS target' graphName='sameConferenceAuthorsGraph' nodeCount=18201 relationshipCount=45542 createMillis=287>]

The algorithm is then launched:

In [18]:
louvain='''CALL gds.louvain.stream('sameConferenceAuthorsGraph')
YIELD nodeId, communityId
WITH DISTINCT communityId
RETURN count(communityId )'''
conn.query(louvain)

[<Record count(communityId )=16052>]

Trying different parameters

In [17]:
louvain='''CALL gds.louvain.stream('sameConferenceAuthorsGraph', {maxIterations:20, maxLevels:15, tolerance:001})
YIELD nodeId, communityId
WITH DISTINCT communityId
RETURN count(communityId )'''
conn.query(louvain)

[<Record count(communityId )=16154>]

In [19]:
louvain='''CALL gds.louvain.stream('sameConferenceAuthorsGraph', {maxIterations:10, maxLevels:15, tolerance:0.01})
YIELD nodeId, communityId
WITH DISTINCT communityId
RETURN count(communityId )'''
conn.query(louvain)

[<Record count(communityId )=16077>]

The number is always almost the same.

**Similarity algorithm**: articles similarity based on the topics they talk about. We want to find all the articles similar to the one "Parallel Machine Scheduling with Time Dependent Processing Times".


first, all the articles has to be check for similarity

In [20]:
createAuthorKeywordsGraph="""
    CALL gds.graph.create.cypher(
    'articleTopicsGraph',
    'MATCH (n) 
    WHERE n:Article OR n:Keyword 
    RETURN id(n) AS id, labels(n) AS labels',
    'MATCH (art:Article)-[:talks_about]->(k:Keyword)
    RETURN id(art) AS source, id(k) AS target')
"""

conn.query(createAuthorKeywordsGraph)

[<Record nodeQuery='MATCH (n) \n    WHERE n:Article OR n:Keyword \n    RETURN id(n) AS id, labels(n) AS labels' relationshipQuery='MATCH (art:Article)-[:talks_about]->(k:Keyword)\n    RETURN id(art) AS source, id(k) AS target' graphName='articleTopicsGraph' nodeCount=6439 relationshipCount=30623 createMillis=117>]

The algorithm is then launched:

In [21]:
similarity = """
CALL gds.nodeSimilarity.stream('articleTopicsGraph')
YIELD node1, node2, similarity
WHERE gds.util.asNode(node1).title="Evolutionary analysis of collaboration networks in the field of information systems."
RETURN gds.util.asNode(node2).title AS Article2, similarity
ORDER BY  similarity DESC"""

conn.query(similarity)

[<Record Article2='Neurobiological Models of Two-Choice Decision Making Can Be Reduced to a One-Dimensional Nonlinear Diffusion Equation.' similarity=0.3333333333333333>,
 <Record Article2='A State Secret - Dissertations in the German Democratic Republic.' similarity=0.2857142857142857>,
 <Record Article2='Editorial.' similarity=0.25>,
 <Record Article2='Guest Editorial: Special Section from the 11th International Conference on Quality Software (QSIC 2011).' similarity=0.2>,
 <Record Article2='Consistent Operations on a Spatial Data Structure.' similarity=0.2>,
 <Record Article2='The effects of mismatch in Gm-C polyphase filters.' similarity=0.2>,
 <Record Article2='MPlot - a server to analyze and visualize tertiary structure contacts and geometrical features of helical membrane proteins.' similarity=0.2>,
 <Record Article2='Structural and Dynamical Properties of Concentrated Aqueous NaOH Solutions: A Computer Simulation Study.' similarity=0.2>,
 <Record Article2='Variance-Based Risk E