# Prerequisites

In [None]:
import os

import dotenv
import neo4j
import umap

In [2]:
# Load variables from a local .env file (if any) into the process environment
# before the credentials are read below.
dotenv.load_dotenv()

# Connection settings shared by every query cell in this notebook.
URI = "neo4j://localhost:7687"
DB = "graphml"
# Either element is None when the corresponding variable is not set.
AUTH = (os.getenv("DB_USER"), os.getenv("DB_PASS"))

# Connect to Neo4j

*Resources*

1. [Graph modeling guidelines](https://neo4j.com/docs/getting-started/data-modeling/guide-data-modeling/#_defining_properties)
2. [Graph management](https://neo4j.com/docs/graph-data-science/current/management-ops/)
3. [Cypher query language manual](https://neo4j.com/docs/cypher-manual/current/clauses/match/#_match_on_an_undirected_relationship)
4. [Python neo4j-driver API docs](https://neo4j.com/docs/api/python-driver/current/api.html)

In [3]:
# Sanity check: count every node in the database.
query = """
MATCH(n)
RETURN count(*)
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    print(result.to_df())

   count(*)
0     50426


In [27]:
# Sanity check: count every directed relationship, regardless of type.
query = """
MATCH ()-[r]->() 
RETURN COUNT(r) AS num_edges
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    print(result.to_df())

   num_edges
0  241247854


## Example Query

In [4]:
# Fetch a small sample of (source)-[:similarTo]->(target) paper pairs.
query = """
MATCH (source:Paper)-[:similarTo]->(target:Paper)
RETURN source, target LIMIT 25
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # expand=True flattens each returned node into one column per
    # label/property instead of a single opaque Node object per cell.
    data = result.to_df(expand=True)

data

Unnamed: 0,source().element_id,source().labels,source().prop.docId,source().prop.title,source().prop.category,target().element_id,target().labels,target().prop.docId,target().prop.title,target().prop.category
0,4:edabd16a-8496-4317-9593-1b1d3f62481e:5270,(Paper),1901.07299,SIMCom: Statistical Sniffing of Inter-Module C...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
1,4:edabd16a-8496-4317-9593-1b1d3f62481e:5194,(Paper),1901.06091,Transfer Learning and Meta Classification Base...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
2,4:edabd16a-8496-4317-9593-1b1d3f62481e:5031,(Paper),1901.03968,A Fully Bayesian Infinite Generative Model for...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
3,4:edabd16a-8496-4317-9593-1b1d3f62481e:3804,(Paper),1901.02415,SNRA: A Spintronic Neuromorphic Reconfigurable...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
4,4:edabd16a-8496-4317-9593-1b1d3f62481e:2448,(Paper),1812.05555,Kalman-based Spectro-Temporal ECG Analysis usi...,eess,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
5,4:edabd16a-8496-4317-9593-1b1d3f62481e:2386,(Paper),1812.04618,DCASE 2018 Challenge: Solution for Task 5,eess,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
6,4:edabd16a-8496-4317-9593-1b1d3f62481e:315,(Paper),1811.07625,Joint reconstruction and prediction of random ...,stat,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
7,4:edabd16a-8496-4317-9593-1b1d3f62481e:145,(Paper),1811.0566,MT-CGCNN: Integrating Crystal Graph Convolutio...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
8,4:edabd16a-8496-4317-9593-1b1d3f62481e:50452,(Paper),1811.0017,PerceptionNet: A Deep Convolutional Neural Net...,cs,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs
9,4:edabd16a-8496-4317-9593-1b1d3f62481e:50056,(Paper),1810.0993,Machine Learning Accelerated Likelihood-Free E...,astro,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1811.04133,Integrating Recurrence Dynamics for Speech Emo...,cs


# Neo4j Graph Data Science

## Projecting Graph

This takes the `Paper` nodes and `similarTo` relationships from the database and loads them into an in-memory graph projection for use with `neo4j-gds`.

In [5]:
# Project the Paper / similarTo subgraph into the GDS graph catalog under the
# name 'papers'. Every algorithm cell below runs against this projection.
query = """
CALL gds.graph.project(
  'papers',
  'Paper',
  'similarTo'
) YIELD
  graphName AS graph, nodeProjection, nodeCount AS nodes, relationshipProjection, relationshipCount AS rels
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    # gds.graph.project raises if a graph named 'papers' is already in the
    # catalog, which made this cell fail on any re-run against a live server.
    # Drop a previous projection first; the second argument
    # (failIfMissing=false) turns the drop into a no-op on a fresh catalog.
    # NOTE: this forces a full re-projection, which is expensive on a graph
    # of this size.
    session.run("CALL gds.graph.drop('papers', false)").consume()

    result = session.run(query)
    print(result.data())

[{'graph': 'papers', 'nodeProjection': {'Paper': {'label': 'Paper', 'properties': {}}}, 'nodes': 50426, 'relationshipProjection': {'similarTo': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'similarTo', 'properties': {}}}, 'rels': 241247854}]


## Node Importance

### Degree Centrality

In [6]:
# Top 10 papers by degree centrality on the 'papers' projection.
# nodeId is used as a tie-breaker so the ordering is deterministic.
query = """
CALL gds.degree.stream('papers') 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score as rels
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,title,id,rels
0,Bayesian feature selection with strongly-regul...,1411.0591,26048.0
1,Bayesian optimisation for likelihood-free cosm...,1805.07152,25271.0
2,Toward Implicit Sample Noise Modeling: Deviati...,1610.09274,25002.0
3,Bayesian Sparse Global-Local Shrinkage Regress...,1709.04333,24737.0
4,Support Estimation via Regularized and Weighte...,1901.07506,24546.0
5,Longitudinal LASSO: Jointly Learning Features ...,1610.08013,24468.0
6,Smoothed Functional Algorithms for Stochastic ...,1206.4832,24376.0
7,Sparse Identification and Estimation of High-D...,1707.09208,24237.0
8,Adaptive penalization in high-dimensional regr...,1811.02962,24114.0
9,Tests for qualitative features in the random c...,1704.01066,24083.0


### PageRank

In [7]:
# Top 10 papers by PageRank score on the 'papers' projection
# (procedure called without a config map).
query = """
CALL gds.pageRank.stream('papers') 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,title,id,score
0,Bayesian feature selection with strongly-regul...,1411.0591,4.105463
1,Bayesian optimisation for likelihood-free cosm...,1805.07152,4.009198
2,Support Estimation via Regularized and Weighte...,1901.07506,3.954374
3,Toward Implicit Sample Noise Modeling: Deviati...,1610.09274,3.888974
4,Bayesian Sparse Global-Local Shrinkage Regress...,1709.04333,3.883532
5,Sparse Identification and Estimation of High-D...,1707.09208,3.866324
6,Adaptive penalization in high-dimensional regr...,1811.02962,3.835741
7,Smoothed Functional Algorithms for Stochastic ...,1206.4832,3.835681
8,Covariate-Adjusted Tensor Classification in Hi...,1805.04421,3.833962
9,Longitudinal LASSO: Jointly Learning Features ...,1610.08013,3.812858


### Betweenness Centrality

Betweenness Centrality can be very resource-intensive to compute. To help with this, it is possible to approximate the results using a sampling technique. The configuration parameters `samplingSize` and `samplingSeed` are used to control the sampling.

In [8]:
# Top 10 papers by *approximate* betweenness centrality.
# samplingSize / samplingSeed enable the sampling mode described above;
# the fixed seed keeps the sample repeatable between runs.
query = """
CALL gds.betweenness.stream('papers', {samplingSize: 5, samplingSeed: 43}) 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,title,id,score
0,Fast Direct Methods for Gaussian Processes,1403.6015,677.141451
1,The LASSO with Non-linear Measurements is Equi...,1506.02181,499.705632
2,Data assimilation for massive autonomous syste...,1603.0416,482.859802
3,Solving Systems of Random Quadratic Equations ...,1605.08285,477.713513
4,Exponential decay of reconstruction error from...,1407.8246,395.331423
5,Fast Convergence for Stochastic and Distribute...,1803.02922,314.02806
6,On testing for high-dimensional white noise,1808.03545,242.352105
7,Ten Steps of EM Suffice for Mixtures of Two Ga...,1609.00368,241.860372
8,The composite absolute penalties family for gr...,909.0411,235.650979
9,Sample complexity of population recovery,1702.05574,206.980924


## Community Detection

### Louvain

In [9]:
# Stream Louvain community assignments and preview the first 10 rows,
# ordered by community id and then title.
# intermediateCommunityIds is yielded by the procedure but not returned.
query = """
CALL gds.louvain.stream('papers') 
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).title AS title, communityId
ORDER BY communityId, title
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,title,communityId
0,Discovering heterogeneous subpopulations for f...,21
1,Managing App Install Ad Campaigns in RTB: A Q-...,39
2,Detection of REM Sleep Behaviour Disorder by A...,59
3,Prediction of Alzheimer's disease-associated g...,80
4,A conjugate prior for the Dirichlet distribution,106
5,Few-shot Learning for Named Entity Recognition...,123
6,Some Moderate Deviations for Ewens-Pitman Samp...,178
7,Adversarial Examples from Cryptographic Pseudo...,201
8,Entropy-regularized Optimal Transport Generati...,227
9,Machine Learning for Health (ML4H) Workshop at...,275


In [10]:
# Number of communities Louvain finds on the 'papers' projection
# (stats mode: only the summary column is returned).
query = """
CALL gds.louvain.stats('papers') 
YIELD communityCount
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,communityCount
0,2300


### Label Propagation

In [11]:
# Stream Label Propagation community assignments and preview the first
# 10 rows, ordered by community id and then title.
query = """
CALL gds.labelPropagation.stream('papers') 
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).title AS title, communityId
ORDER BY communityId, title
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,title,communityId
0,$A$-Hypergeometric Distributions and Newton Po...,5
1,$A^{4}NT$: Author Attribute Anonymity by Adver...,5
2,$C_p$ criterion for semiparametric approach in...,5
3,$D$-optimal saturated designs: a simulation study,5
4,$E$-optimal designs for second-order response ...,5
5,$HS^2$: Active Learning over Hypergraphs,5
6,$K$-sample omnibus non-proportional hazards te...,5
7,$L^2$ Asymptotics for High-Dimensional Data,5
8,$L^p$-Wasserstein distance for stochastic diff...,5
9,$L^p$-norm inequality using q-moment and its a...,5


In [12]:
# Number of communities Label Propagation finds on the 'papers' projection
# (stats mode: only the summary column is returned).
query = """
CALL gds.labelPropagation.stats('papers') 
YIELD communityCount
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,communityCount
0,2264


## Adding Community Ids

We can use the `write` execution mode of both community detection algorithms to add these properties to the nodes within the database.

In [13]:
# Run Louvain in write mode: each node's community id is stored on the node
# as the `louvainId` property; summary metrics are returned.
query = """
CALL gds.louvain.write('papers', { writeProperty: 'louvainId' })
YIELD communityCount, modularity, modularities
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,communityCount,modularity,modularities
0,2299,0.202463,[0.20246337598917208]


In [15]:
# Run Label Propagation in write mode: each node's community id is stored on
# the node as the `labelPropId` property; run diagnostics are returned.
# NOTE(review): the recorded output shows didConverge=False after 10
# iterations - consider whether a higher iteration limit is needed.
query = """
CALL gds.labelPropagation.write('papers', { writeProperty: 'labelPropId' })
YIELD communityCount, ranIterations, didConverge
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df()

data

Unnamed: 0,communityCount,ranIterations,didConverge
0,2263,10,False


In [25]:
# List every paper whose written `labelPropId` is 5.
# NOTE(review): in the recorded run this returned ~47.9k of the 50,426 papers,
# i.e. a single community holds almost the whole graph.
query = """
MATCH (source:Paper {labelPropId: 5})
RETURN source.title as Title, source.labelPropId as Community
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Convert to a DataFrame while the session is still open.
    data = result.to_df(expand=True)

data

Unnamed: 0,Title,Community
0,Integrating Recurrence Dynamics for Speech Emo...,5
1,Relative Error RKHS Embeddings for Gaussian Ke...,5
2,Median Confidence Regions in a Nonparametric M...,5
3,Complex Unitary Recurrent Neural Networks usin...,5
4,Simulation of the energy efficiency auction pr...,5
...,...,...
47945,AttS2S-VC: Sequence-to-Sequence Voice Conversi...,5
47946,Surrogate Modeling of Stochastic Functions - A...,5
47947,Second order Stein: SURE for SURE and other ap...,5
47948,Policy Regret in Repeated Games,5
