# Prerequisites

In [None]:
import os

import dotenv
import neo4j
import umap

In [69]:
# Load variables from a local .env file (if any) before reading credentials.
dotenv.load_dotenv()

# Connection settings shared by every query cell below.
DB = "graphml"
URI = "neo4j://localhost:7687"
# Credentials come from the environment; either entry is None when unset.
AUTH = (os.environ.get("DB_USER"), os.environ.get("DB_PASS"))

# Connect to Neo4j

*Resources*

1. [Graph modeling guidelines](https://neo4j.com/docs/getting-started/data-modeling/guide-data-modeling/#_defining_properties)
2. [Graph management](https://neo4j.com/docs/graph-data-science/current/management-ops/)
3. [Cypher query language manual](https://neo4j.com/docs/cypher-manual/current/clauses/match/#_match_on_an_undirected_relationship)
4. [Python neo4j-driver API docs](https://neo4j.com/docs/api/python-driver/current/api.html)

In [90]:
# Count every node in the database, regardless of label.
query = """
MATCH(n)
RETURN count(*)
"""

# Driver and session are both released when the `with` block exits.
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    print(result.to_df())

   count(*)
0     50426


In [71]:
# Count every directed relationship in the database.
query = """
MATCH ()-[r]->() 
RETURN COUNT(r) AS num_edges
"""

# Driver and session are both released when the `with` block exits.
with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    print(result.to_df())

   num_edges
0    4456505


## Example Query

In [72]:
# Sample 25 (source, target) Paper pairs linked by a similarTo relationship.
query = """
MATCH (source:Paper)-[:similarTo]->(target:Paper)
RETURN source, target LIMIT 25
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # expand=True spreads each returned node over several columns
    # (element id, labels, one column per property).
    data = result.to_df(expand=True)

data

Unnamed: 0,source().element_id,source().labels,source().prop.docId,source().prop.title,source().prop.category,target().element_id,target().labels,target().prop.docId,target().prop.title,target().prop.category
0,4:edabd16a-8496-4317-9593-1b1d3f62481e:48453,(Paper),1808.01527,Deep Reinforcement One-Shot Learning for Artif...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
1,4:edabd16a-8496-4317-9593-1b1d3f62481e:47520,(Paper),1807.02078,Goal-oriented Trajectories for Efficient Explo...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
2,4:edabd16a-8496-4317-9593-1b1d3f62481e:42491,(Paper),1802.01697,Deep Learning with a Rethinking Structure for ...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
3,4:edabd16a-8496-4317-9593-1b1d3f62481e:47001,(Paper),1806.07569,A Distributed Second-Order Algorithm You Can T...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
4,4:edabd16a-8496-4317-9593-1b1d3f62481e:43899,(Paper),1803.08089,Incremental Learning-to-Learn with Statistical...,stat.ML,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
5,4:edabd16a-8496-4317-9593-1b1d3f62481e:34779,(Paper),1702.01229,Simple to Complex Cross-modal Learning to Rank,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
6,4:edabd16a-8496-4317-9593-1b1d3f62481e:31005,(Paper),1606.04443,A scalable end-to-end Gaussian process adapter...,stat.ML,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
7,4:edabd16a-8496-4317-9593-1b1d3f62481e:24541,(Paper),1502.0589,Contextual Semibandits via Supervised Learning...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:0,(Paper),1810.02927,Q-map: a Convolutional Approach for Goal-Orien...,cs.LG
8,4:edabd16a-8496-4317-9593-1b1d3f62481e:50405,(Paper),1810.02266,Concept-drifting Data Streams are Time Series;...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:1,(Paper),1810.0295,Mining Novel Multivariate Relationships in Tim...,cs.LG
9,4:edabd16a-8496-4317-9593-1b1d3f62481e:44870,(Paper),1804.09619,Identifying and Alleviating Concept Drift in S...,cs.LG,4:edabd16a-8496-4317-9593-1b1d3f62481e:1,(Paper),1810.0295,Mining Novel Multivariate Relationships in Tim...,cs.LG


# Neo4j Graph Data Science

## Projecting Graph

This takes the database and loads its nodes/edges into working memory for use with `neo4j-gds`.

### Encoding Categories

`GDS` only allows for numeric properties in projected graphs, so we'll encode categories as integers.

In [108]:
# Collect the distinct category strings found on Paper nodes.
query = """
MATCH (p:Paper)
RETURN DISTINCT p.category
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    # Single-column frame: one row per distinct category.
    data = result.to_df(expand=True)

data

Unnamed: 0,p\.category
0,cs.LG
1,math.ST
2,math.PR
3,stat.ME
4,stat.ML
...,...
134,math.RT
135,q-bio.TO
136,nucl-ex
137,nlin.PS


In [112]:
# Integer id -> category, numbered in the order the categories appear in `data`.
id2cat = {idx: cat for idx, cat in enumerate(data.iloc[:, 0].unique())}
# Reverse lookup: category -> integer id.
cat2id = dict(zip(id2cat.values(), id2cat.keys()))

### Add CategoryId

In [92]:
# Write the integer encoding back to the database as `categoryId`.
# The values are passed as Cypher parameters ($category, $cat_id) rather than
# being interpolated into the query text: this keeps category strings that
# contain quotes from breaking (or injecting into) the statement, and lets the
# server reuse one query plan for every iteration.
query = "MATCH (n {category: $category}) SET n.categoryId = $cat_id"

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=DB) as session:
        for category, cat_id in cat2id.items():
            # consume() waits for the write to finish, so a failure is raised
            # here rather than at some later point in the session.
            session.run(query, category=category, cat_id=int(cat_id)).consume()

### Graph Projection

In [95]:
# Project Paper nodes (with their numeric categoryId) and similarTo
# relationships into the in-memory GDS graph catalog under the name 'papers'.
#
# gds.graph.project raises if a graph named 'papers' already exists, which
# makes the notebook fail on a second run against the same database. Dropping
# it first (failIfMissing = false, so a missing graph is not an error) keeps
# this cell re-runnable.
drop_query = "CALL gds.graph.drop('papers', false) YIELD graphName"

query = """
CALL gds.graph.project(
  'papers',
  {Paper: {properties: 'categoryId'}},
  'similarTo'
) YIELD
  graphName AS graph, nodeProjection, nodeCount AS nodes, relationshipProjection, relationshipCount AS rels
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database=DB) as session:
        # Remove any stale projection before creating a fresh one.
        session.run(drop_query).consume()
        result = session.run(query)
        print(result.data())

[{'graph': 'papers', 'nodeProjection': {'Paper': {'label': 'Paper', 'properties': {'categoryId': {'defaultValue': None, 'property': 'categoryId'}}}}, 'nodes': 50426, 'relationshipProjection': {'similarTo': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'similarTo', 'properties': {}}}, 'rels': 4456505}]


## Node Importance

### Degree Centrality

In [96]:
# Degree centrality on the 'papers' projection: the ten highest-scoring papers,
# with ties broken by descending nodeId.
query = """
CALL gds.degree.stream('papers') 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score as rels
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,title,id,rels
0,Smoothed Functional Algorithms for Stochastic ...,1206.4832,4101.0
1,Asymptotic efficiency and finite-sample proper...,903.34,4050.0
2,Bayesian feature selection with strongly-regul...,1411.0591,3975.0
3,Fast Selection of Spectral Variables with B-Sp...,709.3639,3281.0
4,Active Clustering with Model-Based Uncertainty...,1402.1783,3256.0
5,Efficient Marginal Likelihood Computation for ...,1110.6546,3247.0
6,Random Forests for Metric Learning with Implic...,1201.061,3035.0
7,Estimation of the covariance matrix of random ...,803.4112,3026.0
8,Efficient Estimation of Nonlinear Finite Popul...,1201.1375,3022.0
9,Resolution and Scale Independent Function Matc...,1003.4741,2938.0


### PageRank

In [97]:
# PageRank on the 'papers' projection: the ten highest-scoring papers,
# with ties broken by descending nodeId.
query = """
CALL gds.pageRank.stream('papers') 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,title,id,score
0,Component selection and smoothing in multivari...,math/0702659,255.266898
1,Functional deconvolution in a periodic setting...,math/0703903,241.501271
2,Alignment Metric Accuracy,q-bio/0510052,173.382701
3,Quasi-arithmetic means of covariance functions...,math/0611275,139.764679
4,Support Vector Machines with Applications,math/0612817,116.203664
5,Identifying evolutionary trees and substitutio...,q-bio/0702050,102.521261
6,A Closed-Form Approximation of Likelihood Func...,physics/0703180,102.470406
7,SigSpec - I. Frequency- and Phase-Resolved Sig...,physics/0703160,98.480819
8,High-dimensional classification using features...,math/0701108,95.189203
9,Deductive semiparametric estimation in Double-...,1902.11147,93.308373


### Betweenness Centrality

Betweenness Centrality can be very resource-intensive to compute. To help with this, it is possible to approximate the results using a sampling technique. The configuration parameters `samplingSize` and `samplingSeed` are used to control the sampling.

In [98]:
# Approximate betweenness centrality on the 'papers' projection using
# samplingSize 5 and a fixed samplingSeed of 43; keep the ten top scores.
query = """
CALL gds.betweenness.stream('papers', {samplingSize: 5, samplingSeed: 43}) 
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS title, gds.util.asNode(nodeId).docId AS id, score
ORDER BY score DESC, nodeId DESC
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,title,id,score
0,Smoothed Functional Algorithms for Stochastic ...,1206.4832,2097.953285
1,Extrapolation of Urn Models via Poissonization...,1109.299,1258.665376
2,Nonparametric sparsity and regularization,1208.2572,1179.435909
3,Inference on Treatment Effects After Selection...,1201.0224,966.178451
4,Bayesian feature selection with strongly-regul...,1411.0591,947.283387
5,Mixtures of equispaced normal distributions an...,1204.4544,885.999416
6,Transitional annealed adaptive slice sampling ...,1509.00349,880.659322
7,Batch Bayesian Optimization via Local Penaliza...,1505.08052,871.750774
8,Active Clustering with Model-Based Uncertainty...,1402.1783,865.536016
9,On Constrained Spectral Clustering and Its App...,1201.5338,861.628466


## Community Detection

### Louvain

In [99]:
# Louvain community detection seeded with categoryId; show the first ten
# (title, communityId) rows ordered by community and then title.
query = """
CALL gds.louvain.stream('papers', {seedProperty: 'categoryId'}) 
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).title AS title, communityId
ORDER BY communityId, title
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,title,communityId
0,An Importance Sampling Scheme for Models in a ...,0
1,An Importance Sampling Scheme on Dual Factor G...,0
2,Decorrelated Jet Substructure Tagging using Ad...,0
3,Finding Density Functionals with Machine Learning,0
4,Full-pulse Tomographic Reconstruction with Dee...,0
5,Iteratively Training Look-Up Tables for Networ...,0
6,Less is more: sampling chemical space with act...,0
7,Low-dose cryo electron ptychography via non-co...,0
8,Machine Learning in High Energy Physics Commun...,0
9,Machine Learning of coarse-grained Molecular D...,0


In [100]:
# Number of communities Louvain finds on the 'papers' projection.
# NOTE(review): no seedProperty is passed here, whereas the Louvain stream and
# write calls in this notebook use seedProperty: 'categoryId' - confirm the
# unseeded configuration is intended.
query = """
CALL gds.louvain.stats('papers') 
YIELD communityCount
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,communityCount
0,16150


### Label Propagation

In [101]:
# Label propagation seeded with categoryId; show the first ten
# (title, communityId) rows ordered by community and then title.
query = """
CALL gds.labelPropagation.stream('papers', {seedProperty: 'categoryId'}) 
YIELD nodeId, communityId
RETURN gds.util.asNode(nodeId).title AS title, communityId
ORDER BY communityId, title
LIMIT 10
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,title,communityId
0,$HS^2$: Active Learning over Hypergraphs,0
1,$\beta$-VAEs can retain label information even...,0
2,$\ell_0$-Motivated Low-Rank Sparse Subspace Cl...,0
3,$\ell_1$-regression with Heavy-tailed Distribu...,0
4,"$l_{2,p}$ Matrix Norm and Its Application in F...",0
5,"(q,p)-Wasserstein GANs: Comparing Ground Metri...",0
6,2-D Embedding of Large and High-dimensional Da...,0
7,3D G-CNNs for Pulmonary Nodule Detection,0
8,3D Steerable CNNs: Learning Rotationally Equiv...,0
9,3LC: Lightweight and Effective Traffic Compres...,0


In [102]:
# Number of communities label propagation finds on the 'papers' projection.
# NOTE(review): no seedProperty is passed here, whereas the label propagation
# stream and write calls in this notebook use seedProperty: 'categoryId' -
# confirm the unseeded configuration is intended.
query = """
CALL gds.labelPropagation.stats('papers') 
YIELD communityCount
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,communityCount
0,17206


## Adding Community Ids

We can use the `write` execution mode of both community detection algorithms to add these properties to nodes within the database.

In [103]:
# Run seeded Louvain in write mode, storing each node's community in the
# `louvainId` property, and report the summary statistics it yields.
query = """
CALL gds.louvain.write('papers', {writeProperty: 'louvainId', seedProperty: 'categoryId'})
YIELD communityCount, modularity, modularities
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,communityCount,modularity,modularities
0,71,0.114315,"[0.09132657180640755, 0.11431485494419837]"


In [104]:
# Run seeded label propagation in write mode, storing each node's community in
# the `labelPropId` property, and report the summary statistics it yields.
query = """
CALL gds.labelPropagation.write('papers', {writeProperty: 'labelPropId', seedProperty: 'categoryId'})
YIELD communityCount, ranIterations, didConverge
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df()

data

Unnamed: 0,communityCount,ranIterations,didConverge
0,131,10,True


## Quick Glance at Community Detection Results

In [126]:
# Fetch the titles of all papers whose louvainId is 105.
query = """
MATCH (source:Paper {louvainId: 105})
RETURN source.title as Title, source.louvainId as Community
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df(expand=True)

# Show only the first 15 titles to keep the output short.
data["Title"].head(15).tolist()

['Q-map: a Convolutional Approach for Goal-Oriented Reinforcement Learning',
 'Mining Novel Multivariate Relationships in Time Series Data Using Correlation Networks',
 'Low rank spatial econometric models',
 'Understanding Recurrent Neural Architectures by Analyzing and Synthesizing Long Distance Dependencies in Benchmark Sequential Datasets',
 'Anytime Stochastic Gradient Descent: A Time to Hear from all the Workers',
 'h-detach: Modifying the LSTM Gradient Towards Better Optimization',
 'Learning to Optimize under Non-Stationarity',
 'Discretizing Logged Interaction Data Biases Learning for Decision-Making',
 'Robust variance estimation and inference for causal effect estimation',
 'Constructing Graph Node Embeddings via Discrimination of Similarity Distributions',
 'Why do Larger Models Generalize Better? A Theoretical Perspective via the XOR Problem',
 'Artificial Intelligence for Diabetes Case Management: The Intersection of Physical and Mental Health',
 'Subspace Tracking from M

In [119]:
# Fetch the titles of all papers whose labelPropId is 5.
query = """
MATCH (source:Paper {labelPropId: 5})
RETURN source.title as Title, source.labelPropId as Community
"""

with neo4j.GraphDatabase.driver(URI, auth=AUTH) as driver, \
        driver.session(database=DB) as session:
    result = session.run(query)
    data = result.to_df(expand=True)

# Show only the first 15 titles to keep the output short.
data["Title"].head(15).tolist()

['Artificial Intelligence for Diabetes Case Management: The Intersection of Physical and Mental Health',
 'PepCVAE: Semi-Supervised Targeted Design of Antimicrobial Peptide Sequences',
 'Getting started in probabilistic graphical models',
 'Notes on the UK Non-Native Organism Risk Assessment Scheme',
 'Discovering heterogeneous subpopulations for fine-grained analysis of opioid use and opioid use disorders',
 'Structure-Based Networks for Drug Validation',
 'Reconstructing probabilistic trees of cellular differentiation from single-cell RNA-seq data',
 'Deep Bayesian Uncertainty Estimation for Adaptation and Self-Annotation of Food Packaging Images',
 'Voice Disorder Detection Using Long Short Term Memory (LSTM) Model',
 'A Robust Deep Learning Approach for Automatic Seizure Detection',
 'Generalization error for decision problems',
 'Feature-Wise Bias Amplification',
 'Drug cell line interaction prediction',
 'Learning Undirected Posteriors by Backpropagation through MCMC Updates',
 '