In [1]:
import os
import sys
import configparser

project_root = os.path.abspath(os.path.join(os.getcwd(), "../../"))
sys.path.append(project_root)

from yfiles_jupyter_graphs_for_neo4j import Neo4jGraphWidget
from database.neo4j_db import Neo4jGraphDB

In [2]:
# Load configuration, database, and visualization
config = configparser.ConfigParser()
config.read('../../config.ini')
neo4j_graph = Neo4jGraphDB()
driver = Neo4jGraphDB()._driver
jg = Neo4jGraphWidget(driver)

# Full-text Search for Similarity

In [3]:
# Lucene Index + SorensenDice
full_index_query = """
    MATCH (p:PersonRecord )
    WHERE p.fullName = "Leo Appelbaum"
    WITH p, p.fullName as name,
        apoc.text.split(apoc.text.replace(p.fullName ,'[^a-zA-Z0-9\\s]', ''), "\\s+") as name_words
    WHERE size(name_words) > 0
    CALL db.index.fulltext.queryNodes(
        "person_record_fullName",
        apoc.text.join([x IN name_words | trim(x) + "~0.65"], " AND ")
    )
    YIELD node, score
    WITH p, name, node
    WHERE p <> node
    WITH p, node, apoc.text.sorensenDiceSimilarity(name, node.fullName) as simil
    WHERE simil > 0.695
    RETURN node.id as recordId, node.fullName as recordName, simil 
    """
with driver.session() as session:
    result = session.run(full_index_query)

    for record in result:
        print(f"ID: {record['recordId']}, Name: {record['recordName']}, Score: {record['simil']} ")

ID: 3109330, Name: Leo Appelbaum, Score: 1.0 
ID: 3144501, Name: Lee D Applbaum, Score: 0.7368421052631579 
ID: 3205309, Name: Lee D. Applbaum, Score: 0.7 
ID: 3000000, Name: Leo Appelbaum, Score: 1.0 
ID: 3144501, Name: Lee D Applbaum, Score: 0.7368421052631579 
ID: 3205309, Name: Lee D. Applbaum, Score: 0.7 


In [4]:
# Lucene Index
index_query = """
    MATCH (p:PersonRecord )
    WHERE p.fullName = "Leo Appelbaum"
    WITH p, p.fullName as name,
        apoc.text.split(apoc.text.replace(p.fullName ,'[^a-zA-Z0-9\\s]', ''), "\\s+") as name_words
    WHERE size(name_words) > 0
    CALL db.index.fulltext.queryNodes(
        "person_record_fullName",
        apoc.text.join([x IN name_words | trim(x) + "~0.65"], " AND ")
    )
    YIELD node, score
    RETURN node.id as recordId, node.fullName as recordName, score as simil 
    """
with driver.session() as session:
    result = session.run(index_query)

    for record in result:
        print(f"ID: {record['recordId']}, Name: {record['recordName']}, Score: {record['simil']} ")

ID: 3000000, Name: Leo Appelbaum, Score: 9.165863037109375 
ID: 3109330, Name: Leo Appelbaum, Score: 9.165863037109375 
ID: 3144501, Name: Lee D Applbaum, Score: 5.777987480163574 
ID: 3205309, Name: Lee D. Applbaum, Score: 5.777987480163574 
ID: 3000000, Name: Leo Appelbaum, Score: 9.165863037109375 
ID: 3109330, Name: Leo Appelbaum, Score: 9.165863037109375 
ID: 3144501, Name: Lee D Applbaum, Score: 5.777987480163574 
ID: 3205309, Name: Lee D. Applbaum, Score: 5.777987480163574 


# Community Detection: Weakly Connected Component vs Louvain

In [5]:
# Highly connected Person to the related records
highest_records_query = """
    MATCH (pr:PersonRecord)-[:RECORD_RESOLVED_TO]->(p:Person)
    RETURN size(collect(pr)) as numberOfRecords, p.name as personName
    ORDER BY numberOfRecords DESC
    LIMIT 10
"""

with driver.session() as session:
    result = session.run(highest_records_query)

    for record in result:
        print(f"Number of Records: {record['numberOfRecords']}, Name: {record['personName']}")

Number of Records: 33353, Name: Li Ji
Number of Records: 8893, Name: None
Number of Records: 2918, Name: Ali Ali
Number of Records: 836, Name: R Patel
Number of Records: 423, Name: John Mark
Number of Records: 311, Name: Kathleen
Number of Records: 157, Name: Mark Ma
Number of Records: 148, Name: Business Owner
Number of Records: 113, Name: John Cole
Number of Records: 101, Name: Mark Allen


In [6]:
# Showing bridges between highly-connected records
jg.show_cypher("MATCH path=(:PersonRecord)-[:IS_SIMILAR_TO]-(:PersonRecord)-[:IS_SIMILAR_TO]-(pr:PersonRecord)-[:RECORD_RESOLVED_TO]->(p:Person) WHERE p.name = 'Li Ji' RETURN path LIMIT 100")

GraphWidget(layout=Layout(height='800px', width='100%'))

In [7]:
# Number of WCC communities vs Number of Louvain communities
wcc_louvain_query = """
    MATCH (pr:PersonRecord)-[:RECORD_RESOLVED_TO]->(p:Person) WHERE p.name = 'Li Ji'
    RETURN DISTINCT size(collect(DISTINCT pr.componentId)) as wccId, size(collect(DISTINCT pr.louvain)) as louvainId 
"""

with driver.session() as session:
    result = session.run(wcc_louvain_query)

    for record in result:
        print(f"Number of WCC communities: {record['wccId']}, Number of Louvain communities: {record['louvainId']}")


Number of WCC communities: 1, Number of Louvain communities: 4677
