In [1]:
import json
from neo4j import GraphDatabase
import numpy as np
import pandas as pd
from pprint import pprint
import time
from tqdm import tqdm

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)

## Senzing resolved data

Prior to running this notebook, you need to have run the entity resolution within the Senzing Docker container.  Here I have export the results of that run to the file `entities.json`.  You can see below what that file looks like.  We will use this then to create relationships between resolved and related entities.

In [2]:
data = []

with open('./entities.json') as f:
    for line in f:
        data.append(json.loads(line))

In [3]:
pprint(data[0])

{'RELATED_ENTITIES': [{'ENTITY_ID': 14802,
                       'ERRULE_CODE': 'MFF',
                       'IS_AMBIGUOUS': 0,
                       'IS_DISCLOSED': 0,
                       'MATCH_KEY': '+ADDRESS+GEO_LOC-PLACEKEY',
                       'MATCH_LEVEL': 3,
                       'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED',
                       'RECORDS': [{'DATA_SOURCE': 'SAFEGRAPH',
                                    'RECORD_ID': '228-223@5yv-j2j-2x5'}]},
                      {'ENTITY_ID': 37854,
                       'ERRULE_CODE': 'MFF',
                       'IS_AMBIGUOUS': 0,
                       'IS_DISCLOSED': 0,
                       'MATCH_KEY': '+ADDRESS+GEO_LOC-PLACEKEY',
                       'MATCH_LEVEL': 3,
                       'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED',
                       'RECORDS': [{'DATA_SOURCE': 'SAFEGRAPH',
                                    'RECORD_ID': 'zzw-222@5yv-j2j-2kz'}]},
                      {'ENTITY_ID': 5211

## Resolved entities

For ease, I will create a dataframe just of the resolved entities and then one for the related entities.  Note that there can be multiple entities assigned to each resolved entity (see the example 2 lines down).  

Note that for ease in visualization (among other things), I am going to create a new node type called `:Entity`, which will contain the resolved entities.  This will serve as a sort of "anchor node" for the clusters we will see form in graph.

In [4]:
resolved_ls = []

for el in data:
    if len(el['RESOLVED_ENTITY']['RECORDS']) > 1:
        resolved_ls.append(el)

In [5]:
pprint(resolved_ls[3])

{'RELATED_ENTITIES': [{'ENTITY_ID': 2465,
                       'ERRULE_CODE': 'CFF',
                       'IS_AMBIGUOUS': 0,
                       'IS_DISCLOSED': 0,
                       'MATCH_KEY': '+GEO_LOC-PLACEKEY',
                       'MATCH_LEVEL': 3,
                       'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED',
                       'RECORDS': [{'DATA_SOURCE': 'SAFEGRAPH',
                                    'RECORD_ID': '224-225@5yv-j8b-btv'}]},
                      {'ENTITY_ID': 41280,
                       'ERRULE_CODE': 'CFF',
                       'IS_AMBIGUOUS': 0,
                       'IS_DISCLOSED': 0,
                       'MATCH_KEY': '+GEO_LOC-PLACEKEY',
                       'MATCH_LEVEL': 3,
                       'MATCH_LEVEL_CODE': 'POSSIBLY_RELATED',
                       'RECORDS': [{'DATA_SOURCE': 'SAFEGRAPH',
                                    'RECORD_ID': '223-224@5yv-j8b-btv'}]}],
 'RESOLVED_ENTITY': {'ENTITY_ID': 436,
                 

In [6]:
tup_ls = []
for el in resolved_ls: 
    source = el['RESOLVED_ENTITY']['RECORDS'][0]
    source_uid = str(el['RESOLVED_ENTITY']['RECORDS'][0]['DATA_SOURCE']) + '.' + str(el['RESOLVED_ENTITY']['RECORDS'][0]['RECORD_ID'])
    for target in el['RESOLVED_ENTITY']['RECORDS'][1:]:
        #tar = target
        target_uid = str(target['DATA_SOURCE']) + '.' + str(target['RECORD_ID'])
        tup = (el['RESOLVED_ENTITY']['RECORDS'][0]['DATA_SOURCE'],
               source['ENTITY_DESC'], 
               source_uid, 
               target['DATA_SOURCE'],
               target['ENTITY_DESC'], 
               target_uid)
        tup_ls.append(tup)

resolved_rel_df = pd.DataFrame(tup_ls, columns=['source_data_source', 'source_desc', 'source_uid',
                                                'target_data_source', 'target_desc', 'target_uid'])
resolved_rel_df.head()

Unnamed: 0,source_data_source,source_desc,source_uid,target_data_source,target_desc,target_uid
0,SAFEGRAPH,Dona Maria Tamales Restaurant,SAFEGRAPH.zzw-223@5yv-hjr-g8v,PPP_LOANS,"DONA MARIA, INC.",PPP_LOANS.8342
1,SAFEGRAPH,Trigg Laboratories,SAFEGRAPH.226-222@5yv-jb7-5fz,PPP_LOANS,TRIGG LABORATORIES INC,PPP_LOANS.10492
2,SAFEGRAPH,Vanguard Integrity Professionals,SAFEGRAPH.22c-222@5yv-jbt-k75,DOL_WHISARD,"Vanguard Integrity Professionals, Inc.",DOL_WHISARD.210643
3,SAFEGRAPH,Vanguard Integrity Professionals,SAFEGRAPH.22c-222@5yv-jbt-k75,PPP_LOANS,VANGUARD INTEGRITY PROFESSIONALS NV,PPP_LOANS.7857
4,SAFEGRAPH,Desert Home Electric,SAFEGRAPH.22w-222@5yv-j8b-btv,PPP_LOANS,DESERT HOME ELECTRIC INC,PPP_LOANS.11784


## Related entities

Similar to the above we need to create the relationships between the anchor node and the related entities.  

In [7]:
tup_ls = []

for el in data:
    #pprint(el)
    source = el['RESOLVED_ENTITY']['RECORDS'][0]
    source_data_source = source['DATA_SOURCE']
    source_desc = source['ENTITY_DESC']
    source_uid = str(source_data_source) + '.' + str(source['RECORD_ID'])
    for target in el['RELATED_ENTITIES']:
        for la in target['RECORDS']:
            target_uid = str(la['DATA_SOURCE']) + '.' + str(la['RECORD_ID'])
            tup = (source_uid,
                   source_data_source,
                   source_desc,
                   la['DATA_SOURCE'],
                   target['MATCH_KEY'],
                   target['MATCH_LEVEL'],
                   target['MATCH_LEVEL_CODE'],
                   target['IS_AMBIGUOUS'],
                   target['IS_DISCLOSED'],
                   target_uid)
            tup_ls.append(tup)

related_rel_df = pd.DataFrame(tup_ls, columns=['source_uid', 'source_data_source', 'source_desc', 
                                           'target_data_source', 'match_key', 'match_level', 'match_level_code', 'is_ambiguous',
                                           'is_disclosed', 'target_uid'])
related_rel_df.head()

Unnamed: 0,source_uid,source_data_source,source_desc,target_data_source,match_key,match_level,match_level_code,is_ambiguous,is_disclosed,target_uid
0,SAFEGRAPH.228-222@5yv-j2j-2x5,SAFEGRAPH,Myle's Nails & Spa,SAFEGRAPH,+ADDRESS+GEO_LOC-PLACEKEY,3,POSSIBLY_RELATED,0,0,SAFEGRAPH.228-223@5yv-j2j-2x5
1,SAFEGRAPH.228-222@5yv-j2j-2x5,SAFEGRAPH,Myle's Nails & Spa,SAFEGRAPH,+ADDRESS+GEO_LOC-PLACEKEY,3,POSSIBLY_RELATED,0,0,SAFEGRAPH.zzw-222@5yv-j2j-2kz
2,SAFEGRAPH.228-222@5yv-j2j-2x5,SAFEGRAPH,Myle's Nails & Spa,SAFEGRAPH,+GEO_LOC-PLACEKEY,3,POSSIBLY_RELATED,0,0,SAFEGRAPH.zzw-22d@5yv-j2j-2kz
3,SAFEGRAPH.225-222@5yv-j92-tn5,SAFEGRAPH,Cantwell Michelle L Atty,SAFEGRAPH,+ADDRESS+PHONE+GEO_LOC-PLACEKEY,3,POSSIBLY_RELATED,0,0,SAFEGRAPH.225-22d@5yv-j92-tn5
4,SAFEGRAPH.225-222@5yv-j92-tn5,SAFEGRAPH,Cantwell Michelle L Atty,SAFEGRAPH,+ADDRESS+GEO_LOC-PLACEKEY,3,POSSIBLY_RELATED,0,0,SAFEGRAPH.225-225@5yv-j92-tn5


In [8]:
resolved_rel_df.shape, related_rel_df.shape

((1600, 6), (351405, 10))

### About this bit

This is where we set up the actual `:Entity`/anchor nodes.

In [9]:
keep_columns = ['source_data_source', 'source_desc', 'source_uid']
ent_df = resolved_rel_df[keep_columns]
ent_df_dedup = ent_df.drop_duplicates()
ent_df.shape, ent_df_dedup.shape

((1600, 3), (1386, 3))

In [10]:
ent_df_dedup.head()

Unnamed: 0,source_data_source,source_desc,source_uid
0,SAFEGRAPH,Dona Maria Tamales Restaurant,SAFEGRAPH.zzw-223@5yv-hjr-g8v
1,SAFEGRAPH,Trigg Laboratories,SAFEGRAPH.226-222@5yv-jb7-5fz
2,SAFEGRAPH,Vanguard Integrity Professionals,SAFEGRAPH.22c-222@5yv-jbt-k75
4,SAFEGRAPH,Desert Home Electric,SAFEGRAPH.22w-222@5yv-j8b-btv
5,SAFEGRAPH,A To Z Environmental Services,SAFEGRAPH.223-222@5yv-jb6-6kz


## Establish connection to Neo4j

You could be running Neo4j in any variety of places. This demo shows running it locally. However, if you are hosting it somewhere else, replace the bolt address with yours. Also update your password below.

In [11]:
URI = "bolt://localhost:7687"
USER = "neo4j"
PWD = " " # Put your password here
AUTH = (USER, PWD)

In [12]:
class Neo4jConnection:
    
    def __init__(self, uri, user, pwd):
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri, auth=(self.__user, self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)
        
    def close(self):
        if self.__driver is not None:
            self.__driver.close()
        
    def query(self, query, parameters=None, db=None):
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try: 
            session = self.__driver.session(database=db) if db is not None else self.__driver.session() 
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally: 
            if session is not None:
                session.close()
        return response
        

def insert_data(query, rows, batch_size=1000):

    total = 0
    batch = 0
    start = time.time()
    result = None
    
    while batch * batch_size < len(rows):

        res = neo_conn.query(query, 
                             parameters = {'rows': rows[batch*batch_size:(batch+1)*batch_size].to_dict('records')})
        total += res[0]['total']
        batch += 1
        result = {"total":total, 
                  "batches":batch, 
                  "time":time.time()-start}
        print(result)
        
    return result


neo_conn = Neo4jConnection(uri=URI, user=USER, pwd=PWD)

In [13]:
neo_conn.query('CREATE CONSTRAINT ent IF NOT EXISTS FOR (e:Entity) REQUIRE e.uid IS UNIQUE')

[]

In [21]:
def add_ent(rows, batch_size=1000):

    query = """UNWIND $rows AS row
               MERGE (:Entity {uid: row.source_uid,
                               name: row.source_desc,
                               data_source: row.source_data_source})
               RETURN count(*) AS total
    """

    return insert_data(query, rows, batch_size)


def add_ent_rel(rows, batch_size=1000):

    query = """UNWIND $rows AS row
               MATCH (source {uid: row.source_uid})
               MATCH (target {uid: row.target_uid})
               MERGE (source)-[:RESOLVES]->(target)
               RETURN count(*) AS total
    """

    return insert_data(query, rows, batch_size)


def add_sg_related_rel(rows, batch_size=1000):

    query = """UNWIND $rows AS row
               MATCH (source:Entity {uid: row.source_uid})
               MATCH (target:SGEntity {uid:row.target_uid})
               MERGE (target)-[:RELATED_TO {match_key: row.match_key,
                                            match_level: row.match_level,
                                            match_level_code: row.match_level_code,
                                            is_ambiguous: row.is_ambiguous,
                                            is_disclosed: row.is_disclosed}]->(source)
               RETURN count(*) AS total
    """

    return insert_data(query, rows, batch_size)


def add_dol_related_rel(rows, batch_size=1000):

    query = """UNWIND $rows AS row
               MATCH (source:Entity {uid: row.source_uid})
               MATCH (target:DOLEntity {uid:row.target_uid})
               MERGE (target)-[:RELATED_TO {match_key: row.match_key,
                                            match_level: row.match_level,
                                            match_level_code: row.match_level_code,
                                            is_ambiguous: row.is_ambiguous,
                                            is_disclosed: row.is_disclosed}]->(source)
               RETURN count(*) AS total
    """

    return insert_data(query, rows, batch_size)    

def add_ppp_related_rel(rows, batch_size=1000):

    query = """UNWIND $rows AS row
               MATCH (source:Entity {uid: row.source_uid})
               MATCH (target:PPPEntity {uid:row.target_uid})
               MERGE (target)-[:RELATED_TO {match_key: row.match_key,
                                            match_level: row.match_level,
                                            match_level_code: row.match_level_code,
                                            is_ambiguous: row.is_ambiguous,
                                            is_disclosed: row.is_disclosed}]->(source)
               RETURN count(*) AS total
    """

    return insert_data(query, rows, batch_size)

In [15]:
add_ent(ent_df_dedup)

{'total': 1000, 'batches': 1, 'time': 0.047647714614868164}
{'total': 1386, 'batches': 2, 'time': 0.06178855895996094}


{'total': 1386, 'batches': 2, 'time': 0.06178855895996094}

In [16]:
add_ent_rel(resolved_rel_df, batch_size=50)

{'total': 100, 'batches': 1, 'time': 8.016542673110962}
{'total': 200, 'batches': 2, 'time': 16.055448293685913}
{'total': 300, 'batches': 3, 'time': 23.836010456085205}
{'total': 400, 'batches': 4, 'time': 31.941528797149658}
{'total': 500, 'batches': 5, 'time': 39.81591296195984}
{'total': 600, 'batches': 6, 'time': 47.602574586868286}
{'total': 700, 'batches': 7, 'time': 55.490866899490356}
{'total': 800, 'batches': 8, 'time': 63.43989014625549}
{'total': 900, 'batches': 9, 'time': 71.24176001548767}
{'total': 1000, 'batches': 10, 'time': 78.86564373970032}
{'total': 1100, 'batches': 11, 'time': 86.59602928161621}
{'total': 1200, 'batches': 12, 'time': 94.68604636192322}
{'total': 1300, 'batches': 13, 'time': 102.57813143730164}
{'total': 1400, 'batches': 14, 'time': 110.15722584724426}
{'total': 1500, 'batches': 15, 'time': 117.84597706794739}
{'total': 1600, 'batches': 16, 'time': 125.77107906341553}
{'total': 1700, 'batches': 17, 'time': 133.60603499412537}
{'total': 1800, 'batch

{'total': 3200, 'batches': 32, 'time': 249.40752577781677}

In [18]:
sg_related_rel_df = related_rel_df[related_rel_df['target_data_source']=="SAFEGRAPH"]
dol_related_rel_df = related_rel_df[related_rel_df['target_data_source']=="DOL_WHISARD"]
ppp_related_rel_df = related_rel_df[related_rel_df['target_data_source']=="PPP_LOANS"]
sg_related_rel_df.shape, dol_related_rel_df.shape, ppp_related_rel_df.shape

((340093, 10), (4948, 10), (6364, 10))

In [19]:
add_sg_related_rel(sg_related_rel_df)

{'total': 1, 'batches': 1, 'time': 0.07312226295471191}
{'total': 3, 'batches': 2, 'time': 0.1231985092163086}
{'total': 6, 'batches': 3, 'time': 0.17707228660583496}
{'total': 29, 'batches': 4, 'time': 0.2375621795654297}
{'total': 50, 'batches': 5, 'time': 0.2921774387359619}
{'total': 54, 'batches': 6, 'time': 0.3450784683227539}
{'total': 72, 'batches': 7, 'time': 0.3923454284667969}
{'total': 81, 'batches': 8, 'time': 0.43192172050476074}
{'total': 89, 'batches': 9, 'time': 0.4686744213104248}
{'total': 92, 'batches': 10, 'time': 0.5031101703643799}
{'total': 94, 'batches': 11, 'time': 0.5441758632659912}
{'total': 95, 'batches': 12, 'time': 0.5782947540283203}
{'total': 95, 'batches': 13, 'time': 0.6120483875274658}
{'total': 103, 'batches': 14, 'time': 0.6489179134368896}
{'total': 106, 'batches': 15, 'time': 0.6838831901550293}
{'total': 317, 'batches': 16, 'time': 0.7231006622314453}
{'total': 329, 'batches': 17, 'time': 0.7600753307342529}
{'total': 329, 'batches': 18, 'time'

{'total': 4995, 'batches': 341, 'time': 12.883302688598633}

In [23]:
add_dol_related_rel(dol_related_rel_df)

{'total': 25, 'batches': 1, 'time': 0.040163278579711914}
{'total': 54, 'batches': 2, 'time': 0.10362362861633301}
{'total': 90, 'batches': 3, 'time': 0.13991665840148926}
{'total': 112, 'batches': 4, 'time': 0.17568325996398926}
{'total': 198, 'batches': 5, 'time': 0.21126723289489746}


{'total': 198, 'batches': 5, 'time': 0.21126723289489746}

In [24]:
add_ppp_related_rel(ppp_related_rel_df)

{'total': 45, 'batches': 1, 'time': 0.08003950119018555}
{'total': 77, 'batches': 2, 'time': 0.11926436424255371}
{'total': 101, 'batches': 3, 'time': 0.1554872989654541}
{'total': 136, 'batches': 4, 'time': 0.19196224212646484}
{'total': 158, 'batches': 5, 'time': 0.2290210723876953}
{'total': 215, 'batches': 6, 'time': 0.2651851177215576}
{'total': 229, 'batches': 7, 'time': 0.2801365852355957}


{'total': 229, 'batches': 7, 'time': 0.2801365852355957}