In [1]:
import json
import os
import traceback
import urllib.request

import grpc
from senzing_grpc import SzAbstractFactoryGrpc

## Download the Senzing Truth Set

Pulls three JSONL files (customers, reference, watchlist) from the official Senzing GitHub repository and saves them to `/workspace/data`.  These files are Senzing's public demo dataset and are designed specifically for exploring entity resolution.

In [2]:
# Where each demo truth-set file lives on GitHub, keyed by dataset name.
truth_set_urls = dict(
    customers='https://raw.githubusercontent.com/Senzing/truth-sets/main/truthsets/demo/customers.jsonl',
    reference='https://raw.githubusercontent.com/Senzing/truth-sets/main/truthsets/demo/reference.jsonl',
    watchlist='https://raw.githubusercontent.com/Senzing/truth-sets/main/truthsets/demo/watchlist.jsonl',
)

# Ensure the download target is present; an already-existing directory is fine.
os.makedirs('/workspace/data', exist_ok=True)

print("Downloading truth set files...")
for dataset in truth_set_urls:
    destination = f'/workspace/data/{dataset}.jsonl'
    # Fetch the remote file and write it to the local destination path.
    urllib.request.urlretrieve(truth_set_urls[dataset], destination)
    print(f"✅ Downloaded {dataset}.jsonl")

print("\nTruth sets downloaded successfully!")

Downloading truth set files...
✅ Downloaded customers.jsonl
✅ Downloaded reference.jsonl
✅ Downloaded watchlist.jsonl

Truth sets downloaded successfully!


## Preview the Raw Data

Prints the first three customer records so you can see what the raw input looks like before any entity resolution happens.  Also counts the total number of records in each file so you know what Senzing is working with.

In [3]:
# Pretty-print the first few customer records, then report how many records
# each downloaded file contains.
preview_limit = 3  # number of sample records to show

print("Sample customer records:")
print("=" * 60)

# Read explicitly as UTF-8 so decoding does not depend on the kernel's locale.
with open('/workspace/data/customers.jsonl', 'r', encoding='utf-8') as f:
    shown = 0
    for line in f:
        if not line.strip():
            continue  # a blank line is not a record and would crash json.loads()
        if shown >= preview_limit:
            break
        record = json.loads(line)
        shown += 1
        print(f"\nRecord {shown}:")
        print(json.dumps(record, indent=2))

# Count total records
print("\n" + "=" * 60)
print("Record counts:")
for name in ['customers', 'reference', 'watchlist']:
    with open(f'/workspace/data/{name}.jsonl', 'r', encoding='utf-8') as f:
        # Count only non-blank lines so the figure reflects actual records.
        count = sum(1 for line in f if line.strip())
    print(f"  {name}: {count} records")

Sample customer records:

Record 1:
{
  "DATA_SOURCE": "CUSTOMERS",
  "RECORD_ID": "1001",
  "RECORD_TYPE": "PERSON",
  "PRIMARY_NAME_LAST": "Smith",
  "PRIMARY_NAME_FIRST": "Robert",
  "DATE_OF_BIRTH": "12/11/1978",
  "ADDR_TYPE": "MAILING",
  "ADDR_LINE1": "123 Main Street, Las Vegas NV 89132",
  "PHONE_TYPE": "HOME",
  "PHONE_NUMBER": "702-919-1300",
  "DATE": "1/2/18",
  "STATUS": "Active",
  "AMOUNT": "100"
}

Record 2:
{
  "DATA_SOURCE": "CUSTOMERS",
  "RECORD_ID": "1002",
  "RECORD_TYPE": "PERSON",
  "PRIMARY_NAME_LAST": "Smith",
  "PRIMARY_NAME_FIRST": "Bob",
  "DATE_OF_BIRTH": "11/12/1978",
  "ADDR_TYPE": "HOME",
  "ADDR_LINE1": "1515 Adela Lane",
  "ADDR_CITY": "Las Vegas",
  "ADDR_STATE": "NV",
  "ADDR_POSTAL_CODE": "89111",
  "PHONE_TYPE": "MOBILE",
  "PHONE_NUMBER": "702-919-1300",
  "DATE": "3/10/17",
  "STATUS": "Inactive",
  "AMOUNT": "200"
}

Record 3:
{
  "DATA_SOURCE": "CUSTOMERS",
  "RECORD_ID": "1003",
  "RECORD_TYPE": "PERSON",
  "PRIMARY_NAME_LAST": "Smith",
  "P

## Connect to Senzing

Opens a gRPC channel to the Senzing service and creates the engine instance that all subsequent API calls will use.  The connection details come from environment variables set in the Docker Compose file.

In [4]:
# Connection settings come from the environment, with defaults for host and port.
grpc_host = os.getenv('SENZING_GRPC_HOST', 'senzing')
grpc_port = os.getenv('SENZING_GRPC_PORT', '8261')
connect_timeout_seconds = 30  # how long to wait for the channel to become ready

# Create gRPC channel and connect to Senzing
grpc_url = f"{grpc_host}:{grpc_port}"
print(f"Connecting to Senzing at {grpc_url}...")

try:
    # Create an insecure (plaintext) gRPC channel
    grpc_channel = grpc.insecure_channel(grpc_url)

    # Creating a channel does not by itself prove the server is reachable, so
    # block until the channel reports ready (or the timeout expires) before
    # claiming success.
    try:
        grpc.channel_ready_future(grpc_channel).result(timeout=connect_timeout_seconds)
    except grpc.FutureTimeoutError as timeout_err:
        grpc_channel.close()  # do not leave a half-open channel behind
        raise TimeoutError(
            f"no response from {grpc_url} within {connect_timeout_seconds} seconds"
        ) from timeout_err

    # Create the factory using the channel
    sz_factory = SzAbstractFactoryGrpc(grpc_channel=grpc_channel)

    # Create the engine instance used by the cells that follow
    sz_engine = sz_factory.create_engine()

    print("✅ Connected to Senzing successfully!")

except Exception as err:
    # Report the failure, then re-raise so the notebook stops here instead of
    # failing later with a confusing NameError on sz_engine.
    print(f"❌ Error connecting to Senzing: {err}")
    raise

Connecting to Senzing at senzing:8261...
✅ Connected to Senzing successfully!


## Load Truth Set Data into Senzing

Iterates through all three JSONL files and loads each record into Senzing via `add_record()`.  Senzing performs entity resolution as each record is ingested, which is why some customer records will later appear merged into a single entity.

In [9]:
# Data source code -> JSONL file whose records are loaded under that code.
data_sources = {
    'CUSTOMERS': '/workspace/data/customers.jsonl',
    'REFERENCE': '/workspace/data/reference.jsonl',
    'WATCHLIST': '/workspace/data/watchlist.jsonl'
}

print("Loading truth set data into Senzing...")
print("=" * 60)

total_failed = 0  # failures across all files, used for the final summary

for data_source, filepath in data_sources.items():
    print(f"\nLoading {data_source}...")

    records_loaded = 0
    records_failed = 0
    # Read explicitly as UTF-8 so decoding does not depend on the kernel's locale.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            record_json = line.strip()
            if not record_json:
                continue  # blank lines are not records

            # Parse first so a malformed line is reported and skipped instead
            # of aborting the whole load.
            try:
                record = json.loads(record_json)
                record_id = record['RECORD_ID']
            except (json.JSONDecodeError, KeyError, TypeError) as err:
                records_failed += 1
                print(f"  ❌ Skipping line {line_number} of {filepath}: {err!r}")
                continue

            try:
                sz_engine.add_record(data_source, record_id, record_json)
            except Exception as err:
                records_failed += 1
                print(f"  ❌ Error loading record {record_id}: {err}")
                continue

            records_loaded += 1

            # Print progress every 100 records
            if records_loaded % 100 == 0:
                print(f"  Loaded {records_loaded} records...")

    total_failed += records_failed
    print(f"  ✅ Loaded {records_loaded} records from {data_source}")
    if records_failed:
        print(f"  ⚠️ {records_failed} records from {data_source} could not be loaded")

print("\n" + "=" * 60)
# Only claim full success when nothing failed.
if total_failed:
    print(f"⚠️ Truth set load finished with {total_failed} failed records")
else:
    print("✅ All truth set data loaded successfully!")

Loading truth set data into Senzing...

Loading CUSTOMERS...
  Loaded 100 records...
  ✅ Loaded 120 records from CUSTOMERS

Loading REFERENCE...
  ✅ Loaded 22 records from REFERENCE

Loading WATCHLIST...
  ✅ Loaded 17 records from WATCHLIST

✅ All truth set data loaded successfully!


## Look Up a Resolved Entity by Record ID

Fetches the entity that Senzing resolved for customer record 1001 and shows which other records were merged into it.  This is the core payoff of entity resolution: multiple source records collapsed into one unified entity.

In [5]:
print("Looking up customer record 1001...")
print("=" * 60)

try:
    # Fetch the entity that contains CUSTOMERS record 1001 and decode the JSON reply.
    result = sz_engine.get_entity_by_record_id('CUSTOMERS', '1001')
    entity = json.loads(result)

    resolved = entity['RESOLVED_ENTITY']

    print(f"\nEntity ID: {resolved['ENTITY_ID']}")
    print(f"Number of records in this entity: {len(resolved['RECORDS'])}")

    # List every source record that belongs to this entity.
    print("\nRecords that were merged into this entity:")
    for member in resolved['RECORDS']:
        print(f"  - {member['DATA_SOURCE']}: {member['RECORD_ID']}")

    # Optional feature lists: print a heading and up to three values for each
    # key that is present in the response.
    optional_sections = (
        ('NAME_DATA', f"\nNames found:"),
        ('ADDRESS_DATA', f"\nAddresses found:"),
    )
    for feature_key, heading in optional_sections:
        if feature_key not in resolved:
            continue
        print(heading)
        for value in resolved[feature_key][:3]:
            print(f"  - {value}")

except Exception as err:
    print(f"❌ Error: {err}")

Looking up customer record 1001...

Entity ID: 2
Number of records in this entity: 4

Records that were merged into this entity:
  - CUSTOMERS: 1002
  - CUSTOMERS: 1001
  - CUSTOMERS: 1003
  - CUSTOMERS: 1004


## Search by Attributes

Searches Senzing for entities matching the name "Robert Smith" and prints the match details for each result, including the match level, match key, ER rule used, and name similarity score.  This shows how Senzing ranks and explains its fuzzy matching decisions.

In [6]:
def _print_feature_list(resolved, key, label):
    """Print the number of values under `resolved[key]` and the first three; do nothing if the key is absent."""
    if key not in resolved:
        return
    values = resolved[key]
    print(f"\n  {label} ({len(values)} found):")
    for value in values[:3]:
        print(f"    - {value}")


def _print_match_info(match_info):
    """Print the match level, match key, ER rule and first name score from a MATCH_INFO dict."""
    print("\nMatch Details:")
    print(f"  Match Level: {match_info.get('MATCH_LEVEL_CODE', 'N/A')}")
    print(f"  Match Key: {match_info.get('MATCH_KEY', 'N/A')}")
    print(f"  ER Rule: {match_info.get('ERRULE_CODE', 'N/A')}")

    # Show the first name score if the response carries any.
    name_scores = (match_info.get('FEATURE_SCORES') or {}).get('NAME')
    if name_scores:
        score = name_scores[0]
        print("\n  Name Match Score:")
        print(f"    Search: {score.get('INBOUND_FEAT_DESC', 'N/A')}")
        print(f"    Found: {score.get('CANDIDATE_FEAT_DESC', 'N/A')}")
        print(f"    Score: {score.get('SCORE', 'N/A')}/100")
        print(f"    Score Bucket: {score.get('SCORE_BUCKET', 'N/A')}")


def _print_resolved_entity(resolved):
    """Print the ID, name, contributing records and feature lists of a RESOLVED_ENTITY dict."""
    print("\nEntity Information:")
    print(f"  Entity ID: {resolved.get('ENTITY_ID', 'N/A')}")

    if 'ENTITY_NAME' in resolved:
        print(f"  Entity Name: {resolved['ENTITY_NAME']}")

    if 'RECORDS' in resolved:
        records = resolved['RECORDS']
        # Sorted so the printed order is stable from run to run (set order is not).
        sources = sorted({rec['DATA_SOURCE'] for rec in records})
        print(f"  Data Sources: {', '.join(sources)}")
        print(f"  Total Records: {len(records)}")

        print("\n  Record Details:")
        for rec in records:
            print(f"    - {rec['DATA_SOURCE']}: {rec['RECORD_ID']}")

    for key, label in (
        ('NAME_DATA', 'Names'),
        ('ADDRESS_DATA', 'Addresses'),
        ('PHONE_DATA', 'Phone Numbers'),
        ('EMAIL_DATA', 'Emails'),
    ):
        _print_feature_list(resolved, key, label)


print("Searching for entities named 'Robert Smith'...")
print("=" * 60)

search_attributes = {
    "NAME_FIRST": "Robert",
    "NAME_LAST": "Smith"
}

try:
    search_json = json.dumps(search_attributes)
    result = sz_engine.search_by_attributes(search_json)
    search_results = json.loads(result)

    # Treat a missing key and an empty list the same way: nothing matched.
    matches = search_results.get('RESOLVED_ENTITIES') or []

    if matches:
        print(f"\nFound {len(matches)} matching entities:")

        for match_number, entity_result in enumerate(matches, start=1):
            print(f"\n{'='*60}")
            print(f"Match {match_number}:")
            print('='*60)

            # Match information - shows WHY this matched
            if 'MATCH_INFO' in entity_result:
                _print_match_info(entity_result['MATCH_INFO'])

            # Entity details, when the response includes them
            resolved = (entity_result.get('ENTITY') or {}).get('RESOLVED_ENTITY')
            if resolved is not None:
                _print_resolved_entity(resolved)
    else:
        print("No matching entities found")

except Exception as err:
    # Keep the notebook running, but show the full traceback for debugging.
    print(f"❌ Error: {err}")
    traceback.print_exc()

Searching for entities named 'Robert Smith'...

Found 3 matching entities:

Match 1:

Match Details:
  Match Level: POSSIBLY_SAME
  Match Key: +NAME
  ER Rule: SNAME

  Name Match Score:
    Search: Robert Smith
    Found: Robert Smith
    Score: 100/100
    Score Bucket: SAME

Entity Information:
  Entity ID: 2
  Entity Name: Robert Smith

Match 2:

Match Details:
  Match Level: POSSIBLY_SAME
  Match Key: +NAME
  ER Rule: SNAME

  Name Match Score:
    Search: Robert Smith
    Found: Robbie Smith
    Score: 97/100
    Score Bucket: CLOSE

Entity Information:
  Entity ID: 6
  Entity Name: Robert E Smith Sr

Match 3:

Match Details:
  Match Level: POSSIBLY_SAME
  Match Key: +NAME
  ER Rule: SNAME

  Name Match Score:
    Search: Robert Smith
    Found: Robert Smith
    Score: 100/100
    Score Bucket: SAME

Entity Information:
  Entity ID: 146
  Entity Name: Robert Smith


## Find Entities Matched Across Multiple Data Sources

Samples a handful of customer record IDs and checks whether each one resolved into an entity that also contains records from the REFERENCE or WATCHLIST sources.  Cross-source matches are some of the most valuable outputs of entity resolution since they flag customers who appear on reference lists or watchlists.

In [15]:
print("Finding entities with records from multiple data sources...")
print("=" * 60)

# We'll sample some customer records and see if they matched with reference or watchlist
sample_customer_ids = ['1001', '1002', '1003', '1004', '1005', '1010', '1020', '1030']

# Keyed by entity ID: several sampled records can resolve to the same entity,
# and each entity should be reported (and counted) only once.
multi_source_by_entity_id = {}

for customer_id in sample_customer_ids:
    try:
        result = sz_engine.get_entity_by_record_id('CUSTOMERS', customer_id)
        entity = json.loads(result)

        if 'RESOLVED_ENTITY' not in entity:
            continue

        resolved = entity['RESOLVED_ENTITY']
        entity_id = resolved['ENTITY_ID']
        if entity_id in multi_source_by_entity_id:
            continue  # already recorded via an earlier sampled record

        records = resolved['RECORDS']
        sources = set(r['DATA_SOURCE'] for r in records)

        if len(sources) > 1:
            multi_source_by_entity_id[entity_id] = {
                'entity_id': entity_id,
                'entity_name': resolved.get('ENTITY_NAME', 'Unknown'),
                'sources': sources,
                'record_count': len(records),
                'records': records
            }
    except Exception as err:
        # Best effort: a sampled record might not exist. Say so instead of
        # silently hiding every possible failure, and move on to the next ID.
        print(f"  Skipping CUSTOMERS record {customer_id}: {err}")

multi_source_entities = list(multi_source_by_entity_id.values())

if multi_source_entities:
    print(f"\nFound {len(multi_source_entities)} multi-source entities:")
    for e in multi_source_entities:
        print(f"\n  Entity {e['entity_id']}: {e['entity_name']}")
        # Sorted so the printed order is stable from run to run (set order is not).
        print(f"    Sources: {', '.join(sorted(e['sources']))}")
        print(f"    Total records: {e['record_count']}")
        print(f"    Record breakdown:")
        for rec in e['records']:
            print(f"      - {rec['DATA_SOURCE']}: {rec['RECORD_ID']}")
else:
    print("\nNo multi-source matches found in this sample")
    print("(This is normal - not all customer records will have reference/watchlist matches)")

Finding entities with records from multiple data sources...

Found 3 multi-source entities:

  Entity 6: Robert E Smith Sr
    Sources: WATCHLIST, CUSTOMERS
    Total records: 2
    Record breakdown:
      - CUSTOMERS: 1005
      - WATCHLIST: 1006

  Entity 7: Eddie Kusha
    Sources: WATCHLIST, CUSTOMERS
    Total records: 5
    Record breakdown:
      - CUSTOMERS: 1009
      - CUSTOMERS: 1010
      - CUSTOMERS: 1011
      - WATCHLIST: 1012
      - WATCHLIST: 1014

  Entity 15: Marsha Kusha
    Sources: WATCHLIST, CUSTOMERS
    Total records: 2
    Record breakdown:
      - CUSTOMERS: 1020
      - WATCHLIST: 1021
