In [1]:
import json
import os
import grpc
import pandas as pd
import networkx as nx
from sentence_transformers import SentenceTransformer
import lancedb
from senzing import SzEngine, SzError
from senzing_grpc import SzAbstractFactoryGrpc
from senzing import SzEngineFlags

print("All imports successful")

All imports successful


## Connect to Senzing

Opens a gRPC connection to the Senzing engine using environment variables.  We need this to export the fully resolved entity data that we will turn into graph nodes and vector embeddings.

In [2]:
SENZING_HOST = os.getenv('SENZING_GRPC_HOST', 'senzing')
SENZING_PORT = os.getenv('SENZING_GRPC_PORT', '8261')

print(f"Connecting to Senzing at {SENZING_HOST}:{SENZING_PORT}")

# Create gRPC channel and engine
grpc_url = f"{SENZING_HOST}:{SENZING_PORT}"
grpc_channel = grpc.insecure_channel(grpc_url)
sz_abstract_factory = SzAbstractFactoryGrpc(grpc_channel)
sz_engine = sz_abstract_factory.create_engine()

print("Connected to Senzing successfully")

Connecting to Senzing at senzing:8261
Connected to Senzing successfully


## Export All Resolved Entities

Streams the complete entity report out of Senzing, requesting the raw source record JSON alongside each resolved entity.  We need the record-level JSON because that is where names, addresses, identifiers, and risk topics are stored.

In [3]:
entities = []
export_handle = sz_engine.export_json_entity_report(
    flags=SzEngineFlags.SZ_EXPORT_INCLUDE_ALL_ENTITIES | 
          SzEngineFlags.SZ_ENTITY_INCLUDE_RECORD_JSON_DATA
)

count = 0
while True:
    try:
        entity_json = sz_engine.fetch_next(export_handle)
        if not entity_json:
            break
        
        entity = json.loads(entity_json)
        entities.append(entity)
        count += 1
        
        if count % 50 == 0:
            print(f"  Exported {count} entities...", end='\r')
    except StopIteration:
        break

sz_engine.close_export_report(export_handle)
print(f"\nExported {len(entities)} entities total")

  Exported 150 entities...
Exported 196 entities total


## Dataset Overview

Summarizes what Senzing resolved: total records, unique entities, how many records were merged, and a breakdown by data source.  The cross-source resolution count (3 entities matched across OPEN-OWNERSHIP and OPEN-SANCTIONS) is the most analytically important number here.

In [4]:
print("Dataset Overview:")
print("="*60)

# Count total records from entities
total_records = sum(len(e.get('RESOLVED_ENTITY', {}).get('RECORDS', [])) for e in entities)
num_entities = len(entities)

print(f"Total records in database: {total_records:,}")
print(f"Total unique entities:     {num_entities:,}")
print(f"Records merged:            {total_records - num_entities:,}")

if total_records > 0:
    print(f"Reduction:                 {((total_records - num_entities) / total_records * 100):.1f}%")

# Count by data source
from collections import Counter
source_counts = Counter()
for entity in entities:
    records = entity.get('RESOLVED_ENTITY', {}).get('RECORDS', [])
    for rec in records:
        source = rec.get('DATA_SOURCE', 'UNKNOWN')
        source_counts[source] += 1

print("\nRecords by data source:")
for source, count in source_counts.items():
    print(f"  {source}: {count:,}")

# Count cross-source resolutions
cross_source = 0
for entity in entities:
    records = entity.get('RESOLVED_ENTITY', {}).get('RECORDS', [])
    sources = set(r.get('DATA_SOURCE') for r in records)
    if len(sources) > 1:
        cross_source += 1

print(f"\nCross-source resolutions: {cross_source}")
print("="*60)

Dataset Overview:
Total records in database: 282
Total unique entities:     196
Records merged:            86
Reduction:                 30.5%

Records by data source:
  OPEN-SANCTIONS: 24
  OPEN-OWNERSHIP: 258

Cross-source resolutions: 3


## Build the Knowledge Graph with NetworkX

Constructs the NetworkX graph from the exported entities.  Each node is a resolved entity carrying its name, type, record count, and data sources.  Edges represent business relationships (shareholding, directorship, etc.) extracted from the `RELATIONSHIPS` field in each source record.

In [5]:
G = nx.Graph()

# Add entities as nodes
for entity in entities:
    entity_data = entity.get('RESOLVED_ENTITY', {})
    entity_id = entity_data.get('ENTITY_ID')
    records = entity_data.get('RECORDS', [])
    
    if not records:
        continue
    
    # Get entity info
    first_record = records[0]
    json_data = first_record.get('JSON_DATA', {})
    record_type = json_data.get('RECORD_TYPE', 'UNKNOWN')
    
    # Get name
    name = json_data.get('PRIMARY_NAME_FULL')
    if not name:
        name_list = json_data.get('NAMES', [])
        for name_obj in name_list:
            name = name_obj.get('NAME_FULL') or name_obj.get('PRIMARY_NAME_ORG') or name_obj.get('NAME_ORG')
            if name:
                break
    
    if not name:
        name = f"Entity {entity_id}"
    
    # Get data sources
    data_sources = list(set([r.get('DATA_SOURCE') for r in records]))
    
    # Add node
    G.add_node(
        entity_id,
        name=name,
        type=record_type,
        num_records=len(records),
        data_sources=data_sources
    )

print(f"Added {G.number_of_nodes()} entity nodes")

# Add relationship edges
edges_added = 0

for entity in entities:
    entity_data = entity.get('RESOLVED_ENTITY', {})
    anchor_entity_id = entity_data.get('ENTITY_ID')
    
    # Look through records for relationships
    for record in entity_data.get('RECORDS', []):
        relationships = record.get('JSON_DATA', {}).get('RELATIONSHIPS', [])
        
        for rel in relationships:
            pointer_key = rel.get('REL_POINTER_KEY')
            pointer_role = rel.get('REL_POINTER_ROLE', 'related')
            
            # Find target entity
            for target_entity in entities:
                target_data = target_entity.get('RESOLVED_ENTITY', {})
                target_entity_id = target_data.get('ENTITY_ID')
                
                for target_record in target_data.get('RECORDS', []):
                    if target_record.get('RECORD_ID') == pointer_key:
                        if anchor_entity_id != target_entity_id:
                            G.add_edge(
                                anchor_entity_id,
                                target_entity_id,
                                relationship=pointer_role
                            )
                            edges_added += 1
                        break

print(f"Added {edges_added} relationship edges")
print(f"\nKnowledge Graph built:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")
print(f"  Connected components: {nx.number_connected_components(G)}")

Added 196 entity nodes
Added 396 relationship edges

Knowledge Graph built:
  Nodes: 196
  Edges: 233
  Connected components: 59


## Load the Embedding Model

Downloads and initializes `all-MiniLM-L6-v2` from Hugging Face.  This model produces 384-dimension embeddings and is a solid choice for this task since it is fast, small enough to run comfortably in the workshop environment, and handles the mix of names, addresses, and risk terminology in our entity text reasonably well.

In [6]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Embedding model loaded")
print(f"  Model: all-MiniLM-L6-v2")
print(f"  Embedding dimension: 384")



Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding model loaded
  Model: all-MiniLM-L6-v2
  Embedding dimension: 384


## Create Entity Embeddings

Iterates over all entities and builds a text description for each one (name, type, data sources, address, identifiers, risk topics), then embeds it with the sentence transformer.  The text description is what gets stored in LanceDB and searched against at query time, so the fields included here directly determine what kinds of questions the RAG can answer well.

In [7]:
entity_data = []

for entity in entities:
    entity_data_item = entity.get('RESOLVED_ENTITY', {})
    entity_id = entity_data_item.get('ENTITY_ID')
    records = entity_data_item.get('RECORDS', [])
    
    if not records:
        continue
    
    # Get entity info
    first_record = records[0]
    json_data = first_record.get('JSON_DATA', {})
    
    # Get name
    name = json_data.get('PRIMARY_NAME_FULL')
    if not name:
        name_list = json_data.get('NAMES', [])
        for name_obj in name_list:
            name = name_obj.get('NAME_FULL') or name_obj.get('PRIMARY_NAME_ORG') or name_obj.get('NAME_ORG')
            if name:
                break
    
    if not name:
        name = f"Entity {entity_id}"
    
    record_type = json_data.get('RECORD_TYPE', 'UNKNOWN')
    data_sources = list(set([r.get('DATA_SOURCE') for r in records]))
    
    # Get addresses
    addresses = []
    for rec in records:
        addrs = rec.get('JSON_DATA', {}).get('ADDRESSES', [])
        for addr in addrs[:2]:
            addr_full = addr.get('ADDR_FULL', '')
            if addr_full:
                addresses.append(addr_full)
    
    # Get identifiers
    identifiers = []
    for rec in records:
        ids = rec.get('JSON_DATA', {}).get('IDENTIFIERS', [])
        for id_obj in ids[:3]:
            id_type = id_obj.get('NATIONAL_ID_TYPE') or id_obj.get('OTHER_ID_TYPE')
            id_num = id_obj.get('NATIONAL_ID_NUMBER') or id_obj.get('OTHER_ID_NUMBER')
            if id_type and id_num:
                identifiers.append(f"{id_type}: {id_num}")
    
    # Get risks (from OPEN-SANCTIONS)
    risks = []
    for rec in records:
        risk_list = rec.get('JSON_DATA', {}).get('RISKS', [])
        for risk in risk_list:
            topic = risk.get('TOPIC', '')
            if topic:
                risks.append(topic)
    
    # Create text description for embedding
    text_parts = [
        f"Name: {name}",
        f"Type: {record_type}",
        f"Data sources: {', '.join(data_sources)}",
        f"Records merged: {len(records)}"
    ]
    
    if addresses:
        text_parts.append(f"Address: {addresses[0]}")
    
    if identifiers:
        text_parts.append(f"Identifiers: {', '.join(identifiers[:2])}")
    
    if risks:
        text_parts.append(f"Risk topics: {', '.join(set(risks))}")
    
    text = ". ".join(text_parts)
    
    # Create embedding
    embedding = embedding_model.encode(text).tolist()
    
    # Store data
    entity_data.append({
        'entity_id': entity_id,
        'name': name,
        'type': record_type,
        'text': text,
        'vector': embedding,
        'data_sources': ','.join(data_sources),
        'num_records': len(records),
        'addresses': '|'.join(addresses[:3]),
        'identifiers': '|'.join(identifiers[:3]),
        'risks': '|'.join(set(risks))
    })
    
    if len(entity_data) % 50 == 0:
        print(f"  Processed {len(entity_data)} entities...", end='\r')

print(f"\nCreated embeddings for {len(entity_data)} entities")

  Processed 150 entities...
Created embeddings for 196 entities


## Store Embeddings in LanceDB

Drops any existing `entities` table and writes all 196 entity records including their vectors into a fresh LanceDB table.  After this cell runs, the RAG notebook can connect to LanceDB and start querying without needing to touch Senzing again.

In [8]:
db = lancedb.connect('/workspace/lancedb_data')

# Drop existing table if it exists
try:
    db.drop_table('entities')
    print("Dropped existing table")
except:
    pass

# Create new table
print("Creating new table...")
table = db.create_table('entities', entity_data)

print(f"Stored {len(entity_data)} entities in LanceDB")
print("\nData preparation complete!")
print("You can now use the RAG notebook to query this data.")

Dropped existing table
Creating new table...
Stored 196 entities in LanceDB

Data preparation complete!
You can now use the RAG notebook to query this data.


## Preview LanceDB Contents

Pulls the first 10 rows from the LanceDB table and displays the key metadata columns (entity ID, name, type, data sources, record count, risks).  This is a quick gut check to confirm the data looks right before moving on.

In [9]:
print("LanceDB Contents:")
print("="*70)

sample = table.to_pandas().head(10)
display_columns = ['entity_id', 'name', 'type', 'data_sources', 'num_records', 'risks']
print(sample[display_columns].to_string())

LanceDB Contents:
   entity_id                               name          type                   data_sources  num_records                risks
0          1                    Abassin BADSHAH        PERSON  OPEN-OWNERSHIP,OPEN-SANCTIONS            3         corp.disqual
1          2                      LMAR (GB) LTD  ORGANIZATION                 OPEN-SANCTIONS            1                     
2          3            WANDLE HOLDINGS LIMITED  ORGANIZATION                 OPEN-SANCTIONS            1      sanction.linked
3          4  POLYUS GOLD INTERNATIONAL LIMITED  ORGANIZATION                 OPEN-SANCTIONS            1      sanction.linked
4          5          Firuza Nazimovna Kerimova        PERSON                 OPEN-SANCTIONS            1    role.rca|sanction
5          6                     ООО "ГРАНДЕКО"  ORGANIZATION                 OPEN-SANCTIONS            2  sanction.linked|poi
6          7        Amina Suleymanovna Kerimova        PERSON                 OPEN-SANCTIONS 

## LanceDB Statistics

Prints summary counts of what is in the vector store: total entities, breakdown by type (PERSON vs ORGANIZATION), breakdown by data source, and a count of entities that carry at least one risk flag.  Only 17 out of 196 entities have risk data, which reflects the relative size of the OPEN-SANCTIONS dataset.

In [10]:
all_entities = table.to_pandas()

print("\nLanceDB Statistics:")
print("="*70)
print(f"Total entities: {len(all_entities)}")
print()

print("By Type:")
print(all_entities['type'].value_counts())
print()

print("By Data Source:")
print(all_entities['data_sources'].value_counts())
print()

print("Entities with risks/sanctions:")
has_risks = all_entities[all_entities['risks'].notna() & (all_entities['risks'] != '')]
print(f"  Count: {len(has_risks)}")


LanceDB Statistics:
Total entities: 196

By Type:
type
ORGANIZATION    127
PERSON           69
Name: count, dtype: int64

By Data Source:
data_sources
OPEN-OWNERSHIP                   176
OPEN-SANCTIONS                    17
OPEN-OWNERSHIP,OPEN-SANCTIONS      3
Name: count, dtype: int64

Entities with risks/sanctions:
  Count: 17


## Risk Topics Breakdown

Lists every distinct risk topic in the dataset and shows example entities for each one.  This is useful context for the workshop because it tells participants exactly what risk vocabulary the chatbot knows about before they start asking questions.

In [11]:
all_entities = table.to_pandas()
all_risks = all_entities[all_entities['risks'].notna() & (all_entities['risks'] != '')]

print("Risk Topics in Dataset:")
print("="*70)

risk_topics = set()
for risks_str in all_risks['risks']:
    if risks_str:
        topics = risks_str.split('|')
        risk_topics.update(topics)

for topic in sorted(risk_topics):
    count = sum(topic in str(r) for r in all_risks['risks'])
    entities_with_risk = all_risks[all_risks['risks'].str.contains(topic, na=False)]
    print(f"\n{topic}: {count} entities")
    print("  Examples:")
    for idx, row in entities_with_risk.head(3).iterrows():
        print(f"    - {row['name']} ({row['type']})")

Risk Topics in Dataset:

corp.disqual: 1 entities
  Examples:
    - Abassin BADSHAH (PERSON)

poi: 2 entities
  Examples:
    - ООО "ГРАНДЕКО" (ORGANIZATION)
    - Suleyman Abusaidovich KERIMOV (PERSON)

role.oligarch: 1 entities
  Examples:
    - Suleyman Abusaidovich KERIMOV (PERSON)

role.pep: 3 entities
  Examples:
    - N. T. Wright (PERSON)
    - Mr Akhmet Magomedovich Palankoyev (PERSON)
    - Suleyman Abusaidovich KERIMOV (PERSON)

role.rca: 6 entities
  Examples:
    - Firuza Nazimovna Kerimova (PERSON)
    - Amina Suleymanovna Kerimova (PERSON)
    - Gulnara Suleimanova KERIMOVA (PERSON)

sanction: 14 entities
  Examples:
    - WANDLE HOLDINGS LIMITED (ORGANIZATION)
    - POLYUS GOLD INTERNATIONAL LIMITED (ORGANIZATION)
    - Firuza Nazimovna Kerimova (PERSON)

sanction.linked: 8 entities
  Examples:
    - WANDLE HOLDINGS LIMITED (ORGANIZATION)
    - POLYUS GOLD INTERNATIONAL LIMITED (ORGANIZATION)
    - ООО "ГРАНДЕКО" (ORGANIZATION)


## Inspect Sample Entity Records

Prints the full stored record for the first 5 entities including the first and last few values of each embedding vector.  This gives participants a concrete look at exactly what shape of data is going into LanceDB and being searched at query time.

In [12]:
sample = table.to_pandas().head(5)

# Convert to dict and display as formatted JSON
for idx, row in sample.iterrows():
    print(f"\nEntity {idx + 1}:")
    print("="*70)
    
    # Convert row to dict
    row_dict = row.to_dict()
    
    # Show vector info separately
    vector = row_dict.pop('vector')
    
    # Print everything except vector
    print(json.dumps(row_dict, indent=2))
    
    # Print vector summary
    print(f"\nvector: [{vector[0]}, {vector[1]}, {vector[2]}, ..., {vector[-1]}]")
    print(f"  (384-dimension vector, first few values shown)")
    print()


Entity 1:
{
  "entity_id": 1,
  "name": "Abassin BADSHAH",
  "type": "PERSON",
  "text": "Name: Abassin BADSHAH. Type: PERSON. Data sources: OPEN-OWNERSHIP, OPEN-SANCTIONS. Records merged: 3. Address: 31 Quernmore Close, Bromley, Kent, United Kingdom, BR1 4EL. Identifiers: OPEN-SANCTIONS: NK-25vyVFzt8vdJGgAXMRTwTJ. Risk topics: corp.disqual",
  "data_sources": "OPEN-OWNERSHIP,OPEN-SANCTIONS",
  "num_records": 3,
  "addresses": "31 Quernmore Close, Bromley, Kent, United Kingdom, BR1 4EL|31, Quernmore Close, Bromley, BR1 4EL|3, Market Parade, 41 East Street, Bromley, BR1 1QN",
  "identifiers": "OPEN-SANCTIONS: NK-25vyVFzt8vdJGgAXMRTwTJ",
  "risks": "corp.disqual"
}

vector: [0.008808483369648457, -0.023793436586856842, 0.00644419901072979, ..., -0.034974053502082825]
  (384-dimension vector, first few values shown)


Entity 2:
{
  "entity_id": 2,
  "name": "LMAR (GB) LTD",
  "type": "ORGANIZATION",
  "text": "Name: LMAR (GB) LTD. Type: ORGANIZATION. Data sources: OPEN-SANCTIONS. Records

In [13]:
print("Closing connections...")

try:
    grpc_channel.close()
    print("Done")
except Exception as e:
    print(f"Note: {e}")

Closing connections...
Done
