In [1]:
import os
import networkx as nx
from sentence_transformers import SentenceTransformer
import lancedb
import dspy

import grpc
import json
from senzing import SzEngine
from senzing_grpc import SzAbstractFactoryGrpc
from senzing import SzEngineFlags

print("All imports successful")

All imports successful


## Load the Embedding Model

Loads the same `all-MiniLM-L6-v2` model that was used to create the vectors in the previous notebook.  The query embedding and the stored embeddings must come from the same model; otherwise, the similarity scores will be meaningless.

In [2]:
# Name of the sentence-transformers checkpoint; this cell assumes it is the
# same one that produced the stored vectors.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

print("Loading embedding model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
print("Embedding model loaded")

Loading embedding model...




Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

[1mBertModel LOAD REPORT[0m from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


Embedding model loaded


## Connect to LanceDB

Opens the existing LanceDB database and the `entities` table created in the previous notebook.  The row count printed here should match the 196 entities we stored during vectorization.

In [3]:
# Location of the LanceDB database and the table holding the entity vectors
LANCEDB_URI = '/workspace/lancedb_data'
ENTITY_TABLE_NAME = 'entities'

db = lancedb.connect(LANCEDB_URI)
table = db.open_table(ENTITY_TABLE_NAME)

print("Connected to LanceDB")
entity_count = table.count_rows()
print(f"Total entities available: {entity_count}")

Connected to LanceDB
Total entities available: 196


## Rebuild the Knowledge Graph from Senzing

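Reconnects briefly to Senzing to re-export the resolved entities, then rebuilds the NetworkX graph: a node for each resolved entity that has records, and an undirected edge between any two entities linked by a record-level relationship.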

In [4]:
# Rebuild the in-memory knowledge graph: export every resolved entity from
# Senzing, add one node per entity, then add an edge wherever a record's
# relationship pointer resolves to a record owned by a different entity.
G = nx.Graph()

# Get all entities
# NOTE(review): `all_entities` is not read anywhere in this cell; it is kept
# in case a later cell relies on it.
all_entities = table.to_pandas()

SENZING_HOST = os.getenv('SENZING_GRPC_HOST', 'senzing')
SENZING_PORT = os.getenv('SENZING_GRPC_PORT', '8261')

grpc_url = f"{SENZING_HOST}:{SENZING_PORT}"
grpc_channel = grpc.insecure_channel(grpc_url)

# Quick export to rebuild graph.  The try/finally blocks make sure the export
# handle and the gRPC channel are released even if the export or the JSON
# parsing fails part-way through.
entities = []
try:
    sz_abstract_factory = SzAbstractFactoryGrpc(grpc_channel)
    sz_engine = sz_abstract_factory.create_engine()

    export_handle = sz_engine.export_json_entity_report(
        flags=SzEngineFlags.SZ_EXPORT_INCLUDE_ALL_ENTITIES |
              SzEngineFlags.SZ_ENTITY_INCLUDE_RECORD_JSON_DATA
    )
    try:
        while True:
            try:
                entity_json = sz_engine.fetch_next(export_handle)
            except StopIteration:
                break
            if not entity_json:
                # An empty result marks the end of the export
                break
            entities.append(json.loads(entity_json))
    finally:
        sz_engine.close_export_report(export_handle)
finally:
    # Close Senzing: nothing below needs the connection any more
    grpc_channel.close()

# Build graph nodes
for entity in entities:
    entity_data = entity.get('RESOLVED_ENTITY', {})
    entity_id = entity_data.get('ENTITY_ID')
    records = entity_data.get('RECORDS', [])

    if not records:
        continue

    # Node attributes are taken from the entity's first record
    json_data = records[0].get('JSON_DATA', {})
    record_type = json_data.get('RECORD_TYPE', 'UNKNOWN')

    name = json_data.get('PRIMARY_NAME_FULL')
    if not name:
        # Fall back to the first usable name in the NAMES list
        for name_obj in json_data.get('NAMES', []):
            name = (
                name_obj.get('NAME_FULL')
                or name_obj.get('PRIMARY_NAME_ORG')
                or name_obj.get('NAME_ORG')
            )
            if name:
                break

    if not name:
        name = f"Entity {entity_id}"

    # De-duplicate while keeping first-seen order so the attribute is
    # deterministic from run to run (a plain set() has arbitrary order).
    data_sources = list(dict.fromkeys(r.get('DATA_SOURCE') for r in records))

    G.add_node(
        entity_id,
        name=name,
        type=record_type,
        num_records=len(records),
        data_sources=data_sources
    )

# Index RECORD_ID -> owning entity IDs once, so that resolving a relationship
# pointer is a dictionary lookup instead of a scan over every entity and
# record.  A RECORD_ID may appear under more than one entity, hence the list;
# entity order is preserved so edges are added in the same order as a scan.
record_owner_ids = {}
for entity in entities:
    entity_data = entity.get('RESOLVED_ENTITY', {})
    owner_entity_id = entity_data.get('ENTITY_ID')
    for record in entity_data.get('RECORDS', []):
        owners = record_owner_ids.setdefault(record.get('RECORD_ID'), [])
        if owner_entity_id not in owners:
            owners.append(owner_entity_id)

# Build graph edges
for entity in entities:
    entity_data = entity.get('RESOLVED_ENTITY', {})
    anchor_entity_id = entity_data.get('ENTITY_ID')

    for record in entity_data.get('RECORDS', []):
        relationships = record.get('JSON_DATA', {}).get('RELATIONSHIPS', [])

        for rel in relationships:
            pointer_key = rel.get('REL_POINTER_KEY')
            if pointer_key is None:
                # No pointer to resolve; never match records lacking a RECORD_ID
                continue
            pointer_role = rel.get('REL_POINTER_ROLE', 'related')

            for target_entity_id in record_owner_ids.get(pointer_key, ()):
                # Skip self-links created when both records resolved together
                if anchor_entity_id != target_entity_id:
                    G.add_edge(
                        anchor_entity_id,
                        target_entity_id,
                        relationship=pointer_role
                    )

print(f"Knowledge graph rebuilt:")
print(f"  Nodes: {G.number_of_nodes()}")
print(f"  Edges: {G.number_of_edges()}")

Knowledge graph rebuilt:
  Nodes: 196
  Edges: 233


## Configure DSPy with Claude

Sets up DSPy to use Claude Sonnet as the language model backend.  The API key is read from the `ANTHROPIC_API_KEY` environment variable, which is defined in the `.env` file in the root directory of the repo, so it never needs to be hardcoded in the notebook.

In [5]:
# Pull the API key from the environment and fail fast when it is absent/empty
ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')
if not ANTHROPIC_API_KEY:
    raise ValueError("ANTHROPIC_API_KEY not found in environment")

# Options for the language-model client.
# NOTE(review): the chain-of-thought reasoning and the final answer presumably
# share this max_tokens budget; if answers look cut off, try raising it — confirm.
lm_options = dict(
    model='anthropic/claude-sonnet-4-5-20250929',
    api_key=ANTHROPIC_API_KEY,
    max_tokens=2048,
)
lm = dspy.LM(**lm_options)

# Register the client as the LM used by DSPy modules
dspy.settings.configure(lm=lm)

print("DSPy configured with Claude 4.5 Sonnet")

DSPy configured with Claude 4.5 Sonnet


## Define the DSPy Signature and Module

Defines the contract for the RAG pipeline using a DSPy `Signature`: the LLM receives a `context` field (the graph data we assembled) and a `question` field, and must produce an `answer`.  The `GraphRAG` module wraps this in a `ChainOfThought` call, which prompts the model to reason step by step before answering rather than jumping straight to a conclusion.

In [6]:
# DSPy signature for Graph RAG: declares the two inputs (context, question)
# and the single output (answer) exchanged with the language model.
# NOTE(review): DSPy presumably uses the class docstring and the `desc`
# strings below as prompt text, so editing them would change model behavior —
# confirm against the DSPy Signature documentation before rewording.
class GraphRAGSignature(dspy.Signature):
    """Answer questions about a corporate ownership and sanctions knowledge graph."""
    
    # Input: text context assembled from the graph by the caller
    context = dspy.InputField(desc="Knowledge graph context including entities and relationships")
    # Input: the user's question
    question = dspy.InputField(desc="User's question about the knowledge graph")
    # Output: the model's answer
    answer = dspy.OutputField(desc="Detailed answer based on the knowledge graph context")

# Create DSPy module
class GraphRAG(dspy.Module):
    """DSPy module that answers a question from graph context via ChainOfThought."""

    def __init__(self):
        super().__init__()
        cot_predictor = dspy.ChainOfThought(GraphRAGSignature)
        self.generate_answer = cot_predictor

    def forward(self, context, question):
        """Run the predictor on ``context`` and ``question`` and return its result."""
        prediction = self.generate_answer(context=context, question=question)
        return prediction


# Module instance used by the question-answering function
graph_rag = GraphRAG()

## Define the Function to Ask the Knowledge Graph

This is the core of the pipeline.  Given a user question, it runs four steps: embed the question and search LanceDB for the top 10 most similar entities, expand that set by pulling up to five 1-hop neighbors per hit from the NetworkX graph, format the combined entity set (capped at 30 entities) into a structured text context, then pass that context and the question to the DSPy module.  The graph expansion step is what makes this a knowledge graph RAG rather than a plain vector search.

In [7]:
def ask_knowledge_graph(question, top_k=10, max_neighbors=5,
                        max_context_entities=30, max_relationships=3):
    """
    Simple RAG: Search -> Expand -> Format -> Ask LLM

    Embeds the question, retrieves the most similar entities from the LanceDB
    table, adds 1-hop graph neighbors of each hit, renders the resulting
    entities as a text context and passes it with the question to the DSPy
    module.  The answer is printed; nothing is returned.

    Relies on the notebook globals `embedding_model`, `table`, `G` and
    `graph_rag`.

    Args:
        question: Free-text question from the user.
        top_k: Number of vector-search hits to retrieve.
        max_neighbors: Maximum graph neighbors added per hit.
        max_context_entities: Maximum number of entities rendered into the
            context.  Hits are kept in search-rank order ahead of neighbors,
            so the cap drops neighbors before it drops hits.
        max_relationships: Maximum relationships listed per entity.
    """
    print(f"\nQuestion: {question}")
    print("="*70)

    # Step 1: Vector search
    question_embedding = embedding_model.encode(question).tolist()
    results = table.search(question_embedding).limit(top_k).to_list()

    print(f"Found {len(results)} relevant entities")

    # Step 2: Collect entity IDs and expand to neighbors.
    # Rows returned by the search are remembered so they need not be fetched
    # again in step 3.
    rows_by_id = {}
    for hit in results:
        rows_by_id.setdefault(hit['entity_id'], hit)

    # A dict is used as an insertion-ordered set: all hits first (in rank
    # order), then neighbors in discovery order.  Slicing a plain set here
    # would cap the context in arbitrary order and could drop the best hits.
    ordered_ids = dict.fromkeys(rows_by_id)
    for hit_id in list(rows_by_id):
        if hit_id in G:
            for neighbor_id in list(G.neighbors(hit_id))[:max_neighbors]:
                ordered_ids.setdefault(neighbor_id)

    print(f"Expanded to {len(ordered_ids)} entities (including neighbors)")

    # Step 3: Build simple context
    context_parts = ["ENTITIES:\n"]

    for entity_id in list(ordered_ids)[:max_context_entities]:
        # Get entity info: reuse the search row when available, otherwise
        # look the entity up (neighbors were not part of the search results)
        info = rows_by_id.get(entity_id)
        if info is None:
            fetched = table.search().where(f"entity_id = {entity_id}").limit(1).to_list()
            if not fetched:
                continue
            info = fetched[0]

        context_parts.append(f"- {info['name']} ({info['type']})")
        context_parts.append(f"  Sources: {info['data_sources']}, Records: {info['num_records']}")

        if info.get('risks'):
            context_parts.append(f"  Risks: {info['risks']}")

        # Add relationships
        if entity_id in G:
            rels = []
            for neighbor_id in list(G.neighbors(entity_id))[:max_relationships]:
                edge_data = G.get_edge_data(entity_id, neighbor_id)
                rel_type = edge_data.get('relationship', 'connected to') if edge_data else 'connected to'
                # Tolerate nodes without a name attribute
                neighbor_name = G.nodes[neighbor_id].get('name', f"Entity {neighbor_id}")
                rels.append(f"{rel_type} {neighbor_name}")

            if rels:
                context_parts.append(f"  Relationships: {'; '.join(rels)}")

        context_parts.append("")

    context = "\n".join(context_parts)

    # Step 4: Ask LLM
    print("Querying LLM...")
    result = graph_rag(context=context, question=question)

    print("\n" + "="*70)
    print("ANSWER")
    print("="*70)
    print(result.answer)
    print("="*70)

## Interactive Chatbot Session

Runs a simple read-eval-print loop so participants can ask free-form questions about the dataset.  Type `quit`, `exit`, or `q` to stop.  The example questions in the notebook output are a good starting point: try asking about specific named individuals, risk categories, ownership chains, or patterns that might indicate fraud.

In [None]:
# Interactive question loop: read a question, answer it with
# ask_knowledge_graph, and repeat until the user asks to stop.
import traceback

print("Knowledge Graph RAG - Interactive Session")
print("="*70)
print("Ask any question about the corporate ownership and sanctions data.")
print("The system will search LanceDB and query the knowledge graph.")
print("Type 'quit' to exit.\n")

while True:
    try:
        question = input("Your question: ").strip()
    except (EOFError, KeyboardInterrupt):
        # End of input or an interrupt at the prompt: leave the loop cleanly
        # instead of ending the cell with a traceback.
        print("\nGoodbye!")
        break

    if question.lower() in ['quit', 'exit', 'q']:
        print("Goodbye!")
        break

    if not question:
        continue

    try:
        ask_knowledge_graph(question)
        print()  # Blank line for readability

    except Exception as e:
        # Report the failure and keep the session alive for the next question
        print(f"Error: {e}")
        traceback.print_exc()

Knowledge Graph RAG - Interactive Session
Ask any question about the corporate ownership and sanctions data.
The system will search LanceDB and query the knowledge graph.
Type 'quit' to exit.



Your question:  who specifically is listed as an oligarch



Question: who specifically is listed as an oligarch
Found 10 relevant entities
Expanded to 19 entities (including neighbors)
Querying LLM...

ANSWER
**Suleyman Abusaidovich KERIMOV** is the only person specifically listed as an oligarch in this knowledge graph.

According to the data, Suleyman Abusaidovich KERIMOV has the following risk classifications:
- **role.oligarch** - Designated as an oligarch
- poi - Person of interest
- sanction - Under sanctions
- role.pep - Politically exposed person

He has family relationships with:
- Firuza Nazimovna Kerimova (family member)
- Amina Suleymanovna Kerimova (family member)
- Gulnara Suleimanova KERIMOVA (family member)

Additionally, his family member Said Kerimov appears to control several Russian organizations (ООО "ЗАРЕЧЬЕ-ЭСТЕЙТ", ООО "НАЦИОНАЛЬНАЯ КИНОСЕТЬ", ООО "ВЕНЧЕР МЕНЕДЖМЕНТ ЛИМИТЕД"), and multiple family members carry sanctions and related party risk flags.



Your question:  who are the oligarchs in this data and what organizations are they associated with?



Question: who are the oligarchs in this data and what organizations are they associated with?
Found 10 relevant entities
Expanded to 18 entities (including neighbors)
Querying LLM...

ANSWER
Based on the knowledge graph data, there is **one oligarch** identified:

**Suleyman Abusaidovich KERIMOV**
- Risk Profile: Oligarch, Person of Interest (POI), Sanctioned, Politically Exposed Person (PEP)
- Source: OPEN-SANCTIONS

**Organizations Associated with Kerimov (through family network):**

The oligarch is connected to several Russian organizations through his son, Said Kerimov, who controls or founded the following entities:

1. **ООО "ГРАНДЕКО"** - Sanctioned/linked entity, POI
2. **ООО "НАЦИОНАЛЬНАЯ КИНОСЕТЬ"** (National Cinema Network) - Sanctioned/linked entity
3. **ООО "ЗАРЕЧЬЕ-ЭСТЕЙТ"** - Sanctioned/linked entity
4. **ООО "ВЕНЧЕР МЕНЕДЖМЕНТ ЛИМИТЕД"** (Venture Management Limited) - Sanctioned/linked entity

These organizations form an interconnected corporate structure with control 

Your question:  do you see any signatures of fraud in this dataset?



Question: do you see any signatures of fraud in this dataset?
Found 10 relevant entities
Expanded to 25 entities (including neighbors)
Querying LLM...

ANSWER
Yes, there are several potential fraud signatures worth investigating:

**Most Concerning - Potential Identity Manipulation:**
- **Dan Symmons**, **Daniel Simons**, and **Daniel Francis Symns** appear as separate individuals but have very similar names. This could indicate:
  - The same person using name variations to obscure beneficial ownership across multiple entities (FASTMOOR LIMITED, BLACKBROKE LIMITED, KINDLESHINE LIMITED, ZYMURGY LLP)
  - This pattern is commonly used in fraud schemes to avoid detection and circumvent ownership disclosure requirements

**Red Flags for Further Investigation:**

1. **PEP Risk with Family Connection**: N. T. Wright (PEP-flagged) has a family relationship with Julian Wright (RCA-flagged). While not fraudulent per se, PEPs require enhanced due diligence, especially when family members are inv

Your question:  do you see any signatures of fraud in this dataset?



Question: do you see any signatures of fraud in this dataset?
Found 10 relevant entities
Expanded to 25 entities (including neighbors)
Querying LLM...

ANSWER
Yes, there are several potential fraud signatures worth investigating:

**Most Concerning - Potential Identity Manipulation:**
- **Dan Symmons**, **Daniel Simons**, and **Daniel Francis Symns** appear as separate individuals but have very similar names. This could indicate:
  - The same person using name variations to obscure beneficial ownership across multiple entities (FASTMOOR LIMITED, BLACKBROKE LIMITED, KINDLESHINE LIMITED, ZYMURGY LLP)
  - This pattern is commonly used in fraud schemes to avoid detection and circumvent ownership disclosure requirements

**Red Flags for Further Investigation:**

1. **PEP Risk with Family Connection**: N. T. Wright (PEP-flagged) has a family relationship with Julian Wright (RCA-flagged). While not fraudulent per se, PEPs require enhanced due diligence, especially when family members are inv