# AI Property Extractor — End-to-End Test Notebook

This notebook runs all 8 test scenarios from `TESTING.md` against the deployed AI Property Extractor.

**Project:** `out-of-bluefield`  
**Source view:** `cdf_cdm/CogniteAsset/v1`  
**Target view (for overwrite/append):** `rmdm/Asset/v2` (has `lastProcessedByAiExtractor`)  
**Instance space:** `springfield_instances`  
**Extraction pipeline:** `ep_ai_property_extractor`  
**Function:** `fn_ai_property_extractor`  

---

## 0. Setup & Client Initialization

In [1]:
from cognite.client import CogniteClient
from cognite.client.data_classes import ExtractionPipelineConfigWrite
from cognite.client.data_classes.data_modeling import (
    NodeApply, NodeOrEdgeData, ViewId, NodeId
)
import cognite.client.data_classes.data_modeling as dm
from dotenv import dotenv_values
import yaml
import time
from pathlib import Path

config = dotenv_values(".env")

client = CogniteClient.default_oauth_client_credentials(
    project=config["CDF_PROJECT"],
    cdf_cluster=config["CDF_CLUSTER"],
    tenant_id=config["IDP_TENANT_ID"],
    client_id=config["IDP_CLIENT_ID"],
    client_secret=config["IDP_CLIENT_SECRET"],
)

print(f"Connected to project: {client.config.project}")
print(f"Logged in as: {client.iam.token.inspect().subject}")

Connected to project: out-of-bluefield
Logged in as: 0021d776-b1ee-4386-a373-3bcf81bbbccd


In [2]:
# ── Constants ──
INSTANCE_SPACE = "springfield_instances"
SOURCE_VIEW = ViewId("cdf_cdm", "CogniteAsset", "v1")
TARGET_VIEW = ViewId("rmdm", "Asset", "6fe0dbb2ca7378")
EXTRACTION_PIPELINE = "ep_ai_property_extractor"
FUNCTION_EXT_ID = "fn_ai_property_extractor"
TEST_CONFIGS_DIR = Path("modules/ai_extractor/test_configs")
STATE_DB = "ai_extractor_state"
STATE_TABLE = "extraction_state"

# Test node external IDs
TEST_NODE_IDS = [
    "test_ai_ext_001",
    "test_ai_ext_002",
    "test_ai_ext_003",
]

## 0.1 Helper Functions

In [3]:
def upload_test_config(config_file: str):
    """Upload a test config YAML to the extraction pipeline."""
    config_path = TEST_CONFIGS_DIR / config_file
    with open(config_path) as f:
        config_yaml = f.read()
    
    # Parse and re-dump to strip comments (EP config is plain YAML)
    config_dict = yaml.safe_load(config_yaml)
    clean_yaml = yaml.dump(config_dict, default_flow_style=False)
    
    client.extraction_pipelines.config.create(
        ExtractionPipelineConfigWrite(
            external_id=EXTRACTION_PIPELINE,
            config=clean_yaml
        )
    )
    print(f"Uploaded config: {config_file}")
    for line in clean_yaml.strip().split("\n")[:15]:
        print(f"   {line}")
    if len(clean_yaml.strip().split("\n")) > 15:
        print(f"   ... ({len(clean_yaml.strip().splitlines())} lines total)")


def trigger_function(reset_state: bool = False, log_level: str = "DEBUG"):
    """Trigger the AI Property Extractor function and wait for result."""
    data = {
        "logLevel": log_level,
        "ExtractionPipelineExtId": EXTRACTION_PIPELINE,
    }
    if reset_state:
        data["resetState"] = True
    
    print(f"Triggering function with data: {data}")
    call = client.functions.call(
        external_id=FUNCTION_EXT_ID,
        data=data
    )
    print(f"   Call ID: {call.id}, Status: {call.status}")
    
    # Wait for completion
    print("   Waiting for completion...", end="")
    result = call.get_response()
    print(f" Done!")
    print(f"   Result: {result}")
    
    # Print logs
    print("\n   Function Logs:")
    logs = call.get_logs()
    for log_entry in logs:
        print(f"   {log_entry.message}")
    
    return result


def inspect_test_nodes(view_id: ViewId = SOURCE_VIEW, extra_view: ViewId = None):
    """Print properties of test nodes from the given view(s)."""
    sources = [view_id]
    if extra_view:
        sources.append(extra_view)
    
    results = client.data_modeling.instances.retrieve(
        nodes=[NodeId(INSTANCE_SPACE, ext_id) for ext_id in TEST_NODE_IDS],
        sources=sources,
    )
    
    for node in results.nodes:
        print(f"\n  {node.space}/{node.external_id}:")
        if not node.properties:
            print("   (no properties)")
            continue
        # node.properties is a dict keyed by ViewId -> dict of property values
        for view_key, prop_values in node.properties.items():
            print(f"   View: {view_key}")
            if isinstance(prop_values, dict):
                for key, val in prop_values.items():
                    display = val
                    if isinstance(val, str) and len(val) > 80:
                        display = val[:80] + "..."
                    print(f"     {key}: {display}")
            else:
                # Fallback: print raw
                print(f"     {prop_values}")


def inspect_state_store():
    """Print the current state store contents."""
    try:
        rows = client.raw.rows.list(
            db_name=STATE_DB,
            table_name=STATE_TABLE,
            limit=10
        )
        if not rows:
            print("   State store is empty")
            return
        for row in rows:
            print(f"\n   Row key: {row.key}")
            for k, v in row.columns.items():
                print(f"      {k}: {v}")
    except Exception as e:
        print(f"   Could not read state store: {e}")


def inspect_extraction_runs(limit: int = 3):
    """Print recent extraction pipeline runs."""
    runs = client.extraction_pipelines.runs.list(
        external_id=EXTRACTION_PIPELINE,
        limit=limit
    )
    for run in runs:
        msg = run.message[:120] if run.message else "N/A"
        print(f"   Status: {run.status} | {msg}")


def reset_state_store():
    """Delete the state store row to force a full re-run."""
    try:
        client.raw.rows.delete(
            db_name=STATE_DB,
            table_name=STATE_TABLE,
            key=["ai_property_extractor"]
        )
        print("   State store row deleted")
    except Exception as e:
        print(f"   Could not delete state store row (may not exist yet): {e}")

## 0.2 Verify Deployment

In [4]:
# Verify all deployed resources exist
print("=" * 60)
print("DEPLOYMENT VERIFICATION")
print("=" * 60)

# 1. Function
try:
    fn = client.functions.retrieve(external_id=FUNCTION_EXT_ID)
    print(f"\n  Function: {fn.external_id} (status: {fn.status})")
except Exception as e:
    print(f"\n  Function not found: {e}")

# 2. Extraction Pipeline
try:
    ep = client.extraction_pipelines.retrieve(external_id=EXTRACTION_PIPELINE)
    print(f"  Extraction Pipeline: {ep.external_id}")
except Exception as e:
    print(f"  Extraction Pipeline not found: {e}")

# 3. Agent
try:
    agent = client.agents.retrieve(external_id="ai_property_extractor_agent")
    print(f"  Agent: {agent.external_id}")
except Exception as e:
    print(f"  Agent not found: {e}")

# 4. Source view
try:
    views = client.data_modeling.views.retrieve(SOURCE_VIEW)
    print(f"  Source View: {SOURCE_VIEW} ({len(views[0].properties)} properties)")
except Exception as e:
    print(f"  Source View not found: {e}")

# 5. Target view
try:
    views = client.data_modeling.views.retrieve(TARGET_VIEW)
    target_props = list(views[0].properties.keys())
    has_ai_ts = "lastProcessedByAiExtractor" in target_props
    print(f"  Target View: {TARGET_VIEW} ({len(target_props)} properties)")
    print(f"   lastProcessedByAiExtractor: {'PRESENT' if has_ai_ts else 'MISSING!'}")
    print(f"   Properties: {', '.join(sorted(target_props)[:15])}{'...' if len(target_props) > 15 else ''}")
except Exception as e:
    print(f"  Target View not found: {e}")

# 6. State store
try:
    tables = client.raw.tables.list(db_name=STATE_DB, limit=10)
    table_names = [t.name for t in tables]
    print(f"  State Store DB: {STATE_DB} (tables: {table_names})")
except Exception as e:
    print(f"  State Store DB not found: {e}")

print("\n" + "=" * 60)

DEPLOYMENT VERIFICATION

  Function: fn_ai_property_extractor (status: Ready)
  Extraction Pipeline: ep_ai_property_extractor
  Agent not found: AgentsAPI.retrieve() got an unexpected keyword argument 'external_id'
  Source View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1') (23 properties)
  Target View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378') (31 properties)
   lastProcessedByAiExtractor: PRESENT
   Properties: activities, aiTags, aliases, assetClass, children, description, equipment, failureNotifications, files, functionCode, functionCodeDesc, indirectMaintenanceOrders, lastProcessedByAiExtractor, maintenanceOrders, name...
  State Store DB: ai_extractor_state (tables: ['extraction_state'])



## 0.3 Create Test Data

Creates 3 test nodes in springfield_instances with descriptions suitable for tag/alias extraction.

In [5]:
# Create test nodes with descriptions for extraction
test_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id="test_ai_ext_001",
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "name": "Pump P-101",
            "description": (
                "Centrifugal pump for cooling water circulation. "
                "Located in building B3, floor 2. Manufacturer: Grundfos. "
                "Capacity: 500 L/min. Installed 2019. "
                "Tags: cooling, water, pump, centrifugal"
            ),
        })]
    ),
    NodeApply(
        space=INSTANCE_SPACE,
        external_id="test_ai_ext_002",
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "name": "Valve V-202",
            "description": (
                "Ball valve for steam isolation. "
                "DN150, PN40. Material: Stainless steel 316L. "
                "Installed in the main steam header. "
                "Tags: steam, isolation, valve, ball-valve"
            ),
        })]
    ),
    NodeApply(
        space=INSTANCE_SPACE,
        external_id="test_ai_ext_003",
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "name": "Compressor C-301",
            "description": (
                "Reciprocating air compressor for instrument air. "
                "Max pressure: 10 bar. Flow rate: 200 Nm3/h. "
                "Manufacturer: Atlas Copco. Year: 2021. "
                "Tags: compressor, air, instrument-air, reciprocating"
            ),
        })]
    ),
]

result = client.data_modeling.instances.apply(nodes=test_nodes)
print(f"Created/updated {len(result.nodes)} test nodes")

# Verify
inspect_test_nodes()

Created/updated 3 test nodes

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: []
     name: Pump P-101
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....

  springfield_instances/test_ai_ext_002:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: []
     name: Valve V-202
     description: Ball valve for steam isolation. DN150, PN40. Material: Stainless steel 316L. Ins...

  springfield_instances/test_ai_ext_003:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: []
     name: Compressor C-301
     description: Reciprocating air compressor for instrument air. Max pressure: 10 bar. Flow rate...


---

## Test 1: add_new_only (default)

**Goal:** Properties are only extracted for nodes that don't have them yet.  
**Config:** config_add_new_only.yaml  
**Expected:**
- Nodes with empty tags/aliases get extracted values
- Second run processes 0 nodes (properties already populated)

In [6]:
# ── Test 1: Setup ──
print("=" * 60)
print("TEST 1: add_new_only mode")
print("=" * 60)

# Clear any previous test state
reset_state_store()

# Clear tags and aliases on test nodes
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "tags": None,
            "aliases": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared tags and aliases on test nodes")

# Upload config
upload_test_config("config_add_new_only.yaml")

print("\nBefore extraction:")
inspect_test_nodes()

TEST 1: add_new_only mode
   State store row deleted
Cleared tags and aliases on test nodes
Uploaded config: config_add_new_only.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     properties:
     - property: tags
       writeMode: add_new_only
     - property: aliases
       writeMode: add_new_only
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   
   ... (31 lines total)

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:11:51.797543+00:00
     name: Pump P-101
     description: Centrifugal pump for cooling water circulation. Located in bu

In [7]:
# ── Test 1: Run 1 — Should extract tags and aliases ──
print("\nRun 1: Extracting properties...")
result = trigger_function()


Run 1: Extracting properties...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 7660323106836456, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 260 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags and aliases from the asset descriptio

In [8]:
# ── Test 1: Verify Run 1 ──
print("\nAfter Run 1:")
inspect_test_nodes()

print("\nExtraction Pipeline Runs:")
inspect_extraction_runs()

print("\nState Store:")
inspect_state_store()


After Run 1:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:12:04.310131+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....

  springfield_instances/test_ai_ext_002:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}
     pathLastUpdatedTime: 2026-02-11T07:12:04.310131+00:00
     name: Valve V-202
     tags: ['steam', 'isolation', 'valve', 'ball-valve']
     description: Ball valve for steam isolation. DN150

In [9]:
# ── Test 1: Run 2 — Should process 0 nodes (already populated) ──
print("\nRun 2: Should skip all nodes (already have tags/aliases)...")
result = trigger_function()

print("\n PASS if processed=0 above")


Run 2: Should skip all nodes (already have tags/aliases)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 6349392362329567, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 251 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags and alias

---

## Test 2: overwrite mode

**Goal:** Existing values are replaced; nodes are not reprocessed within the same epoch.  
**Config:** config_overwrite.yaml (uses rmdm/Asset/2 as target with lastProcessedByAiExtractor)  
**Expected:**
- All matching nodes get extracted tags and aliases
- lastProcessedByAiExtractor timestamp is set
- Second run processes 0 nodes (epoch filter)
- resetState: true causes full reprocessing

In [10]:
# ── Test 2: Setup ──
print("=" * 60)
print("TEST 2: overwrite mode")
print("=" * 60)

# Reset state
reset_state_store()

# Clear target view properties on test nodes
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(TARGET_VIEW, properties={
            "tags": None,
            "aliases": None,
            "lastProcessedByAiExtractor": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared target view properties on test nodes")

# Upload config
upload_test_config("config_overwrite.yaml")

print("\nBefore extraction:")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

TEST 2: overwrite mode
   State store row deleted
Cleared target view properties on test nodes
Uploaded config: config_overwrite.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     aiTimestampProperty: lastProcessedByAiExtractor
     properties:
     - property: tags
       writeMode: overwrite
     - property: aliases
       writeMode: overwrite
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   ... (36 lines total)

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:12:04.310131+00:00
     name: Pump P-101
     description: Centrifugal pump f

In [11]:
# ── Test 2: Run 1 — Should extract and overwrite ──
print("\nRun 1: Extracting with overwrite mode...")
result = trigger_function()


Run 1: Extracting with overwrite mode...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 1150121769903883, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 3 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: overwrite
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions

In [12]:
# ── Test 2: Verify Run 1 ──
print("\nAfter Run 1 (check tags, aliases, AND lastProcessedByAiExtractor):")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

print("\nState Store:")
inspect_state_store()


After Run 1 (check tags, aliases, AND lastProcessedByAiExtractor):

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:29:34.200087+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....
   View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378')
     lastProcessedByAiExtractor: 2026-02-11T07:29:33.817+00:00
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:29:34.200087+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pum

In [13]:
# ── Test 2: Run 2 — Should process 0 (epoch filter) ──
print("\nRun 2: Should skip all nodes (same epoch)...")
result = trigger_function()

print("\n PASS if processed=0 above")


Run 2: Should skip all nodes (same epoch)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 1560718943295544, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 0 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: overwrite
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstruct

In [14]:
# ── Test 2: Run 3 — Reset state, should reprocess all ──
print("\nRun 3: Reset state -> should reprocess all nodes...")
result = trigger_function(reset_state=True)

print("\nAfter Run 3 (values should be overwritten with fresh results):")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

print("\n PASS if processed > 0 above and timestamps updated")


Run 3: Reset state -> should reprocess all nodes...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor', 'resetState': True}
   Call ID: 1933792755381937, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 3 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: overwrite
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:


---

## Test 3: append mode

**Goal:** New values are appended to existing lists (with deduplication).  
**Config:** config_append.yaml (uses rmdm/Asset/2 as target)  
**Expected:**
- Existing tags are preserved, new ones appended
- No duplicates
- lastProcessedByAiExtractor is set
- Second run processes 0 nodes (epoch filter)

In [15]:
# ── Test 3: Setup ──
print("=" * 60)
print("TEST 3: append mode")
print("=" * 60)

# Reset state
reset_state_store()

# Pre-populate test_ai_ext_001 with existing tags
pre_populate = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id="test_ai_ext_001",
        sources=[NodeOrEdgeData(TARGET_VIEW, properties={
            "tags": ["existing-tag-1", "existing-tag-2"],
            "aliases": ["existing-alias"],
            "lastProcessedByAiExtractor": None,  # Clear timestamp
        })]
    ),
]
# Clear the other nodes
for ext_id in TEST_NODE_IDS[1:]:
    pre_populate.append(
        NodeApply(
            space=INSTANCE_SPACE,
            external_id=ext_id,
            sources=[NodeOrEdgeData(TARGET_VIEW, properties={
                "tags": None,
                "aliases": None,
                "lastProcessedByAiExtractor": None,
            })]
        )
    )

client.data_modeling.instances.apply(nodes=pre_populate)
print("Pre-populated test_ai_ext_001 with existing tags: ['existing-tag-1', 'existing-tag-2']")
print("   and existing aliases: ['existing-alias']")

# Upload config
upload_test_config("config_append.yaml")

print("\nBefore extraction:")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

TEST 3: append mode
   State store row deleted
Pre-populated test_ai_ext_001 with existing tags: ['existing-tag-1', 'existing-tag-2']
   and existing aliases: ['existing-alias']
Uploaded config: config_append.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     aiTimestampProperty: lastProcessedByAiExtractor
     properties:
     - property: tags
       writeMode: append
     - property: aliases
       writeMode: append
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   ... (36 lines total)

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:29:

In [16]:
# ── Test 3: Run 1 — Should append new values ──
print("\nRun 1: Extracting with append mode...")
result = trigger_function()


Run 1: Extracting with append mode...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 5363038844260422, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 3 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: append
             - property: aliases
               writeMode: append
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extrac

In [17]:
# ── Test 3: Verify Run 1 ──
print("\nAfter Run 1:")
print("   Check that test_ai_ext_001 tags contain BOTH 'existing-tag-1/2' AND new LLM tags")
print("   Check no duplicates")
print("   Check lastProcessedByAiExtractor is set")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)


After Run 1:
   Check that test_ai_ext_001 tags contain BOTH 'existing-tag-1/2' AND new LLM tags
   Check no duplicates
   Check lastProcessedByAiExtractor is set

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:16.710578+00:00
     name: Pump P-101
     tags: ['existing-tag-1', 'existing-tag-2', 'cooling', 'water', 'pump', 'centrifugal']
     aliases: ['existing-alias']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....
   View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378')
     lastProcessedByAiExtractor: 2026-02-11T07:30:16.176+00:00
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfie

In [18]:
# ── Test 3: Run 2 — Should process 0 (epoch filter) ──
print("\nRun 2: Should skip all nodes (same epoch)...")
result = trigger_function()

print("\n PASS if processed=0 above")


Run 2: Should skip all nodes (same epoch)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 1368258748473899, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 0 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: append
             - property: aliases
               writeMode: append
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 

---

## Test 4: Mixed write modes

**Goal:** Config with both add_new_only and overwrite properties works correctly.  
**Expected:**
- tags (add_new_only) is NOT overwritten if already populated
- aliases (overwrite) IS always replaced

In [19]:
# ── Test 4: Setup ──
print("=" * 60)
print("TEST 4: Mixed write modes (add_new_only + overwrite)")
print("=" * 60)

# Reset state
reset_state_store()

# Pre-populate test_ai_ext_001 with existing tags (but empty aliases)
setup_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id="test_ai_ext_001",
        sources=[NodeOrEdgeData(TARGET_VIEW, properties={
            "tags": ["pre-existing-tag"],  # Should NOT be overwritten
            "aliases": None,                # Should be written
            "lastProcessedByAiExtractor": None,
        })]
    ),
]
for ext_id in TEST_NODE_IDS[1:]:
    setup_nodes.append(
        NodeApply(
            space=INSTANCE_SPACE,
            external_id=ext_id,
            sources=[NodeOrEdgeData(TARGET_VIEW, properties={
                "tags": None,
                "aliases": None,
                "lastProcessedByAiExtractor": None,
            })]
        )
    )
client.data_modeling.instances.apply(nodes=setup_nodes)
print("test_ai_ext_001: tags=['pre-existing-tag'], aliases=None")
print("   Other nodes: tags=None, aliases=None")

# Create and upload a mixed-mode config inline
mixed_config = {
    "agent": {"externalId": "ai_property_extractor_agent"},
    "view": {"space": "cdf_cdm", "externalId": "CogniteAsset", "version": "v1"},
    "targetView": {"space": "rmdm", "externalId": "Asset", "version": "6fe0dbb2ca7378"},
    "extraction": {
        "textProperty": "description",
        "aiTimestampProperty": "lastProcessedByAiExtractor",
        "properties": [
            {"property": "tags", "writeMode": "add_new_only"},
            {"property": "aliases", "writeMode": "overwrite"},
        ],
    },
    "processing": {"batchSize": 10, "llmBatchSize": 1},
    "stateStore": {
        "enabled": True,
        "rawDatabase": STATE_DB,
        "rawTable": STATE_TABLE,
        "configVersion": "v1-mixed",
    },
    "prompt": {
        "customInstructions": "Extract relevant tags and aliases from the asset description."
    },
}

client.extraction_pipelines.config.create(
    ExtractionPipelineConfigWrite(
        external_id=EXTRACTION_PIPELINE,
        config=yaml.dump(mixed_config, default_flow_style=False)
    )
)
print("\nUploaded mixed-mode config")

print("\nBefore extraction:")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

TEST 4: Mixed write modes (add_new_only + overwrite)
   State store row deleted
test_ai_ext_001: tags=['pre-existing-tag'], aliases=None
   Other nodes: tags=None, aliases=None

Uploaded mixed-mode config

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:16.710578+00:00
     name: Pump P-101
     tags: ['pre-existing-tag']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....
   View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:16.

In [20]:
# ── Test 4: Run ──
print("\nRunning mixed-mode extraction...")
result = trigger_function() 


Running mixed-mode extraction...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 8982270174014594, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 3 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: Ext

In [21]:
# ── Test 4: Verify ──
print("\nAfter extraction:")
print("   test_ai_ext_001: tags should STILL be ['pre-existing-tag'] (add_new_only, already populated)")
print("   test_ai_ext_001: aliases should have LLM-extracted values (overwrite)")
print("   Other nodes: both tags and aliases should have values")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)


After extraction:
   test_ai_ext_001: tags should STILL be ['pre-existing-tag'] (add_new_only, already populated)
   test_ai_ext_001: aliases should have LLM-extracted values (overwrite)
   Other nodes: both tags and aliases should have values

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:35.831374+00:00
     name: Pump P-101
     tags: ['pre-existing-tag']
     aliases: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....
   View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378')
     lastProcessedByAiExtractor: 2026-02-11T07:30:35.522+00:00
     path: [{'space': 'springfield_instances', 'externalId': 'tes

---
## Test 5: Target view (cross-view extraction)

**Goal:** Read from `cdf_cdm/CogniteAsset/v1`, write extracted properties to `rmdm/Asset`.  
**Config:** config_target_view.yaml (overwrite mode + aiTimestampProperty)  
**Expected:**
- Tags and aliases are extracted and written correctly
- `lastProcessedByAiExtractor` is set on the target view (proves writes go to target)
- Second run processes 0 (epoch filter via target view's aiTimestamp)

**Note:** Since `rmdm/Asset` implements `CogniteAsset`, shared properties (tags, aliases)
live in the same underlying container. The key cross-view validation is that
`lastProcessedByAiExtractor` (which only exists on the target view) gets written correctly.

In [22]:
# ── Test 5: Setup ──
print("=" * 60)
print("TEST 5: Target view (cross-view extraction)")
print("=" * 60)

reset_state_store()

# Clear tags, aliases, and lastProcessedByAiExtractor on both views
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[
            NodeOrEdgeData(SOURCE_VIEW, properties={
                "tags": None,
                "aliases": None,
            }),
            NodeOrEdgeData(TARGET_VIEW, properties={
                "tags": None,
                "aliases": None,
                "lastProcessedByAiExtractor": None,
            }),
        ]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared tags, aliases, and lastProcessedByAiExtractor on all test nodes")

# Upload config (overwrite mode + aiTimestampProperty + targetView)
upload_test_config("config_target_view.yaml")

print("\nBefore extraction:")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)

TEST 5: Target view (cross-view extraction)
   State store row deleted
Cleared tags, aliases, and lastProcessedByAiExtractor on all test nodes
Uploaded config: config_target_view.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     aiTimestampProperty: lastProcessedByAiExtractor
     properties:
     - property: tags
       writeMode: overwrite
     - property: aliases
       writeMode: overwrite
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   ... (36 lines total)

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:35.831374+00:00
     nam

In [23]:
# ── Test 5: Run 1 — Should extract and write to target view ──
print("\nRun 1: Extracting with cross-view config (overwrite + aiTimestamp)...")
result = trigger_function()


Run 1: Extracting with cross-view config (overwrite + aiTimestamp)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 8728979186139068, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 3 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: overwrite
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
  

In [24]:
# ── Test 5: Verify Run 1 ──
print("\nAfter Run 1:")
print("   Check: tags and aliases should have LLM-extracted values")
print("   Check: lastProcessedByAiExtractor should be set on target view")
print("   (This proves writes went to the target view's container)")
inspect_test_nodes(SOURCE_VIEW, TARGET_VIEW)


After Run 1:
   Check: tags and aliases should have LLM-extracted values
   Check: lastProcessedByAiExtractor should be set on target view
   (This proves writes went to the target view's container)

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:55.151222+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....
   View: ViewId(space='rmdm', external_id='Asset', version='6fe0dbb2ca7378')
     lastProcessedByAiExtractor: 2026-02-11T07:30:54.915+00:00
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'tes

In [25]:
# ── Test 5: Run 2 — Should process 0 (epoch filter via target view aiTimestamp) ──
print("\nRun 2: Should skip all nodes (same epoch, aiTimestamp on target view)...")
result = trigger_function()

print("\n PASS if processed=0 above (epoch filter uses lastProcessedByAiExtractor on target view)")


Run 2: Should skip all nodes (same epoch, aiTimestamp on target view)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 5125764066798214, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 0 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             aiTimestampProperty: lastProcessedByAiExtractor
             properties:
             - property: tags
               writeMode: overwrite
             - property: aliases
               writeMode: overwrite
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:

---
## Test 6: LLM batch processing

Sends multiple instances to the LLM in a single prompt (llmBatchSize=5). Check logs for batch messages.

In [26]:
# ── Test 6: Setup ──
print("=" * 60)
print("TEST 6: LLM batch processing")
print("=" * 60)

reset_state_store()

# Clear tags/aliases on source view
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "tags": None,
            "aliases": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared tags and aliases")

upload_test_config("config_llm_batch.yaml")

print("\nBefore extraction:")
inspect_test_nodes()

TEST 6: LLM batch processing
   State store row deleted
Cleared tags and aliases
Uploaded config: config_llm_batch.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     properties:
     - property: tags
       writeMode: add_new_only
     - property: aliases
       writeMode: add_new_only
     textProperty: description
   processing:
     batchSize: 20
     llmBatchSize: 5
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   
   ... (31 lines total)

Before extraction:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:30:55.151222+00:00
     name: Pump P-101
     description: Centrifugal pump for cooling water circulation. Located in building B3, flo

In [27]:
# ── Test 6: Run ──
print("\nRunning with llmBatchSize=5...")
print("   Look for 'Sending batch prompt to agent' in logs")
result = trigger_function()


Running with llmBatchSize=5...
   Look for 'Sending batch prompt to agent' in logs
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 6718198570637307, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 2796 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 20
             llmBatchSize: 5
           prompt:
             customInstructions: 'Extract

In [28]:
# ── Test 6: Verify ──
print("\nAfter extraction:")
print("   All nodes should have tags and aliases")
print("   Check logs above for batch processing messages")
inspect_test_nodes()


After extraction:
   All nodes should have tags and aliases
   Check logs above for batch processing messages

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T07:31:06.034974+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....

  springfield_instances/test_ai_ext_002:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}
     pathLastUpdatedTime: 2026-02-11T07:31:06.034974+00:00
     name: Valve V-202
     tags: ['st

---
## Test 7: State reset

Tests that resetState=true causes a full re-run. With add_new_only, nodes with existing properties are still skipped by the property filter.

In [29]:
# ── Test 7: Setup ──
print("=" * 60)
print("TEST 7: State reset")
print("=" * 60)

reset_state_store()

# Clear tags/aliases
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "tags": None,
            "aliases": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared tags and aliases")

upload_test_config("config_add_new_only.yaml")

TEST 7: State reset
   State store row deleted
Cleared tags and aliases
Uploaded config: config_add_new_only.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     properties:
     - property: tags
       writeMode: add_new_only
     - property: aliases
       writeMode: add_new_only
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   
   ... (31 lines total)


In [30]:
# ── Test 7: Run 1 — Normal run ──
print("\nRun 1: Normal run (should process all)...")
result = trigger_function()

print("\nState Store after Run 1:")
inspect_state_store()


Run 1: Normal run (should process all)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 4990431140911111, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 249 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags and aliases from the asset 

In [31]:
# ── Test 7: Run 2 — Should process 0 ──
print("\nRun 2: Should process 0 (properties already populated)...")
result = trigger_function()

print("\n PASS if processed=0")


Run 2: Should process 0 (properties already populated)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 2989139282152006, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 260 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags and aliases

In [32]:
# ── Test 7: Run 3 — Reset state (add_new_only still skips populated) ──
print("\nRun 3: Reset state (should still process 0 for add_new_only since properties exist)...")
result = trigger_function(reset_state=True)

print("\nState Store after reset:")
inspect_state_store()

print("\nNote: With add_new_only mode, resetState clears state store stats")
print("   but nodes with existing tags/aliases are still skipped by the property filter.")
print("   To truly re-extract, clear the target properties first.")


Run 3: Reset state (should still process 0 for add_new_only since properties exist)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor', 'resetState': True}
   Call ID: 6826650445016389, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 257 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             custo

In [33]:
# ── Test 7: Run 4 — Reset state + clear properties -> should reprocess ──
print("\nRun 4: Clear properties + reset state -> should reprocess all...")

clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "tags": None,
            "aliases": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("   Cleared tags and aliases")

result = trigger_function(reset_state=True)

print("\n PASS if processed > 0")


Run 4: Clear properties + reset state -> should reprocess all...
   Cleared tags and aliases
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor', 'resetState': True}
   Call ID: 3913207632155495, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 265 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
            

---
## Test 8: No state store

Function works without state store. Processes everything every run, but add_new_only property filter still prevents reprocessing populated nodes.

In [34]:
# ── Test 8: Setup ──
print("=" * 60)
print("TEST 8: No state store")
print("=" * 60)

reset_state_store()

# Clear tags/aliases
clear_nodes = [
    NodeApply(
        space=INSTANCE_SPACE,
        external_id=ext_id,
        sources=[NodeOrEdgeData(SOURCE_VIEW, properties={
            "tags": None,
            "aliases": None,
        })]
    )
    for ext_id in TEST_NODE_IDS
]
client.data_modeling.instances.apply(nodes=clear_nodes)
print("Cleared tags and aliases")

upload_test_config("config_no_statestore.yaml")

TEST 8: No state store
   State store row deleted
Cleared tags and aliases
Uploaded config: config_no_statestore.yaml
   agent:
     externalId: ai_property_extractor_agent
   extraction:
     properties:
     - property: tags
       writeMode: add_new_only
     - property: aliases
       writeMode: add_new_only
     textProperty: description
   processing:
     batchSize: 10
     llmBatchSize: 1
   prompt:
     customInstructions: 'Extract relevant tags and aliases from the asset description.
   
   ... (28 lines total)


In [35]:
# ── Test 8: Run 1 — Should process all ──
print("\nRun 1: Processing without state store...")
result = trigger_function()


Run 1: Processing without state store...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 5284396764957818, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 261 instances, updated 3 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags and aliases from the asset d

In [36]:
# ── Test 8: Verify Run 1 ──
print("\nAfter Run 1:")
inspect_test_nodes()

print("\nState Store (should be empty / no new rows):")
inspect_state_store()


After Run 1:

  springfield_instances/test_ai_ext_001:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_001'}
     pathLastUpdatedTime: 2026-02-11T08:15:36.407062+00:00
     name: Pump P-101
     tags: ['cooling', 'water', 'pump', 'centrifugal']
     description: Centrifugal pump for cooling water circulation. Located in building B3, floor 2....

  springfield_instances/test_ai_ext_002:
   View: ViewId(space='cdf_cdm', external_id='CogniteAsset', version='v1')
     path: [{'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}]
     root: {'space': 'springfield_instances', 'externalId': 'test_ai_ext_002'}
     pathLastUpdatedTime: 2026-02-11T08:15:36.407062+00:00
     name: Valve V-202
     tags: ['steam', 'isolation', 'valve', 'ball-valve']
     description: Ball valve for steam isolation. DN150

In [37]:
# ── Test 8: Run 2 — add_new_only should still skip populated nodes ──
print("\nRun 2: Should skip populated nodes (add_new_only property filter)...")
result = trigger_function()

print("\n PASS if processed=0 (property filter prevents reprocessing even without state store)")


Run 2: Should skip populated nodes (add_new_only property filter)...
Triggering function with data: {'logLevel': 'DEBUG', 'ExtractionPipelineExtId': 'ep_ai_property_extractor'}
   Call ID: 8021292588197261, Status: Completed
   Waiting for completion... Done!
   Result: {'message': 'ai_property_extractor executed successfully. Processed 270 instances, updated 0 instances.', 'status': 'success'}

   Function Logs:
   [INFO] Starting ai_property_extractor
   [INFO] Reading config from extraction pipeline: ep_ai_property_extractor
   [DEBUG] Raw config: agent:
             externalId: ai_property_extractor_agent
           extraction:
             properties:
             - property: tags
               writeMode: add_new_only
             - property: aliases
               writeMode: add_new_only
             textProperty: description
           processing:
             batchSize: 10
             llmBatchSize: 1
           prompt:
             customInstructions: 'Extract relevant tags 

---
## 9. Cleanup

Remove test data and reset state store.

In [38]:
# ── Cleanup: Delete test nodes ──
print("Cleaning up test data...")

client.data_modeling.instances.delete(
    nodes=[NodeId(INSTANCE_SPACE, ext_id) for ext_id in TEST_NODE_IDS]
)
print(f"   Deleted {len(TEST_NODE_IDS)} test nodes")

# Reset state store
reset_state_store()

print("\nCleanup complete!")

Cleaning up test data...
   Deleted 3 test nodes
   State store row deleted

Cleanup complete!
