# Filtering Queries with Session-Scoped UDTFs

This notebook demonstrates how to filter data when querying session-scoped UDTFs using WHERE clauses.

## Prerequisites

- UDTFs must be registered (see `basic_registration.ipynb`)
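- CDF credentials (`client_id`, `client_secret`, `tenant_id`, `cdf_cluster`, `project`) must be configured in Secret Manager; the queries below read them via `SECRET('cdf_sailboat_sailboat', ...)` and `SECRET('cdf_power_windturbine', ...)`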


## Step 1: Equality Filters


In [None]:
# Equality filter on a single property, returning every column (SELECT *).
#
# - The UDTF receives its CDF connection settings through
#   SECRET('cdf_sailboat_sailboat', <key>) rather than hard-coded values.
# - name/description are passed to the UDTF as NULL; the row filtering shown
#   here is done by the outer WHERE clause on the UDTF's output.
#   NOTE(review): presumably NULL means "no filter" at the UDTF-argument
#   level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT * FROM small_boat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
)
WHERE name = 'MyBoat'
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


In [None]:
# Equality filter on a single property, projecting only selected columns
# (external_id, name, space) instead of SELECT *.
#
# - Credentials are read through SECRET('cdf_sailboat_sailboat', <key>).
# - name/description are passed to the UDTF as NULL; filtering is done by the
#   outer WHERE clause on the UDTF's output.
#   NOTE(review): presumably NULL means "no filter" at the UDTF-argument
#   level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT 
    external_id,
    name,
    space
FROM small_boat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
)
WHERE name = 'MyBoat'
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


In [None]:
# Combined equality filter: both space AND external_id must match.
#
# - Queries vessel_udtf, with credentials read through
#   SECRET('cdf_sailboat_sailboat', <key>).
# - name/description are passed to the UDTF as NULL; filtering is done by the
#   outer WHERE clause on the UDTF's output.
#   NOTE(review): presumably NULL means "no filter" at the UDTF-argument
#   level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT * FROM vessel_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
)
WHERE space = 'sailboat' AND external_id = 'vessel-123'
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


## Step 2: Range Filters


In [None]:
# Range filter on timestamp, with results sorted by timestamp ascending.
#
# - Queries pump_view_udtf, with credentials read through
#   SECRET('cdf_power_windturbine', <key>).
# - Both bounds are exclusive (> and <), and they are written as string
#   literals.
#   NOTE(review): if `timestamp` is a TIMESTAMP column, '2025-12-31' likely
#   compares as midnight, which would exclude the rest of Dec 31 - confirm the
#   column type and whether >= / <= is intended.
# - Unlike the other cells, this query has no LIMIT.
#   NOTE(review): show() below only prints its default number of rows, but the
#   full result is still computed and sorted - confirm that is acceptable.
query = """
SELECT * FROM pump_view_udtf(
    client_id => SECRET('cdf_power_windturbine', 'client_id'),
    client_secret => SECRET('cdf_power_windturbine', 'client_secret'),
    tenant_id => SECRET('cdf_power_windturbine', 'tenant_id'),
    cdf_cluster => SECRET('cdf_power_windturbine', 'cdf_cluster'),
    project => SECRET('cdf_power_windturbine', 'project'),
    name => NULL,
    description => NULL
)
WHERE timestamp > '2025-01-01' AND timestamp < '2025-12-31'
ORDER BY timestamp;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


In [None]:
# Range filter on a numeric column: keeps rows with 100 < value < 200
# (both bounds exclusive).
#
# - Queries sensor_udtf, with credentials read through
#   SECRET('cdf_power_windturbine', <key>).
# - name/description are passed to the UDTF as NULL; filtering is done by the
#   outer WHERE clause on the UDTF's output.
#   NOTE(review): presumably NULL means "no filter" at the UDTF-argument
#   level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT * FROM sensor_udtf(
    client_id => SECRET('cdf_power_windturbine', 'client_id'),
    client_secret => SECRET('cdf_power_windturbine', 'client_secret'),
    tenant_id => SECRET('cdf_power_windturbine', 'tenant_id'),
    cdf_cluster => SECRET('cdf_power_windturbine', 'cdf_cluster'),
    project => SECRET('cdf_power_windturbine', 'project'),
    name => NULL,
    description => NULL
)
WHERE value > 100 AND value < 200
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


## Step 3: NULL Handling


In [None]:
# Exclude rows whose description is NULL (IS NOT NULL).
#
# - Queries small_boat_udtf, with credentials read through
#   SECRET('cdf_sailboat_sailboat', <key>).
# - The `description => NULL` UDTF argument is separate from the
#   `description IS NOT NULL` predicate: the former is an input to the UDTF,
#   the latter filters the rows it returns.
#   NOTE(review): presumably a NULL argument means "no filter" at the
#   UDTF-argument level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT * FROM small_boat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
)
WHERE description IS NOT NULL
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


In [None]:
# Find rows whose name is NULL (IS NULL).
#
# - Queries vessel_udtf, with credentials read through
#   SECRET('cdf_sailboat_sailboat', <key>).
# - The `name => NULL` UDTF argument is separate from the `name IS NULL`
#   predicate: the former is an input to the UDTF, the latter filters the rows
#   it returns.
#   NOTE(review): presumably a NULL argument means "no filter" at the
#   UDTF-argument level - confirm against the UDTF definition.
# - LIMIT 10 caps the result at ten rows.
query = """
SELECT * FROM vessel_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
)
WHERE name IS NULL
LIMIT 10;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)


## Step 4: Multiple Conditions


In [None]:
# Several predicates combined with AND: an equality on space, a lower bound on
# timestamp, an equality on status, and a lower bound on value. A row must
# satisfy all four to be returned.
#
# - Queries pump_view_udtf, with credentials read through
#   SECRET('cdf_power_windturbine', <key>).
# - name/description are passed to the UDTF as NULL; filtering is done by the
#   outer WHERE clause on the UDTF's output.
#   NOTE(review): presumably NULL means "no filter" at the UDTF-argument
#   level - confirm against the UDTF definition.
# - ORDER BY timestamp DESC with LIMIT 100 returns at most the 100 rows with
#   the latest timestamps.
query = """
SELECT * FROM pump_view_udtf(
    client_id => SECRET('cdf_power_windturbine', 'client_id'),
    client_secret => SECRET('cdf_power_windturbine', 'client_secret'),
    tenant_id => SECRET('cdf_power_windturbine', 'tenant_id'),
    cdf_cluster => SECRET('cdf_power_windturbine', 'cdf_cluster'),
    project => SECRET('cdf_power_windturbine', 'project'),
    name => NULL,
    description => NULL
)
WHERE space = 'power'
  AND timestamp > '2025-01-01'
  AND status = 'active'
  AND value > 50
ORDER BY timestamp DESC
LIMIT 100;
"""

# Execute the SQL and print the rows; truncate=False shows full column values.
result = spark.sql(query)
result.show(truncate=False)
