# Querying Data with Session-Scoped UDTFs

This notebook demonstrates various ways to query data using session-scoped UDTFs.

## Prerequisites

- UDTFs must be registered (see `basic_registration.ipynb`)
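- CDF credentials configured in Secret Manager (the queries below read `client_id`, `client_secret`, `tenant_id`, `cdf_cluster`, and `project` via `SECRET('cdf_sailboat_sailboat', ...)`)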


## Step 1: Query Single UDTF


In [None]:
# Minimal example: read at most 10 rows from smallboat_udtf.
# Every credential argument is resolved through SECRET(); both `name` and
# `description` are passed as NULL.
query = """
SELECT * FROM smallboat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
) LIMIT 10;
"""

result = spark.sql(query)
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result.show(n=20, truncate=False)


## Step 2: Select All Properties vs Specific Properties


In [None]:
# First half of the comparison: SELECT * returns every column the UDTF
# exposes (capped at 10 rows by the LIMIT clause).
query_all = """
SELECT * FROM smallboat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
) LIMIT 10;
"""

result_all = spark.sql(query_all)
print("All properties:")
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_all.show(n=20, truncate=False)


In [None]:
# Second half of the comparison: project only external_id, name and space
# instead of every column (still capped at 10 rows).
query_subset = """
SELECT 
    external_id,
    name,
    space
FROM smallboat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => NULL,
    description => NULL
) LIMIT 10;
"""

result_subset = spark.sql(query_subset)
print("Specific properties only:")
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_subset.show(n=20, truncate=False)


## Step 3: Named Parameters (Recommended over Positional)


In [None]:
# Every argument is passed by name (`param => value`). This is the
# recommended calling style: it is clearer to read and easier to maintain.
# Here `name` is given the literal 'MyBoat' while `description` stays NULL.
query_named = """
SELECT * FROM smallboat_udtf(
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project'),
    name => 'MyBoat',
    description => NULL
) LIMIT 10;
"""

result_named = spark.sql(query_named)
print("Named parameters:")
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_named.show(n=20, truncate=False)


## Step 4: Time Series UDTF Queries


In [None]:
# Informational cell only: nothing is registered here.
# Time series UDTFs are registered automatically by register_session_scoped_udtfs()
# and are part of the registered dictionary produced in basic_registration.ipynb.
# They use template-based generation like the data model UDTFs, so no manual
# registration step is required before the queries below.

# The leading character is the check mark U+2713; it had been stored as mojibake
# (UTF-8 bytes decoded as cp1252) and is restored here.
print("✓ Time Series UDTFs are auto-registered (no manual registration needed)")


In [None]:
# Datapoints for one time series, identified by a single `instance_id`,
# over the relative window '47w-ago' .. '46w-ago'.
# Rows are ordered by timestamp and capped at 10.
query_ts = """
SELECT * FROM time_series_datapoints_udtf(
    instance_id => 'sailboat:vessels.urn:mrn:imo:mmsi:258219000::129038::navigation.speedOverGround',
    start => '47w-ago',
    end => '46w-ago',
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project')
)
ORDER BY timestamp
LIMIT 10;
"""

result_ts = spark.sql(query_ts)
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_ts.show(n=20, truncate=False)


In [None]:
# Several time series in one call, returned in long format.
# `instance_ids` is a single comma-separated string whose entries use the
# "space:external_id" format; the time series may come from different spaces.
# Rows are ordered by time_series_external_id, then timestamp, capped at 20.
query_ts_long = """
SELECT * FROM time_series_datapoints_long_udtf(
    instance_ids => 'sailboat:vessels.urn:mrn:imo:mmsi:258219000::129038::navigation.speedOverGround,sailboat:vessels.urn:mrn:imo:mmsi:258219000::129038::navigation.courseOverGroundTrue',  -- Format: "space:external_id"
    start => '47w-ago',
    end => '46w-ago',
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project')
)
ORDER BY time_series_external_id, timestamp
LIMIT 20;
"""

result_ts_long = spark.sql(query_ts_long)
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_ts_long.show(n=20, truncate=False)


In [None]:
# Latest datapoint lookup for several time series at once.
# `instance_ids` is a single comma-separated string whose entries use the
# "space:external_id" format; the time series may come from different spaces.
# `before` bounds the lookup in time and `include_status` adds status_code
# to the output (see the inline SQL comments). Rows are ordered by
# time_series_external_id; there is no LIMIT clause on this query.
query_ts_latest = """
SELECT * FROM time_series_latest_datapoints_udtf(
    instance_ids => 'sailboat:vessels.urn:mrn:imo:mmsi:258219000::129038::navigation.speedOverGround,sailboat:vessels.urn:mrn:imo:mmsi:258219000::129038::navigation.courseOverGroundTrue',  -- Format: "space:external_id"
    before => 'now',  -- Get latest before this time (or use '1h-ago', ISO 8601, etc.)
    include_status => true,  -- Include status_code in output
    client_id => SECRET('cdf_sailboat_sailboat', 'client_id'),
    client_secret => SECRET('cdf_sailboat_sailboat', 'client_secret'),
    tenant_id => SECRET('cdf_sailboat_sailboat', 'tenant_id'),
    cdf_cluster => SECRET('cdf_sailboat_sailboat', 'cdf_cluster'),
    project => SECRET('cdf_sailboat_sailboat', 'project')
)
ORDER BY time_series_external_id;
"""

result_ts_latest = spark.sql(query_ts_latest)
# n=20 is show()'s default row cap, written out explicitly;
# truncate=False prints full cell values instead of shortening them.
result_ts_latest.show(n=20, truncate=False)
