diff --git a/cleancloud/providers/gcp/rules/ai/featurestore_idle.py b/cleancloud/providers/gcp/rules/ai/featurestore_idle.py index 53aa939..53fb1ba 100644 --- a/cleancloud/providers/gcp/rules/ai/featurestore_idle.py +++ b/cleancloud/providers/gcp/rules/ai/featurestore_idle.py @@ -1,7 +1,64 @@ +""" +Rule: gcp.vertex.featurestore.idle + + (spec — docs/specs/gcp/ai/featurestore_idle.md) + +Intent: + Detect Vertex AI feature serving stores with documented, provisioned + online-serving capacity that show no documented online-serving + request-count telemetry over a conservative review window. + + This is a conservative review-candidate rule only. It is not proof that + a store is safe to delete, not proof that offline feature workflows are + unused, and not proof of a specific monthly saving. + +Covered resource families: + - Vertex AI Feature Store (Legacy) Featurestore (spec 9.1) + - Vertex AI Feature Online Store with Bigtable online serving (spec 9.2) + +Exclusions: + - resource name malformed or store ID / region absent (spec 7) + - region filter set and region does not exactly match (spec 4.4) + - state not exactly STABLE (spec 9.1.2, 9.2.2) + - reference_time absent, unparsable, or in the future (spec 7) + - store younger than full observation window (spec 4.7) + - legacy: fixedNodeCount == 0 and no valid scaling.minNodeCount (spec 9.3) + - legacy: both fixedNodeCount and scaling.minNodeCount materially present — invalid mode (spec 7) + - FeatureOnlineStore: storage type not exactly Bigtable (spec 9.2.5, 9.3) + - FeatureOnlineStore: bigtable.autoScaling absent, unusable, or maxNodeCount < minNodeCount (spec 7) + - metric coverage unresolved — not exactly idle_days aligned daily buckets (spec 8.4) + - aggregate request count > 0 (spec 9.1.7, 9.2.8) + +Detection (legacy Featurestore): + - state == "STABLE" + - legacy_online_serving_mode is "fixed" or "autoscaled" + - reference_time_utc <= evaluation_window_start_utc + - metric_coverage_state == "full_window" and 
telemetry_state == "confirmed_zero" + +Detection (Bigtable-backed FeatureOnlineStore): + - state == "STABLE" + - storage type is Bigtable (bigtable key present, optimized key absent) + - bigtable_min_node_count >= 1 and bigtable_max_node_count >= bigtable_min_node_count + - reference_time_utc <= evaluation_window_start_utc + - metric_coverage_state == "full_window" and telemetry_state == "confirmed_zero" + +Cost model (spec 3.5, 10.1): + estimated_monthly_cost_usd = None + Pricing varies by backing, region, node count, and commitment model; + no flat estimate is appropriate. + +APIs: + - aiplatform.googleapis.com/v1: projects/{project}/locations/{loc}/featurestores + - aiplatform.googleapis.com/v1: projects/{project}/locations/{loc}/featureOnlineStores + - monitoring.googleapis.com: aiplatform.googleapis.com/featurestore/online_serving/request_count + - monitoring.googleapis.com: aiplatform.googleapis.com/featureonlinestore/online_serving/request_count +""" + import warnings from datetime import datetime, timedelta, timezone from typing import List, Optional +from google.api import metric_pb2 from google.auth.transport.requests import AuthorizedSession from google.cloud import monitoring_v3 from google.protobuf import duration_pb2, timestamp_pb2 @@ -11,6 +68,9 @@ from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +# Integer sentinel for DELTA metric kind (google.api.metric_pb2.MetricDescriptor.MetricKind.DELTA) +_METRIC_KIND_DELTA: int = int(metric_pb2.MetricDescriptor.MetricKind.DELTA) + RULE_METADATA = { "id": "gcp.vertex.featurestore.idle", "category": "ai", @@ -18,26 +78,23 @@ "cost_impact": "high", } -# Default idle window — 30 days without any online serving requests = confidently idle. -# Longer than other rules because feature stores are sometimes used in periodic batch -# workflows with sparse online inference (e.g., weekly recommendation refreshes). 
+# Default idle window (spec 6.3) _DEFAULT_IDLE_DAYS = 30 -# Legacy featurestore: cost per Bigtable node (us-central1, on-demand, SSD-backed). -# A fixedNodeCount=1 store bills at $0.27/hr continuously. -_BIGTABLE_NODE_HOURLY_COST = 0.27 # published GCP rate +# Canonical alignment period: one full UTC day (spec 8.3) +_ALIGNMENT_PERIOD_SECONDS = 86400 -# New featureOnlineStore (Optimized / BigQuery-backed): no per-node billing; -# costs arise from storage and query compute — conservative flat estimate. -_OPTIMIZED_STORE_MONTHLY_COST = 100.0 # [est] conservative — actual varies by storage/queries +# Monitoring metric types (spec 8.1) +_LEGACY_METRIC = "aiplatform.googleapis.com/featurestore/online_serving/request_count" +_NEW_METRIC = "aiplatform.googleapis.com/featureonlinestore/online_serving/request_count" -_HOURS_PER_MONTH = 730.0 +# Monitored resource types (spec 8.1) +_LEGACY_RESOURCE_TYPE = "aiplatform.googleapis.com/Featurestore" +_NEW_RESOURCE_TYPE = "aiplatform.googleapis.com/FeatureOnlineStore" -# Monitoring metric names -_LEGACY_REQUEST_COUNT_METRIC = "aiplatform.googleapis.com/featurestore/online_serving/request_count" -_NEW_REQUEST_COUNT_METRIC = ( - "aiplatform.googleapis.com/featureonlinestore/online_serving/request_count" -) +# Resource ID labels on the monitored resource (spec 8.1) +_LEGACY_ID_LABEL = "featurestore_id" +_NEW_ID_LABEL = "feature_online_store_id" # Known Vertex AI Feature Store locations. Used as fallback when the locations/- # wildcard returns 400 (Feature Store APIs do not support the wildcard in all projects). @@ -66,11 +123,6 @@ "me-west1", ] -# Feature Store states that indicate the store is active and incurring charges. -# STABLE is the normal operating state for both legacy featurestores and Feature -# Online Stores. UPDATING is excluded — in-flight updates don't indicate idleness. 
-_ACTIVE_STATES = {"STABLE"} - def _parse_location(name: str) -> Optional[str]: """Extract region from resource name: projects/.../locations/{region}/...""" @@ -86,25 +138,170 @@ def _parse_resource_id(name: str) -> str: return name.rsplit("/", 1)[-1] if name else "" -def _age_days(create_str: str, now: datetime) -> Optional[float]: - """Parse createTime ISO string and return age in days. Returns None on failure.""" - if not create_str: +def _parse_rfc3339(ts: str) -> Optional[datetime]: + """Parse an RFC3339 timestamp, normalize to UTC-aware datetime, or return None.""" + if not ts: return None try: - dt = datetime.fromisoformat(create_str.replace("Z", "+00:00")) - if dt.tzinfo is None: - dt = dt.replace(tzinfo=timezone.utc) - return (now - dt).total_seconds() / 86400 + dt = datetime.fromisoformat(ts.replace("Z", "+00:00")) + dt = dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc) + # spec 7: all timestamps must be normalized to timezone-aware UTC before comparison + return dt.astimezone(timezone.utc) except (ValueError, AttributeError): return None +def _resolve_reference_time(create_str: str, update_str: str, now: datetime) -> Optional[datetime]: + """ + Resolve reference_time_utc = max(createTime, updateTime) (spec 7). + + Future timestamps are discarded before the max. Returns None when neither + timestamp is parseable or both resolve to future values. 
+ """ + create_time = _parse_rfc3339(create_str) + update_time = _parse_rfc3339(update_str) + + if create_time and create_time > now: + create_time = None + if update_time and update_time > now: + update_time = None + + if create_time and update_time: + return max(create_time, update_time) + return create_time or update_time + + +def _query_store_activity( + client: monitoring_v3.MetricServiceClient, + project_id: str, + store_id: str, + region: str, + metric_type: str, + resource_type: str, + id_label: str, + window_start: datetime, + window_end: datetime, + idle_days: int, +) -> str: + """ + Query the canonical request-count metric for a single store over the full window. + + Applies the exact filter required by spec 8.2: metric.type, resource.type, + resource.labels.location, and the family-specific store ID label. + + Validates coverage per spec 8.4: + - exactly one reduced series must remain + - exactly idle_days aligned daily datapoints + - each datapoint has a valid numeric value and no future timestamp + - no gap between adjacent datapoints exceeds the alignment period + + Returns: + "confirmed_zero" — full window, exactly idle_days buckets, total == 0 + "positive_activity" — full coverage and aggregate total > 0 + "unresolved" — any coverage constraint violated + + Raises: + Any exception from the Monitoring RPC layer (network, permission, etc.) + propagates to the caller so it can surface family-level visibility (spec 11.4). 
+ """ + start_ts = timestamp_pb2.Timestamp() + start_ts.FromDatetime(window_start) + end_ts = timestamp_pb2.Timestamp() + end_ts.FromDatetime(window_end) + + interval = monitoring_v3.TimeInterval(start_time=start_ts, end_time=end_ts) + + # spec 8.2: exact filter on all four required dimensions + filter_str = ( + f'metric.type="{metric_type}"' + f' AND resource.type="{resource_type}"' + f' AND resource.labels.location="{region}"' + f' AND resource.labels.{id_label}="{store_id}"' + ) + + results = list( + client.list_time_series( + request={ + "name": f"projects/{project_id}", + "filter": filter_str, + "interval": interval, + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + "aggregation": monitoring_v3.Aggregation( + alignment_period=duration_pb2.Duration(seconds=_ALIGNMENT_PERIOD_SECONDS), + per_series_aligner=monitoring_v3.Aggregation.Aligner.ALIGN_SUM, + cross_series_reducer=monitoring_v3.Aggregation.Reducer.REDUCE_SUM, + group_by_fields=[f"resource.labels.{id_label}"], + ), + } + ) + ) + + # spec 8.4 point 1: exactly 1 reduced series + if len(results) != 1: + return "unresolved" + + # spec 8.3 point 5: metric kind must resolve to DELTA + if results[0].metric_kind != _METRIC_KIND_DELTA: + return "unresolved" + + points = list(results[0].points) + + # spec 8.4 point 2: exactly idle_days aligned datapoints + if len(points) != idle_days: + return "unresolved" + + # spec 8.4 point 3: pre-compute expected daily bucket end times as whole-second Unix + # timestamps. Cloud Monitoring returns ts.seconds with nanos=0; comparing against + # datetime objects derived from a sub-second-precise window_start would cause spurious + # mismatches. Integer-second comparison is both correct and tolerant of tiny variance. 
+ _ws_secs = int(window_start.timestamp()) + expected_bucket_end_seconds: frozenset = frozenset( + _ws_secs + n * _ALIGNMENT_PERIOD_SECONDS for n in range(1, idle_days + 1) + ) + + total = 0.0 + timestamps = [] + seen_bucket_seconds: set = set() + + for point in points: + # spec 8.4 point 3/4: each point must map to exactly one documented bucket end + try: + ts = point.interval.end_time + point_dt = datetime.fromtimestamp(ts.seconds + ts.nanos / 1e9, tz=timezone.utc) + # whole-second membership check; duplicate seconds → same bucket twice + if ts.seconds not in expected_bucket_end_seconds or ts.seconds in seen_bucket_seconds: + return "unresolved" + seen_bucket_seconds.add(ts.seconds) + timestamps.append(point_dt) + except Exception: + return "unresolved" + + # spec 8.4 point 5: valid numeric value — WhichOneof dispatch (0 is falsy) + which = point.value.WhichOneof("value") + if which == "int64_value": + val = float(point.value.int64_value) + elif which == "double_value": + val = float(point.value.double_value) + else: + return "unresolved" + + total += val + + # spec 8.4 point 6: no gap between adjacent points exceeds alignment period + timestamps.sort() + for i in range(1, len(timestamps)): + if (timestamps[i] - timestamps[i - 1]).total_seconds() > _ALIGNMENT_PERIOD_SECONDS: + return "unresolved" + + return "confirmed_zero" if total == 0.0 else "positive_activity" + + def _list_featurestores(session: AuthorizedSession, project_id: str) -> list: - """List legacy Vertex AI featurestores across all locations. + """ + List legacy Vertex AI featurestores across all locations. - Returns only stores with online serving configured (fixedNodeCount > 0 or - scaling.minNodeCount > 0). Returns [] if the API is not enabled (404). - Raises PermissionError on 403. + Returns all stores (filtering by online-serving capacity happens in the caller). + Returns [] if the API is not enabled (404). Raises PermissionError on 403. 
Tries the locations/- wildcard first; falls back to per-location queries when the wildcard returns 400 (not supported by all projects). @@ -112,7 +309,6 @@ def _list_featurestores(session: AuthorizedSession, project_id: str) -> list: base_url = f"https://aiplatform.googleapis.com/v1/projects/{project_id}/locations" def _paginate_location(location: str) -> Optional[list]: - """Paginate one location. Returns None on 400 (unsupported), [] on 404.""" results = [] params: dict = {"pageSize": 100} while True: @@ -128,24 +324,17 @@ def _paginate_location(location: str) -> Optional[list]: return None # wildcard unsupported — signal caller to try per-location resp.raise_for_status() data = resp.json() - for store in data.get("featurestores", []): - config = store.get("onlineServingConfig") or {} - fixed = config.get("fixedNodeCount", 0) - scaling_min = (config.get("scaling") or {}).get("minNodeCount", 0) - if fixed > 0 or scaling_min > 0: - results.append(store) + results.extend(data.get("featurestores", [])) page_token = data.get("nextPageToken") if not page_token: break params["pageToken"] = page_token return results - # Fast path: wildcard covers all regions in one call sequence result = _paginate_location("-") if result is not None: return result - # Fallback: per-location queries stores: list = [] seen: set = set() for location in _FEATURESTORE_LOCATIONS: @@ -161,17 +350,19 @@ def _paginate_location(location: str) -> Optional[list]: def _list_feature_online_stores(session: AuthorizedSession, project_id: str) -> list: - """List new-generation Vertex AI Feature Online Stores across all locations. + """ + List Vertex AI Feature Online Stores across all locations. - Returns [] if the API returns 404 (not enabled or no stores). Raises PermissionError on 403. + Returns all stores (filtering by storage type happens in the caller). + Returns [] if the API returns 404 (not enabled or no stores). + Raises PermissionError on 403. 
Tries the locations/- wildcard first; falls back to per-location queries - when the wildcard returns 400 (not supported by all projects). + when the wildcard returns 400. """ base_url = f"https://aiplatform.googleapis.com/v1/projects/{project_id}/locations" def _paginate_location(location: str) -> Optional[list]: - """Paginate one location. Returns None on 400 (unsupported), [] on 404.""" results = [] params: dict = {"pageSize": 100} while True: @@ -194,12 +385,10 @@ def _paginate_location(location: str) -> Optional[list]: params["pageToken"] = page_token return results - # Fast path: wildcard covers all regions in one call sequence result = _paginate_location("-") if result is not None: return result - # Fallback: per-location queries stores: list = [] seen: set = set() for location in _FEATURESTORE_LOCATIONS: @@ -214,61 +403,6 @@ def _paginate_location(location: str) -> Optional[list]: return stores -def _fetch_request_counts( - credentials, - project_id: str, - idle_days: int, - metric: str, - id_label: str, -) -> dict[str, int]: - """Fetch total online serving request counts per store over the past idle_days days. - - Returns store_id → total_request_count. Returns {} on any error (monitoring optional). 
- """ - try: - client = monitoring_v3.MetricServiceClient(credentials=credentials) - now = datetime.now(timezone.utc) - start = now - timedelta(days=idle_days) - interval = monitoring_v3.TimeInterval( - start_time=timestamp_pb2.Timestamp(seconds=int(start.timestamp())), - end_time=timestamp_pb2.Timestamp(seconds=int(now.timestamp())), - ) - results = client.list_time_series( - request={ - "name": f"projects/{project_id}", - "filter": f'metric.type="{metric}"', - "interval": interval, - "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, - "aggregation": monitoring_v3.Aggregation( - alignment_period=duration_pb2.Duration(seconds=86400), # 1-day buckets - per_series_aligner=monitoring_v3.Aggregation.Aligner.ALIGN_SUM, - cross_series_reducer=monitoring_v3.Aggregation.Reducer.REDUCE_SUM, - group_by_fields=[f"resource.labels.{id_label}"], - ), - } - ) - counts: dict[str, int] = {} - for ts in results: - store_id = ts.resource.labels.get(id_label, "") - if not store_id: - continue - if not ts.points: - # No data points — metric exists but no observations in window. - # Skip rather than treating as zero to avoid HIGH-confidence - # false positives when telemetry is absent. - continue - total = sum(int(p.value.int64_value) for p in ts.points) - counts[store_id] = counts.get(store_id, 0) + total - return counts - except Exception as e: - warnings.warn( - f"gcp.vertex.featurestore.idle: monitoring query failed for {metric} " - f"({type(e).__name__}: {e}) — falling back to age-based detection", - stacklevel=2, - ) - return {} - - def find_idle_featurestores( *, project_id: str, @@ -277,35 +411,50 @@ def find_idle_featurestores( idle_days: int = _DEFAULT_IDLE_DAYS, ) -> List[Finding]: """ - Find Vertex AI Feature Store online stores with no serving activity for an extended period. 
- - Legacy featurestores and Bigtable-backed Feature Online Stores incur Bigtable compute - charges continuously while in STABLE state, regardless of whether any ReadFeatureValues - requests are made. A single-node legacy store costs ~$197/month; a 3-node HA store - costs ~$591/month. Optimized (BigQuery-backed) Feature Online Stores incur storage and - query compute charges instead of per-node billing (~$100+/month estimated). - These stores are often left running after a project winds down or a model is retired. - - Detection logic: - - Lists legacy Vertex AI featurestores with online serving configured (fixedNodeCount > 0 - or scaling.minNodeCount > 0) and new-generation Feature Online Stores via the Vertex AI - REST API (wildcard location with per-region fallback) - - Queries Cloud Monitoring for total online_serving/request_count over idle_days days - - Stores with zero requests are flagged as idle (HIGH confidence) - - If monitoring data is unavailable, stores older than idle_days are flagged - based on age alone (LOW confidence — heuristic: existence duration only) + Find Vertex AI feature stores with zero online-serving requests for idle_days days. + + Emits findings only when documented provisioned online-serving capacity is present + and the canonical request-count metric confirms exactly zero activity for the full + aligned observation window (exactly idle_days daily aligned buckets, spec 8.4). + + No age-only or monitoring-absent fallback is used (spec 8.5). 
IAM permissions required: - aiplatform.featurestores.list (roles/aiplatform.viewer) - aiplatform.featureOnlineStores.list (roles/aiplatform.viewer) - - monitoring.timeSeries.list (roles/monitoring.viewer) — optional; fallback to age + - monitoring.timeSeries.list (roles/monitoring.viewer) """ idle_days = max(1, idle_days) session = AuthorizedSession(credentials) - now = datetime.now(timezone.utc) + # Truncate to whole seconds so window boundaries are exact UTC instants with no + # sub-second component. This ensures int(window_start.timestamp()) is lossless and + # bucket boundaries in _query_store_activity match the spec's exact definition. + now = datetime.now(timezone.utc).replace(microsecond=0) + + # spec 6.3 / 2.1: evaluation window + window_end = now + window_start = window_end - timedelta(seconds=idle_days * _ALIGNMENT_PERIOD_SECONDS) + findings: List[Finding] = [] - # --- Legacy featurestores --- + # Create monitoring client once; skip all per-store queries if creation fails. + # Emit a warning so the failure is operationally visible (spec 11.4). 
+ try: + monitoring_client: Optional[monitoring_v3.MetricServiceClient] = ( + monitoring_v3.MetricServiceClient(credentials=credentials) + ) + except Exception as e: + warnings.warn( + f"gcp.vertex.featurestore.idle: monitoring client creation failed " + f"({type(e).__name__}: {e}) — all stores will be skipped (no age-only fallback)", + UserWarning, + stacklevel=2, + ) + monitoring_client = None + + # ------------------------------------------------------------------------- + # Legacy featurestores (spec 9.1) + # ------------------------------------------------------------------------- legacy_stores: list = [] try: legacy_stores = _list_featurestores(session, project_id) @@ -318,131 +467,170 @@ def find_idle_featurestores( stacklevel=2, ) - legacy_counts = _fetch_request_counts( - credentials, - project_id, - idle_days, - _LEGACY_REQUEST_COUNT_METRIC, - "featurestore_id", - ) - for store in legacy_stores: + # spec 7: resource name and store ID must be present name = store.get("name", "") + if not name: + continue store_id = _parse_resource_id(name) - region = _parse_location(name) or "unknown" + if not store_id: + continue - if region_filter and not region.startswith(region_filter): + # spec 7: region must be parseable from the resource name + region = _parse_location(name) + if not region: continue - state = store.get("state", "") - if state not in _ACTIVE_STATES: + # spec 4.4: exact region filter match + if region_filter and region != region_filter: continue - config = store.get("onlineServingConfig") or {} - fixed_nodes = config.get("fixedNodeCount", 0) - scaling_min = (config.get("scaling") or {}).get("minNodeCount", 0) - node_count = fixed_nodes if fixed_nodes > 0 else scaling_min - is_autoscaled = fixed_nodes == 0 and scaling_min > 0 - hourly = _BIGTABLE_NODE_HOURLY_COST * node_count - monthly = hourly * _HOURS_PER_MONTH - - create_str = store.get("createTime", "") - age = _age_days(create_str, now) - - request_count = legacy_counts.get(store_id) - - if 
request_count is not None: - if request_count > 0: - continue # Active — skip - confidence = ConfidenceLevel.HIGH - idle_signal = f"0 ReadFeatureValues requests over {idle_days}d (monitoring confirmed)" - elif age is not None and age >= idle_days: - confidence = ConfidenceLevel.LOW - idle_signal = ( - f"no monitoring data; store has been STABLE for {age:.0f}d " - f"(heuristic: age only — request activity unknown)" - ) + # spec 9.1.2: only STABLE state + if store.get("state") != "STABLE": + continue + + # spec 7: reference_time = max(createTime, updateTime) + reference_time = _resolve_reference_time( + store.get("createTime", ""), store.get("updateTime", ""), now + ) + if reference_time is None: + continue + + # spec 4.7: full observation window must be coverable + if reference_time > window_start: + continue + + # spec 7: resolve legacy_online_serving_mode — malformed config skips (spec 11.3) + try: + config = store.get("onlineServingConfig") or {} + fixed_nodes = int(config.get("fixedNodeCount") or 0) + scaling = config.get("scaling") or {} + scaling_min = int(scaling.get("minNodeCount") or 0) + except (TypeError, ValueError, AttributeError): + continue + + if fixed_nodes > 0 and scaling_min > 0: + # spec 7: invalid mode — both materially present + continue + elif fixed_nodes > 0: + serving_mode = "fixed" + provisioned_nodes = fixed_nodes + elif scaling_min > 0: + serving_mode = "autoscaled" + provisioned_nodes = scaling_min else: + # no provisioned online-serving capacity + continue + + # spec 8.5: no age-only or monitoring-absent fallback + if monitoring_client is None: continue - risk = RiskLevel.HIGH if confidence == ConfidenceLevel.HIGH else RiskLevel.MEDIUM + # spec 8.2–8.4: per-store monitoring query with full coverage validation. + # RPC/network failures propagate as a warning and skip the store (spec 11.4). 
+ try: + telemetry = _query_store_activity( + monitoring_client, + project_id, + store_id, + region, + _LEGACY_METRIC, + _LEGACY_RESOURCE_TYPE, + _LEGACY_ID_LABEL, + window_start, + window_end, + idle_days, + ) + except Exception as e: + warnings.warn( + f"gcp.vertex.featurestore.idle: monitoring query failed for " + f"legacy store '{store_id}' ({type(e).__name__}: {e})", + UserWarning, + stacklevel=2, + ) + continue - display_name = (store.get("displayName") or "").strip() or store_id - age_str = f"{age:.1f}d" if age is not None else "unknown" + if telemetry != "confirmed_zero": + continue # positive_activity or unresolved — neither emits - node_label = ( - f"{node_count} Bigtable node{'s' if node_count != 1 else ''} (autoscaled min)" - if is_autoscaled - else f"{node_count} Bigtable node{'s' if node_count != 1 else ''} (fixed)" - ) - signals = [ - f"Store state: STABLE (billable) — age: {age_str}", - f"Idle signal: {idle_signal}", - f"Online serving config: {node_label}", - f"Burn rate: ~${hourly:.2f}/hr (~${monthly:,.0f}/mo, {node_count} node{'s' if node_count != 1 else ''} × ${_BIGTABLE_NODE_HOURLY_COST}/hr)", + # --- All conditions met: emit finding --- + + signals_used = [ + "Resource family: Vertex AI Feature Store (Legacy)", + "State: STABLE", + f"Region: {region}", + f"Reference time (max(createTime, updateTime)): {reference_time.isoformat()}", + f"Idle window: {idle_days} days (full window, exactly {idle_days} aligned daily buckets confirmed)", + f"Serving mode: {serving_mode}, provisioned node floor: {provisioned_nodes}", + f"Metric: {_LEGACY_METRIC}", + "Aggregate request count over full window: 0", ] - not_checked = [ - "Periodic or low-frequency batch workflows that query less often than the idle window", - "Feature stores accessed by pipelines running on a schedule (e.g. 
weekly)", - "Committed use discounts — actual cost may be lower", + signals_not_checked = [ + "Periodic or low-frequency batch workflows with access frequency below the idle window", + "Feature stores accessed by scheduled pipelines (e.g. weekly jobs)", + "Offline feature generation, sync, or BigQuery-backed workflows", "Stores intentionally kept warm for latency-sensitive cold-start mitigation", ] - evidence = Evidence( - signals_used=signals, - signals_not_checked=not_checked, - time_window=f"{idle_days}d", - ) + details: dict = { + "store_name": name, + "store_id": store_id, + "store_family": "legacy_featurestore", + "state": "STABLE", + "region": region, + "reference_time": reference_time.isoformat(), + "idle_days_threshold": idle_days, + "legacy_serving_mode": serving_mode, + "provisioned_node_floor": provisioned_nodes, + "metric_type": _LEGACY_METRIC, + "metric_coverage_state": "full_window", + "telemetry_state": "confirmed_zero", + "request_count_total": 0, + } + if serving_mode == "fixed": + details["fixed_node_count"] = fixed_nodes + else: + details["scaling_min_node_count"] = scaling_min findings.append( Finding( provider="gcp", rule_id="gcp.vertex.featurestore.idle", resource_type="gcp.vertex.featurestore", - resource_id=name or store_id, + resource_id=name, region=region, - title=f"Idle Vertex AI Feature Store ({node_count} node{'s' if node_count != 1 else ''})", + title=( + f"Idle Vertex AI Feature Store (Legacy, " + f"{provisioned_nodes} node{'s' if provisioned_nodes != 1 else ''})" + ), summary=( - f"Vertex AI Feature Store '{display_name}' has had no online serving " - f"requests for at least {idle_days} days while maintaining " - f"{node_count} Bigtable node{'s' if node_count != 1 else ''}, " - f"costing ~${hourly:.2f}/hr (~${monthly:,.0f}/mo)." 
+ f"Legacy Vertex AI Feature Store '{store_id}' ({serving_mode}, " + f"{provisioned_nodes} node{'s' if provisioned_nodes != 1 else ''}) " + f"in region '{region}' shows zero online-serving requests " + f"over {idle_days} days." ), reason=( - ( - f"Feature Store in STABLE state with zero ReadFeatureValues requests " - f"for ≥{idle_days} days" - ) - if request_count is not None - else ( - f"Feature Store in STABLE state for ≥{idle_days} days " - f"(heuristic: age only — no request data available)" - ) + f"Aggregate online-serving request count == 0 over the {idle_days}-day " + f"observation window ({_LEGACY_METRIC})" ), - risk=risk, - confidence=confidence, + risk=RiskLevel.HIGH, + confidence=ConfidenceLevel.HIGH, detected_at=now, - evidence=evidence, - estimated_monthly_cost_usd=round(monthly, 2), - details={ - "store_name": name, - "store_id": store_id, - "store_type": "legacy_featurestore", - "region": region, - "bigtable_node_count": node_count, - "bigtable_scaling": "autoscaled" if is_autoscaled else "fixed", - "age_days": round(age, 1) if age is not None else None, - "request_count": request_count, - "idle_days_threshold": idle_days, - "hourly_cost_usd": round(hourly, 4), - "pricing_confidence": "published", - "pricing_scope": "us_central1_reference", - }, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=signals_not_checked, + time_window=f"{idle_days} days", + ), + details=details, + # spec 10.1: always None — pricing varies by backing, region, and commitment + estimated_monthly_cost_usd=None, ) ) - # --- New featureOnlineStores --- + # ------------------------------------------------------------------------- + # Bigtable-backed FeatureOnlineStores (spec 9.2) + # ------------------------------------------------------------------------- new_stores: list = [] try: new_stores = _list_feature_online_stores(session, project_id) @@ -455,138 +643,166 @@ def find_idle_featurestores( stacklevel=2, ) - new_counts = _fetch_request_counts( - 
credentials, - project_id, - idle_days, - _NEW_REQUEST_COUNT_METRIC, - "feature_online_store_id", - ) - for store in new_stores: + # spec 7: resource name and store ID must be present name = store.get("name", "") + if not name: + continue store_id = _parse_resource_id(name) - region = _parse_location(name) or "unknown" + if not store_id: + continue - if region_filter and not region.startswith(region_filter): + # spec 7: region must be parseable from the resource name + region = _parse_location(name) + if not region: continue - state = store.get("state", "") - if state not in _ACTIVE_STATES: + # spec 4.4: exact region filter match + if region_filter and region != region_filter: continue - # Determine backing type and cost - bigtable_config = store.get("bigtable") or {} - is_optimized = store.get("optimized") is not None - autoscaling = bigtable_config.get("autoScaling") or {} - min_nodes = autoscaling.get("minNodeCount", 0) - - if is_optimized: - # Optimized (BigQuery-backed) — flat estimate; no Bigtable node charges - hourly = _OPTIMIZED_STORE_MONTHLY_COST / _HOURS_PER_MONTH - monthly = _OPTIMIZED_STORE_MONTHLY_COST - backing_label = "Optimized (BigQuery-backed)" - pricing_confidence = "estimated" - elif min_nodes > 0: - hourly = _BIGTABLE_NODE_HOURLY_COST * min_nodes - monthly = hourly * _HOURS_PER_MONTH - backing_label = f"{min_nodes} Bigtable node{'s' if min_nodes != 1 else ''} (min)" - pricing_confidence = "published" - else: - # Unknown backing — still STABLE but can't estimate cost accurately - hourly = _BIGTABLE_NODE_HOURLY_COST # conservative single-node floor - monthly = hourly * _HOURS_PER_MONTH - backing_label = "unknown backing" - pricing_confidence = "estimated" + # spec 9.2.2: only STABLE state + if store.get("state") != "STABLE": + continue - create_str = store.get("createTime", "") - age = _age_days(create_str, now) + # spec 7: reference_time = max(createTime, updateTime) + reference_time = _resolve_reference_time( + store.get("createTime", ""), 
store.get("updateTime", ""), now + ) + if reference_time is None: + continue - request_count = new_counts.get(store_id) + # spec 4.7: full observation window must be coverable + if reference_time > window_start: + continue - if request_count is not None: - if request_count > 0: + # spec 9.2.5, 9.3: storage type must be exactly Bigtable; optimized is out of scope + has_bigtable = "bigtable" in store + has_optimized = "optimized" in store + + if not has_bigtable or has_optimized: + # neither-present or both-present → unusable union; optimized → out of scope + continue + + # spec 7: bigtable.autoScaling must be present and structurally usable (spec 11.3) + try: + bigtable_config = store.get("bigtable") or {} + autoscaling = bigtable_config.get("autoScaling") + if not autoscaling: continue - confidence = ConfidenceLevel.HIGH - idle_signal = f"0 serving requests over {idle_days}d (monitoring confirmed)" - elif age is not None and age >= idle_days: - confidence = ConfidenceLevel.LOW - idle_signal = ( - f"no monitoring data; store has been STABLE for {age:.0f}d " - f"(heuristic: age only — request activity unknown)" + min_nodes = int(autoscaling.get("minNodeCount") or 0) + max_nodes = int(autoscaling.get("maxNodeCount") or 0) + except (TypeError, ValueError, AttributeError): + continue + + # spec 7: min >= 1 and max >= min + if min_nodes < 1 or max_nodes < min_nodes: + continue + + # spec 8.5: no age-only or monitoring-absent fallback + if monitoring_client is None: + continue + + # spec 8.2–8.4: per-store monitoring query with full coverage validation. + # RPC/network failures propagate as a warning and skip the store (spec 11.4). 
+ try: + telemetry = _query_store_activity( + monitoring_client, + project_id, + store_id, + region, + _NEW_METRIC, + _NEW_RESOURCE_TYPE, + _NEW_ID_LABEL, + window_start, + window_end, + idle_days, + ) + except Exception as e: + warnings.warn( + f"gcp.vertex.featurestore.idle: monitoring query failed for " + f"feature online store '{store_id}' ({type(e).__name__}: {e})", + UserWarning, + stacklevel=2, ) - else: continue - risk = RiskLevel.HIGH if confidence == ConfidenceLevel.HIGH else RiskLevel.MEDIUM + if telemetry != "confirmed_zero": + continue # positive_activity or unresolved — neither emits - display_name = (store.get("displayName") or "").strip() or store_id - age_str = f"{age:.1f}d" if age is not None else "unknown" + # --- All conditions met: emit finding --- - signals = [ - f"Store state: STABLE (billable) — age: {age_str}", - f"Idle signal: {idle_signal}", - f"Backing: {backing_label}", - f"Burn rate: ~${hourly:.2f}/hr (~${monthly:,.0f}/mo)", + signals_used = [ + "Resource family: Vertex AI Feature Online Store (Bigtable-backed)", + "State: STABLE", + f"Region: {region}", + f"Reference time (max(createTime, updateTime)): {reference_time.isoformat()}", + f"Idle window: {idle_days} days (full window, exactly {idle_days} aligned daily buckets confirmed)", + f"Bigtable autoscaling: minNodeCount={min_nodes}, maxNodeCount={max_nodes}", + f"Metric: {_NEW_METRIC}", + "Aggregate request count over full window: 0", ] - not_checked = [ - "Periodic or low-frequency batch workflows that query less often than the idle window", - "Feature stores accessed by pipelines running on a schedule (e.g. weekly)", - "Optimized stores — cost estimate is conservative; actual cost depends on storage size and query volume", - "Committed use discounts — actual cost may be lower", + signals_not_checked = [ + "Periodic or low-frequency batch workflows with access frequency below the idle window", + "Feature stores accessed by scheduled pipelines (e.g. 
weekly jobs)", + "Offline feature generation, sync, or BigQuery-backed workflows", + "Stores intentionally kept warm for latency-sensitive cold-start mitigation", ] - evidence = Evidence( - signals_used=signals, - signals_not_checked=not_checked, - time_window=f"{idle_days}d", - ) + details: dict = { + "store_name": name, + "store_id": store_id, + "store_family": "feature_online_store", + "state": "STABLE", + "region": region, + "reference_time": reference_time.isoformat(), + "idle_days_threshold": idle_days, + "storage_type": "bigtable", + "bigtable_min_node_count": min_nodes, + "bigtable_max_node_count": max_nodes, + "metric_type": _NEW_METRIC, + "metric_coverage_state": "full_window", + "telemetry_state": "confirmed_zero", + "request_count_total": 0, + } findings.append( Finding( provider="gcp", rule_id="gcp.vertex.featurestore.idle", resource_type="gcp.vertex.feature_online_store", - resource_id=name or store_id, + resource_id=name, region=region, - title=f"Idle Vertex AI Feature Online Store ({backing_label})", + title=( + f"Idle Vertex AI Feature Online Store " + f"(Bigtable, min {min_nodes} node{'s' if min_nodes != 1 else ''})" + ), summary=( - f"Vertex AI Feature Online Store '{display_name}' ({backing_label}) " - f"has had no serving requests for at least {idle_days} days, " - f"costing ~${hourly:.2f}/hr (~${monthly:,.0f}/mo)." + f"Vertex AI Feature Online Store '{store_id}' " + f"(Bigtable, min {min_nodes} node{'s' if min_nodes != 1 else ''}) " + f"in region '{region}' shows zero online-serving requests " + f"over {idle_days} days." 
), reason=( - ( - f"Feature Online Store in STABLE state with zero serving requests " - f"for ≥{idle_days} days" - ) - if request_count is not None - else ( - f"Feature Online Store in STABLE state for ≥{idle_days} days " - f"(heuristic: age only — no request data available)" - ) + f"Aggregate online-serving request count == 0 over the {idle_days}-day " + f"observation window ({_NEW_METRIC})" ), - risk=risk, - confidence=confidence, + risk=RiskLevel.HIGH, + confidence=ConfidenceLevel.HIGH, detected_at=now, - evidence=evidence, - estimated_monthly_cost_usd=round(monthly, 2), - details={ - "store_name": name, - "store_id": store_id, - "store_type": "feature_online_store", - "backing": "optimized" if is_optimized else "bigtable", - "region": region, - "bigtable_min_nodes": min_nodes if not is_optimized else None, - "age_days": round(age, 1) if age is not None else None, - "request_count": request_count, - "idle_days_threshold": idle_days, - "hourly_cost_usd": round(hourly, 4), - "pricing_confidence": pricing_confidence, - "pricing_scope": "us_central1_reference", - }, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=signals_not_checked, + time_window=f"{idle_days} days", + ), + details=details, + # spec 10.1: always None — pricing varies by backing, region, and commitment + estimated_monthly_cost_usd=None, ) ) return findings + + +find_idle_featurestores.RULE_ID = "gcp.vertex.featurestore.idle" diff --git a/cleancloud/providers/gcp/rules/ai/tpu_idle.py b/cleancloud/providers/gcp/rules/ai/tpu_idle.py index 0242df7..e1fdd3c 100644 --- a/cleancloud/providers/gcp/rules/ai/tpu_idle.py +++ b/cleancloud/providers/gcp/rules/ai/tpu_idle.py @@ -1,16 +1,70 @@ +""" +Rule: gcp.tpu.idle + + (spec — docs/specs/gcp/ai/tpu_idle.md) + +Intent: + Detect standalone Cloud TPU Nodes in documented billable READY state that + show no observed accelerator-processing activity above a conservative + threshold over a buffered review window, using documented Cloud 
Monitoring + duty-cycle telemetry. + + This is a precision-first review-candidate rule. It is not proof that the + TPU-backed job is abandoned, not proof the node is safe to stop or delete, + and not proof of a specific monthly saving. + +Covered resource families: + - Cloud TPU Node (projects.locations.nodes, TPU v2 REST API) + +Exclusions: + - node name malformed, node ID or zone absent / unresolvable (spec 7) + - region filter set and derived region does not exactly match (spec 7) + - state not exactly READY (spec 3.1, 9) + - createTime absent, unparsable, future, or node younger than full buffered + window — create_time_utc > evaluation_window_start_utc (spec 7, 9) + - queuedResource non-empty string — queued-resource-managed node (spec 3.5, 9) + - multisliceNode == true — multislice node (spec 3.5, 9) + - malformed queuedResource (non-string/non-null) or multisliceNode + (non-bool/non-null) (spec 7) + - monitoring client creation failure — all nodes skip; no age-only fallback + (spec 8.6, 11.2) + - monitoring query failure for a node — that node skips, warning issued + (spec 11.1) + - telemetry join state not confirmed "complete" (spec 8.3, 9) — currently + always the case; see Current status below + +Detection (pre-checks currently applied; emission blocked — see Current status): + - state == "READY" + - queuedResource absent/empty and not malformed + - multisliceNode != true and not malformed + - create_time_utc <= evaluation_window_start_utc + - telemetry join state confirmed "complete" (spec 8.3) [blocked — see Current status] + +Current status — join barrier (spec 8.3): + The duty-cycle metric (tpu.googleapis.com/accelerator/duty_cycle) is + published on the tpu.googleapis.com/GceTpuWorker monitored resource with + labels resource_container, location, and worker_id. These labels do not + include a TPU Node name. No documented first-party Google Cloud surface maps + worker_id to the owning TPU Node, so telemetry_join_state cannot be proven + "complete". 
The rule currently emits no findings. When Google publishes a + documented worker-to-node identity surface, implement the join in + _run_zone_diagnostic(). + +Cost model (spec 3.2, 10.1): + estimated_monthly_cost_usd = None + Pricing varies by TPU type, region, and usage option (on-demand, spot, + committed-use); no flat estimate is appropriate. + +APIs: + - tpu.googleapis.com/v2: projects/{project}/locations/-/nodes + - monitoring.googleapis.com: tpu.googleapis.com/accelerator/duty_cycle + on tpu.googleapis.com/GceTpuWorker +""" + import warnings from datetime import datetime, timedelta, timezone from typing import List, Optional -from google.api_core.exceptions import ( - BadGateway, - DeadlineExceeded, - GatewayTimeout, - InternalServerError, - ResourceExhausted, - ServiceUnavailable, - TooManyRequests, -) from google.auth.transport.requests import AuthorizedSession from google.cloud import monitoring_v3 from google.protobuf import duration_pb2, timestamp_pb2 @@ -30,37 +84,20 @@ # Default idle window — 7 days of near-zero duty_cycle = confidently idle _DEFAULT_IDLE_DAYS = 7 -# duty_cycle fraction at or below which a node is considered idle. -# 2% allows for brief health-check spikes without masking genuine utilization. -# max() is used (not mean/p95) so that any single active sample keeps the node -# out of the idle bucket — this avoids flagging intermittently-used nodes. -_DUTY_CYCLE_IDLE_THRESHOLD = 0.02 - -# TPU node states that incur compute charges. -# NOTE: If GCP adds new billable states (e.g. HIBERNATED), update this set. -# Source: https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm -_BILLABLE_STATES = {"READY"} - -# Per-chip hourly cost — us-central1 on-demand reference rates. -# All values are per chip-hour. Actual cost varies by region and commitment. 
-_CHIP_HOURLY_COST: dict[str, float] = { - "V2": 1.50, # $1.50/chip-hr (v2 pod, published GCP rate) - "V3": 2.20, # $2.20/chip-hr (v3 device; v3 pod is $2.00 — use higher) - "V4": 3.22, # $3.22/chip-hr (published GCP rate, us-central1) - "V5LITE_POD": 1.20, # $1.20/chip-hr (TPU v5e litepod, published) - "V5P": 4.20, # $4.20/chip-hr (TPU v5p, published) - "V6E": 2.40, # $2.40/chip-hr [est] — no confirmed published rate as of 2025 -} -_DEFAULT_CHIP_HOURLY_COST = 2.00 # conservative fallback for unknown/future types +# 180-second monitoring visibility buffer (spec 3.4) +_MONITORING_BUFFER_SECONDS = 180 -# Types whose per-chip pricing is estimated (not yet officially published). -_PRICING_ESTIMATED_TYPES = frozenset({"V6E"}) +# Idle threshold in percent units [0,100] (spec 6.4); values are percent not fraction. +# Referenced in the unreachable finding block below; not yet in a live decision path. +_DUTY_CYCLE_THRESHOLD_PCT = 2.0 -# Types where topology-based chip counting is unreliable: for V5+/V6E pod slices, -# topology encodes the full pod shape rather than the per-node slice count. -_TOPOLOGY_UNRELIABLE_TYPES = frozenset({"V5LITE_POD", "V5P", "V6E"}) +# Monitoring alignment period for the non-semantic placeholder aggregation in +# _run_zone_diagnostic; not yet in a live decision path (spec 8.4, 8.5). +_ALIGNMENT_PERIOD_SECONDS = 3600 -_HOURS_PER_MONTH = 730.0 +# Canonical metric and resource type (spec 8.1) +_DUTY_CYCLE_METRIC = "tpu.googleapis.com/accelerator/duty_cycle" +_DUTY_CYCLE_RESOURCE_TYPE = "tpu.googleapis.com/GceTpuWorker" def _parse_location(name: str) -> Optional[str]: @@ -78,23 +115,31 @@ def _parse_node_id(name: str) -> str: return name.rsplit("/", 1)[-1] if name else "" -def _zone_to_region(zone: str) -> str: - """Derive GCP region from zone (e.g. 'us-central1-f' → 'us-central1'). +def _zone_to_region(zone: str) -> Optional[str]: + """Derive GCP region from zone (e.g. 'us-central1-f' -> 'us-central1'). 
- Uses split/join rather than rsplit to handle multi-hyphen region prefixes - (e.g. 'northamerica-northeast1-a' → 'northamerica-northeast1') correctly. - Falls back to the zone itself if it has no hyphen. + Returns None when the zone string has no hyphen and region cannot be derived. """ parts = zone.split("-") - return "-".join(parts[:-1]) if len(parts) > 1 else zone + if len(parts) < 2: + return None + return "-".join(parts[:-1]) -def _tpu_type_from_legacy(accel_type: str) -> str: - """Map legacy acceleratorType string to acceleratorConfig.type key. +def _parse_rfc3339_utc(ts: str) -> Optional[datetime]: + """Parse an RFC3339 timestamp string into a timezone-aware UTC datetime.""" + if not ts: + return None + try: + dt = datetime.fromisoformat(ts.replace("Z", "+00:00")) + dt = dt if dt.tzinfo else dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + except (ValueError, AttributeError): + return None - Examples: "v2-8" → "V2", "v4-8" → "V4", "v5litepod-4" → "V5LITE_POD". - Returns "" if the type is unrecognised. - """ + +def _tpu_type_from_legacy(accel_type: str) -> str: + """Map legacy acceleratorType string to acceleratorConfig.type key (context only).""" lower = accel_type.lower() if lower.startswith("v2"): return "V2" @@ -111,101 +156,7 @@ def _tpu_type_from_legacy(accel_type: str) -> str: return "" -def _chip_count( - accel_type_legacy: str, topology: Optional[str], tpu_type: str = "" -) -> tuple[int, bool]: - """Derive chip count for a TPU node. - - Returns (chips, is_approximate). is_approximate is True when the count was - derived from pod-shape topology for V5+/V6E types (may overstate slice count) - or when no usable information was found at all. - - Priority differs by type: - - V2–V4 (topology is reliable slice geometry): - 1. topology multiplication → exact - 2. legacy acceleratorType suffix → exact - 3. fallback 1 → approximate - - V5+/V6E (topology may encode pod shape, not slice count): - 1. 
legacy acceleratorType suffix → exact (encodes slice count directly) - 2. topology multiplication → approximate (pod shape, may overstate) - 3. fallback 1 → approximate - """ - - def _from_topology(t: str) -> Optional[int]: - try: - parts = [int(x) for x in t.lower().split("x") if x] - count = 1 - for p in parts: - count *= p - return count if count > 0 else None - except (ValueError, AttributeError): - return None - - def _from_legacy(s: str) -> Optional[int]: - try: - return max(1, int(s.rsplit("-", 1)[-1])) - except (ValueError, IndexError): - return None - - if tpu_type in _TOPOLOGY_UNRELIABLE_TYPES: - # Legacy suffix encodes the per-node slice count directly; prefer it. - if accel_type_legacy: - v = _from_legacy(accel_type_legacy) - if v is not None: - return v, False - # Fall back to topology — may reflect pod shape, so mark approximate. - if topology: - v = _from_topology(topology) - if v is not None: - return v, True - else: - # For V2–V4, topology gives the exact slice geometry. - if topology: - v = _from_topology(topology) - if v is not None: - return v, False - if accel_type_legacy: - v = _from_legacy(accel_type_legacy) - if v is not None: - return v, False - - return 1, True - - -def _hourly_cost( - tpu_type: str, chips: int, chip_count_approximate: bool = False -) -> tuple[float, str]: - """Return (hourly_cost_usd, pricing_confidence) for a TPU node. 
- - pricing_confidence is "estimated" when any of: - - The type has no entry in _CHIP_HOURLY_COST (unknown type, uses fallback rate) - - The type is in _PRICING_ESTIMATED_TYPES (rate not officially published) - - chip_count_approximate is True (chip count may be wrong; cost inherits uncertainty) - """ - rate = _CHIP_HOURLY_COST.get(tpu_type, _DEFAULT_CHIP_HOURLY_COST) - estimated = ( - tpu_type not in _CHIP_HOURLY_COST - or tpu_type in _PRICING_ESTIMATED_TYPES - or chip_count_approximate - ) - return rate * chips, ("estimated" if estimated else "published") - - -def _compute_risk(confidence: ConfidenceLevel, hourly_cost: float) -> RiskLevel: - """Map (confidence, hourly_cost) to a RiskLevel. - - HIGH confidence + expensive (≥$10/hr) → CRITICAL - HIGH confidence + cheaper → HIGH - LOW confidence (age-only fallback) → MEDIUM - """ - if confidence == ConfidenceLevel.HIGH: - return RiskLevel.CRITICAL if hourly_cost >= 10.0 else RiskLevel.HIGH - return RiskLevel.MEDIUM - - -def _list_tpu_nodes(session: AuthorizedSession, project_id: str) -> list: +def _list_tpu_nodes(session: AuthorizedSession, project_id: str) -> list[dict]: """List all TPU nodes across all zones for a project. Uses the locations/- wildcard. Returns [] if the TPU API is not enabled. @@ -220,7 +171,6 @@ def _list_tpu_nodes(session: AuthorizedSession, project_id: str) -> list: params["pageToken"] = page_token resp = session.get(url, params=params) if resp.status_code == 403: - # Distinguish SERVICE_DISABLED (API not enabled) from true 403 (IAM). try: reason = resp.json().get("error", {}).get("details", [{}])[0].get("reason", "") except Exception: @@ -232,7 +182,6 @@ def _list_tpu_nodes(session: AuthorizedSession, project_id: str) -> list: "Grant roles/tpu.viewer to the scanning identity." ) if resp.status_code == 404: - # TPU API not enabled or no nodes in any zone — treat as empty. 
return [] resp.raise_for_status() data = resp.json() @@ -243,100 +192,62 @@ def _list_tpu_nodes(session: AuthorizedSession, project_id: str) -> list: return nodes -def _fetch_duty_cycles(credentials, project_id: str, idle_days: int) -> dict[str, float]: - """Fetch max duty_cycle per TPU node over the past idle_days days. - - Returns a dict mapping canonical node short-name → max_duty_cycle (0.0–1.0). - The canonical key is the last path segment of whichever label is populated - (resource_name or node_id), normalised via rsplit("/", 1)[-1]. - - A single retry is attempted on transient errors before falling back to the - age-based detection path. Permanent errors (auth, quota) are not retried. - - Returns {} on failure — monitoring is optional; callers fall back to age. - - Note on scale: list_time_series is paginated by the Python client iterator; - large projects with many TPU nodes may incur multiple API calls. +def _run_zone_diagnostic( + client: monitoring_v3.MetricServiceClient, + project_id: str, + zone: str, + window_start: datetime, + window_end: datetime, +) -> None: + """Zone-scoped diagnostic query; side-effect only — returns None. + + Called exclusively to surface permission and API-availability errors for the zone + (spec 11.1). The query result is intentionally discarded and MUST NOT be attributed + to any specific TPU Node: the zone filter cannot distinguish workers belonging to + different nodes in the same zone, so no emission decision is derivable here. + + Callers: invoke at most once per zone (see zone_ok / zone_errors cache in + find_idle_tpu_nodes). Do not call per-node and attempt to attribute results. + + When Google publishes a documented worker-to-node identity surface, replace this + function body with join (8.3) -> coverage (8.4) -> activity (8.5) and change + the return type to a structured result (see TODO below) so find_idle_tpu_nodes + can derive an emission verdict from the return value. 
+ + RPC exceptions propagate to the caller (no outer try/except here). + + TODO (structured return): When the join is implemented, return a dataclass or dict + with discrete join_state, coverage_state, and activity_state fields so each + dimension is independently modelled and surfaced in finding details. """ - - def _is_transient(exc: Exception) -> bool: - """True for errors likely to succeed on retry (network/timeout/throttle).""" - return isinstance( - exc, - ( - DeadlineExceeded, # timeout - ResourceExhausted, # 429 quota - TooManyRequests, # 429 rate limit - ServiceUnavailable, # 503 - BadGateway, # 502 - GatewayTimeout, # 504 - InternalServerError, # 500 (transient backend errors) - ), + # Zone-scoped diagnostic — surfaces permission / availability errors only. + # Results MUST NOT be attributed to any specific TPU Node: zone filter alone does + # not prove ownership; only a documented join surface would (spec 8.3). + _ = list( + client.list_time_series( + request={ + "name": f"projects/{project_id}", + "filter": ( + f'metric.type="{_DUTY_CYCLE_METRIC}"' + f' AND resource.type="{_DUTY_CYCLE_RESOURCE_TYPE}"' + f' AND resource.labels.location="{zone}"' + ), + "interval": monitoring_v3.TimeInterval( + start_time=timestamp_pb2.Timestamp(seconds=int(window_start.timestamp())), + end_time=timestamp_pb2.Timestamp(seconds=int(window_end.timestamp())), + ), + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + "aggregation": monitoring_v3.Aggregation( + # Non-semantic placeholder shape matching the intended join-aware query. + # alignment_period / ALIGN_MAX not used in any emission decision today. + alignment_period=duration_pb2.Duration(seconds=_ALIGNMENT_PERIOD_SECONDS), + per_series_aligner=monitoring_v3.Aggregation.Aligner.ALIGN_MAX, + # No cross_series_reducer — preserves per-worker/accelerator granularity + # for when the join is implementable (spec 8.2, 8.3). 
+ ), + } ) - - last_exc: Optional[Exception] = None - for attempt in range(2): - try: - client = monitoring_v3.MetricServiceClient(credentials=credentials) - now = datetime.now(timezone.utc) - start = now - timedelta(days=idle_days) - interval = monitoring_v3.TimeInterval( - start_time=timestamp_pb2.Timestamp(seconds=int(start.timestamp())), - end_time=timestamp_pb2.Timestamp(seconds=int(now.timestamp())), - ) - results = client.list_time_series( - request={ - "name": f"projects/{project_id}", - # tpu_worker is the monitored resource type for this metric. - # Do not use resource.type="tpu_node" — not valid for this metric. - # If GCP changes this schema, the query returns {} and the rule - # falls back to age-based detection rather than erroring out. - "filter": ( - 'metric.type="tpu.googleapis.com/node/accelerator/duty_cycle"' - ' AND resource.type="tpu_worker"' - ), - "interval": interval, - "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, - "aggregation": monitoring_v3.Aggregation( - alignment_period=duration_pb2.Duration(seconds=3600), - per_series_aligner=monitoring_v3.Aggregation.Aligner.ALIGN_MAX, - cross_series_reducer=monitoring_v3.Aggregation.Reducer.REDUCE_MAX, - # Group by both labels: resource_name (full path, preferred) - # and node_id (short name, documented for tpu_worker). - group_by_fields=[ - "resource.labels.resource_name", - "resource.labels.node_id", - ], - ), - } - ) - duty_cycles: dict[str, float] = {} - for ts in results: - if not ts.points: - continue - labels = ts.resource.labels - # Normalise to the last path segment as the canonical key so - # "projects/.../nodes/my-tpu" and "my-tpu" map to the same entry. - raw_label = labels.get("resource_name") or labels.get("node_id", "") - if not raw_label: - continue - canonical = raw_label.rsplit("/", 1)[-1] - # max() is intentional: any single active sample keeps the node - # out of the idle bucket. mean/p95 would mask intermittent usage. 
- max_val = max((p.value.double_value for p in ts.points), default=0.0) - duty_cycles[canonical] = max(duty_cycles.get(canonical, 0.0), max_val) - return duty_cycles - except Exception as exc: - last_exc = exc - if attempt == 0 and _is_transient(exc): - continue # retry once for transient errors only - break # permanent error (auth, permission, quota) — don't retry - warnings.warn( - f"gcp.tpu.idle: monitoring query failed ({type(last_exc).__name__}: {last_exc}) " - "— falling back to age-based detection", - stacklevel=2, ) - return {} def find_idle_tpu_nodes( @@ -349,240 +260,266 @@ def find_idle_tpu_nodes( """ Find Cloud TPU nodes that have been idle for an extended period. - TPU nodes in READY state incur compute charges regardless of utilization. - An idle TPU v4 node (4 chips) costs ~$12.88/hr; a v5p-8 costs ~$33.60/hr. - Forgetting to delete a TPU after a training run is a common cause of runaway cost. + Currently emits no findings: the worker-to-node telemetry join (spec 8.3) cannot + be proven with documented GCP surfaces; every node passes pre-checks but is blocked + at the telemetry gate. See module docstring — "Current status" section. - Detection logic: - - Lists all TPU nodes in READY state via the Cloud TPU v2 REST API - - Queries Cloud Monitoring for tpu.googleapis.com/node/accelerator/duty_cycle - over the past idle_days days (7 days by default) - - Nodes with max duty_cycle ≤ 2% AND age ≥ idle_days are flagged HIGH confidence - - Nodes with max duty_cycle ≤ 2% but age < idle_days are skipped (not yet idle) - - If monitoring data is unavailable, nodes older than idle_days are flagged - at LOW confidence (existence duration is not a reliable idle proxy) + When the join is implemented, emits a finding only when the node is in documented + READY state, is standalone (not queued-resource-managed or multislice), and complete + joined duty-cycle telemetry confirms no accelerator activity above 2% over the + buffered idle window. 
No age-only or monitoring-absent fallback is performed. IAM permissions required: - tpu.nodes.list (roles/tpu.viewer) - - monitoring.timeSeries.list (roles/monitoring.viewer) — optional; fallback to age + - monitoring.timeSeries.list (roles/monitoring.viewer) """ idle_days = max(1, idle_days) - session = AuthorizedSession(credentials) - now = datetime.now(timezone.utc) + now = datetime.now(timezone.utc).replace(microsecond=0) + window_end = now - timedelta(seconds=_MONITORING_BUFFER_SECONDS) + window_start = window_end - timedelta(seconds=idle_days * 86400) + session = AuthorizedSession(credentials) nodes = _list_tpu_nodes(session, project_id) if not nodes: return [] - ready_nodes = [n for n in nodes if n.get("state") in _BILLABLE_STATES] - if not ready_nodes: - return [] - - # Batch monitoring query — one call covers all nodes in the project. - duty_cycles = _fetch_duty_cycles(credentials, project_id, idle_days) + try: + monitoring_client = monitoring_v3.MetricServiceClient(credentials=credentials) + except Exception as e: + warnings.warn( + f"gcp.tpu.idle: monitoring client creation failed " + f"({type(e).__name__}: {e}) — all nodes will be skipped (no age-only fallback)", + UserWarning, + stacklevel=2, + ) + monitoring_client = None findings: List[Finding] = [] - unmatched_node_ids: List[str] = [] - - for node in ready_nodes: - name = node.get("name", "") + # Per-zone diagnostic query cache: _run_zone_diagnostic is zone-scoped (spec 11.1); + # caching avoids redundant API calls when multiple nodes share a zone. 
+ zone_ok: set[str] = set() + zone_errors: set[str] = set() + + for node in nodes: + # --- Identity --- + name = node.get("name") or "" + if not name: + continue node_id = _parse_node_id(name) - zone = _parse_location(name) or "unknown" + if not node_id: + continue + zone = _parse_location(name) + if not zone: + continue + region = _zone_to_region(zone) + if not region: + continue - if region_filter and not zone.startswith(region_filter): + # --- Region filter: exact match on derived region (spec 7) --- + if region_filter and region != region_filter: continue - # Prefer acceleratorConfig (new API); fall back to legacy acceleratorType. - accel_config = node.get("acceleratorConfig") or {} - tpu_type = (accel_config.get("type") or "").upper() - topology = (accel_config.get("topology") or "").strip() - accel_type_legacy = (node.get("acceleratorType") or "").strip() + # --- State: must be exactly READY (spec 3.1, 4) --- + if node.get("state") != "READY": + continue - if not tpu_type and accel_type_legacy: - tpu_type = _tpu_type_from_legacy(accel_type_legacy) + # --- createTime: absent/unparsable/future -> skip (spec 7) --- + create_time = _parse_rfc3339_utc(node.get("createTime") or "") + if create_time is None: + continue + if create_time > now: + continue - chips, chip_count_approximate = _chip_count(accel_type_legacy, topology or None, tpu_type) - hourly, pricing_confidence = _hourly_cost(tpu_type, chips, chip_count_approximate) - monthly = hourly * _HOURS_PER_MONTH + # --- Full window coverable (spec 7) --- + if create_time > window_start: + continue + + # --- Standalone: queuedResource absent/empty (spec 3.5, 9) --- + queued_resource = node.get("queuedResource") + if queued_resource is None or queued_resource == "": + pass # standalone + elif isinstance(queued_resource, str): + continue # non-empty string -> managed by queued resource + else: + continue # malformed non-string/non-null -> skip + + # --- Standalone: multisliceNode not true (spec 3.5, 9) --- + 
multislice = node.get("multisliceNode") + if multislice is None or multislice is False: + pass # standalone + elif multislice is True: + continue # explicitly multislice + else: + continue # malformed non-bool/non-null -> skip + + # --- Monitoring (no age-only fallback, spec 8.6) --- + if monitoring_client is None: + continue - create_str = node.get("createTime", "") - age_days: Optional[float] = None - if create_str: + # Zone-level diagnostic query — one call per zone is sufficient; + # _run_zone_diagnostic filters by location only and results are not + # attributed to any specific node, so caching across nodes in the same + # zone is safe (spec 11.1). + if zone in zone_errors: + continue + if zone not in zone_ok: try: - create_dt = datetime.fromisoformat(create_str.replace("Z", "+00:00")) - if create_dt.tzinfo is None: - create_dt = create_dt.replace(tzinfo=timezone.utc) - age_days = (now - create_dt).total_seconds() / 86400 - except (ValueError, AttributeError): - pass - - # Idle detection — monitoring first, age fallback. - # Canonical lookup key (short name) matches _fetch_duty_cycles normalisation. - node_duty_cycle: Optional[float] = duty_cycles.get(node_id) - if node_duty_cycle is None and duty_cycles: - # duty_cycles is non-empty but this node has no entry — collect for - # a single aggregated warning rather than one warning per node. - unmatched_node_ids.append(node_id) - - if node_duty_cycle is not None: - if node_duty_cycle > _DUTY_CYCLE_IDLE_THRESHOLD: - continue # Active — skip - # Enforce the idle_days minimum: nodes younger than the window - # cannot have been idle for idle_days by definition. Skip them - # regardless of what duty_cycle reports — the rule contract is - # "idle for 7+ days", not "low utilisation at time of scan". 
- if age_days is None or age_days < idle_days: + _run_zone_diagnostic( + monitoring_client, + project_id, + zone, + window_start, + window_end, + ) + zone_ok.add(zone) + except Exception as e: + warnings.warn( + f"gcp.tpu.idle: monitoring query failed for zone '{zone}' " + f"({type(e).__name__}: {e}) — nodes in this zone will be skipped", + UserWarning, + stacklevel=2, + ) + zone_errors.add(zone) continue - confidence = ConfidenceLevel.HIGH - idle_signal = ( - f"max duty_cycle={node_duty_cycle:.1%} over {idle_days}d window " - f"(threshold: {_DUTY_CYCLE_IDLE_THRESHOLD:.0%})" - ) - elif age_days is not None and age_days >= idle_days: - # Age-only fallback: createTime tells us when the node was created, - # NOT when it was last used. LOW confidence. - confidence = ConfidenceLevel.LOW - idle_signal = ( - f"no monitoring data; node exists for {age_days:.0f}d with no " - f"observed activity (≥ {idle_days}d threshold) — existence " - "duration is not a reliable idle proxy" - ) - else: - continue # Too new or no age data — not enough signal - risk = _compute_risk(confidence, hourly) + # This rule currently emits no findings (join barrier, spec 8.3). + # _run_zone_diagnostic returns None (side-effect only); the telemetry verdict is + # not derived from it. "unresolved" is the single hardcoded source of truth here. + # When the join is implemented: replace this assignment with the structured + # verdict from a node-attributed call to _run_zone_diagnostic (see its docstring). + telemetry = "unresolved" + if telemetry != "confirmed_idle": + continue + + # --- Build finding --- + # UNREACHABLE TODAY: the guard above always skips this block because + # _run_zone_diagnostic always returns "unresolved" (join barrier, spec 8.3). 
+ # When the join is implemented, ALL signal strings and detail values below must + # be verified and updated — in particular: + # - The "Worker join:" signal must use actual proven joined/expected counts, + # not the static "complete (expected == joined workers)" string below. + # - "telemetry_join_state" in details must be set to the actual proven state + # (e.g. "complete") rather than the placeholder "unresolved" below. + # Do NOT bypass the join barrier to reach this block without first proving + # telemetry_join_state == "complete" per spec 8.3. + accel_config = node.get("acceleratorConfig") or {} + tpu_type = (accel_config.get("type") or "").strip() + topology = (accel_config.get("topology") or "").strip() + accel_type_legacy = (node.get("acceleratorType") or "").strip() + + if not tpu_type and accel_type_legacy: + tpu_type = _tpu_type_from_legacy(accel_type_legacy) - runtime = (node.get("runtimeVersion") or "").strip() scheduling = node.get("schedulingConfig") or {} - preemptible = scheduling.get("preemptible", False) - spot = scheduling.get("spot", False) + preemptible = bool(scheduling.get("preemptible", False)) + spot = bool(scheduling.get("spot", False)) + reserved = bool(scheduling.get("reserved", False)) + runtime = (node.get("runtimeVersion") or "").strip() - type_str = accel_type_legacy or tpu_type or "unknown" - hw_parts = [type_str] + age_days_val = (now - create_time).total_seconds() / 86400 + accel_context = tpu_type or accel_type_legacy or "unknown" + hw_parts = [accel_context] if topology: - hw_parts.append(f"[{topology}]") - hw_parts.append(f"{chips} chip{'s' if chips != 1 else ''}") - hardware_label = ", ".join(hw_parts) - - age_str = f"{age_days:.1f}d" if age_days is not None else "unknown" - node_region = _zone_to_region(zone) if zone != "unknown" else "unknown" - # Pricing is a baseline estimate: us-central1 on-demand rate, not - # adjusted for actual region, committed use, or customer agreements. 
-        pricing_note = (
-            f"baseline estimate, us-central1 on-demand (region: {node_region}, zone: {zone})"
-        )
+            hw_parts.append(f"topology={topology}")
+        hw_label = " ".join(hw_parts)

-        scheduling_note: Optional[str] = None
+        scheduling_parts = []
         if spot:
-            scheduling_note = (
-                "Scheduling: spot — cost NOT adjusted for spot discount (~60–70% lower); "
-                "node may have been preempted rather than left idle"
-            )
-        elif preemptible:
-            scheduling_note = (
-                "Scheduling: preemptible — cost NOT adjusted for preemptible discount "
-                "(~30% lower); node may have been interrupted rather than left idle"
-            )
-
+            scheduling_parts.append("spot")
+        if preemptible:
+            scheduling_parts.append("preemptible")
+        if reserved:
+            scheduling_parts.append("reserved")
+        scheduling_context = ", ".join(scheduling_parts) or "on-demand"
+
+        # TODO (join barrier, spec 8.3): Two of the signals below are aspirational
+        # [TODO: ...] templates. Replace each [TODO: ...] entry with dynamically
+        # constructed strings derived from actual proven join/coverage/activity
+        # values when the join is implemented.
signals = [ - f"Node state: READY (billable) — age: {age_str}", - f"Idle signal: {idle_signal}", - f"Hardware: {hardware_label}", - f"Burn rate: ~${hourly:.2f}/hr ({chips} chip{'s' if chips != 1 else ''} " - f"× ${hourly / chips:.4g}/chip-hr; {pricing_note})", + f"State: READY (billable); zone: {zone}; region: {region}", + f"createTime: {create_time.isoformat()}; node age: {age_days_val:.1f}d", + f"Standalone: queuedResource={queued_resource!r}, multisliceNode={multislice!r}", + f"Accelerator: {hw_label}", + f"Scheduling: {scheduling_context}", + f"[TODO: Worker join — actual joined/expected worker counts; " + f"metric: {_DUTY_CYCLE_METRIC}]", + f"Metric: {_DUTY_CYCLE_METRIC} on {_DUTY_CYCLE_RESOURCE_TYPE}", + f"Idle window: {window_start.isoformat()} - {window_end.isoformat()} ({idle_days}d)", + f"Threshold: {_DUTY_CYCLE_THRESHOLD_PCT}% max duty cycle", + f"[TODO: telemetry confirmed no duty-cycle datapoint above " + f"{_DUTY_CYCLE_THRESHOLD_PCT}% over the full buffered window]", ] if runtime: signals.append(f"Runtime: {runtime}") - if scheduling_note: - signals.append(scheduling_note) not_checked = [ - "Batch or scheduled jobs — duty_cycle captures real-time utilization only; " - "a node running nightly jobs may appear idle between runs; consider " - "increasing idle_days for batch workloads", - "Cost shown is a us-central1 on-demand baseline estimate; actual cost " - f"varies by region ({node_region}), committed use, spot/preemptible " - "discounts, and customer pricing agreements", - "duty_cycle metric not always emitted for newer TPU types (V5+, V6E) " - "or nodes that have never had a workload submitted — no data ≠ idle", + "Batch or scheduled jobs — duty_cycle captures real-time accelerator activity; " + "a node used only for nightly jobs may appear idle between runs", + "Cost impact — pricing varies by TPU type, region, and usage option; " + "no flat estimate is appropriate", "Nodes shared across teams where utilization is tracked externally", ] - evidence 
= Evidence( - signals_used=signals, - signals_not_checked=not_checked, - time_window=f"{idle_days}d", - ) - node_display = (node.get("description") or "").strip() or node_id + findings.append( Finding( provider="gcp", rule_id="gcp.tpu.idle", resource_type="gcp.tpu.node", - resource_id=name or node_id, - region=zone, - title=f"Idle Cloud TPU Node ({hardware_label})", + resource_id=name, + region=region, + title=f"Idle Cloud TPU Node ({accel_context})", + # TODO (join barrier): replace summary/reason with telemetry-confirmed + # text once the join is implemented (spec 8.3). summary=( - ( - f"Cloud TPU node '{node_display}' ({hardware_label}) has " - f"near-zero utilization (max duty_cycle={node_duty_cycle:.1%}) " - f"over the past {idle_days} days in READY state, " - f"costing ~${hourly:.2f}/hr (~${monthly:,.0f}/mo)." - ) - if node_duty_cycle is not None - else ( - f"Cloud TPU node '{node_display}' ({hardware_label}) has existed " - f"for ≥{idle_days} days in READY state with no utilization data " - f"(heuristic: existence duration only — utilization unknown), " - f"costing ~${hourly:.2f}/hr (~${monthly:,.0f}/mo)." - ) + f"Cloud TPU node '{node_display}' ({accel_context}) has been in " + f"READY state for {age_days_val:.0f}d. 
" + f"[TODO: add joined duty-cycle telemetry confirmation (spec 8.3)]" ), reason=( - ( - f"TPU node in READY state with near-zero utilization " - f"(duty_cycle ≤ {_DUTY_CYCLE_IDLE_THRESHOLD:.0%}) " - f"for {idle_days} days" - ) - if node_duty_cycle is not None - else ( - f"TPU node in READY state for ≥{idle_days} days " - f"(heuristic: age only — no utilization data available)" - ) + f"TPU node in READY state " + f"[TODO: add duty-cycle verdict — max <= {_DUTY_CYCLE_THRESHOLD_PCT}% " + f"over {idle_days}d window (spec 8.3)]" ), - risk=risk, - confidence=confidence, + risk=RiskLevel.HIGH, + confidence=ConfidenceLevel.HIGH, detected_at=now, - evidence=evidence, - estimated_monthly_cost_usd=round(monthly, 2), + evidence=Evidence( + signals_used=signals, + signals_not_checked=not_checked, + time_window=f"{idle_days}d", + ), + estimated_monthly_cost_usd=None, details={ "node_name": name, "node_id": node_id, "zone": zone, - "region": node_region, + "region": region, "tpu_type": tpu_type or accel_type_legacy or None, "topology": topology or None, - "chip_count": chips, - "chip_count_approximate": chip_count_approximate, "runtime_version": runtime or None, "preemptible": preemptible, "spot": spot, - "age_days": round(age_days, 1) if age_days is not None else None, - "max_duty_cycle": node_duty_cycle, + "reserved": reserved, + "age_days": round(age_days_val, 1), "idle_days_threshold": idle_days, - "hourly_cost_usd": round(hourly, 4), - "pricing_confidence": pricing_confidence, - "pricing_scope": "us_central1_reference_not_region_adjusted", + "duty_cycle_threshold_pct": _DUTY_CYCLE_THRESHOLD_PCT, + "monitoring_buffer_seconds": _MONITORING_BUFFER_SECONDS, + # All three telemetry state fields are "unresolved" today (join barrier, + # spec 8.3). When the join is implemented, set each dynamically: + # telemetry_join_state — proven join state, e.g. "complete" + # telemetry_coverage_state — coverage verdict from 8.4, e.g. "complete" + # telemetry_state — overall verdict, e.g. 
"confirmed_idle" + "telemetry_join_state": "unresolved", + "telemetry_coverage_state": "unresolved", + "telemetry_state": "unresolved", }, ) ) - if unmatched_node_ids: - warnings.warn( - f"gcp.tpu.idle: {len(unmatched_node_ids)} node(s) in project '{project_id}' " - f"had no duty_cycle data in monitoring — key may not match monitoring label; " - f"fell back to age-based detection. Node IDs: {', '.join(unmatched_node_ids)}", - stacklevel=2, - ) - return findings + + +find_idle_tpu_nodes.RULE_ID = "gcp.tpu.idle" diff --git a/cleancloud/providers/gcp/rules/ai/vertex_endpoint_idle.py b/cleancloud/providers/gcp/rules/ai/vertex_endpoint_idle.py index a37ba6f..ea952e1 100644 --- a/cleancloud/providers/gcp/rules/ai/vertex_endpoint_idle.py +++ b/cleancloud/providers/gcp/rules/ai/vertex_endpoint_idle.py @@ -1,10 +1,69 @@ +""" +Rule: gcp.vertex.endpoint.idle + + (spec -- docs/specs/gcp/ai/vertex_endpoint_idle.md) + +Intent: + Detect Vertex AI Endpoints with a documented always-deployed serving floor + and no observed online prediction request activity over a conservative review + window, using documented Cloud Monitoring request-count telemetry. + + This is a precision-first review-candidate rule. It is not proof that the + endpoint is safe to delete, not proof that all endpoint verbs are unused, + and not proof of a specific monthly saving. 
+ +Covered resource families: + - Vertex AI Endpoint (projects.locations.endpoints, aiplatform v1 REST API) + +In-scope deployed models (spec 3.3, 3.4): + - dedicatedResources.minReplicaCount >= 1 (always-deployed serving floor) + - automaticResources.minReplicaCount >= 1 (always-deployed serving floor) + +Out-of-scope: + - sharedResources deployments (shared pool cost not directly attributable; spec 11.4) + - dedicatedResources.minReplicaCount == 0 (scale-to-zero preview; no always-deployed floor) + - automaticResources.minReplicaCount == 0 (scale-to-zero; no always-deployed floor) + - endpoints with no deployed models + - shared-resource-only endpoints (spec 11.4) + +Exclusions: + - endpoint name or location malformed (spec 7) + - location filter set and location does not exactly match (spec 9) + - no in-scope deployed models; provisioned_serving_floor < 1 (spec 9) + - shared-resource-only endpoint (spec 9, 11.4) + - any in-scope deployed model createTime missing, future, or unparsable (spec 7) + - endpoint createTime missing, future, or unparsable (spec 7) + - capacity_floor_start > evaluation_window_start (window not fully coverable; spec 9) + - monitoring client creation failure -- all endpoints skip; no fallback (spec 11.2) + - monitoring query failure for a location (spec 11.2) + - telemetry_coverage_state != "complete" (spec 8.3, 9) + - max_observed_request_rate_per_replica > 0 (spec 9) + +Detection (all must be true to emit): + - provisioned_serving_floor >= 1 + - capacity_floor_start_utc <= evaluation_window_start_utc + - telemetry_coverage_state == "complete" + - max_observed_request_rate_per_replica == 0 + +Cost model (spec 6.4): + estimated_monthly_cost_usd = None + Pricing varies by machine type, accelerator, region, and usage option; + a flat estimate would be misleading. 
+ +APIs: + - aiplatform.googleapis.com/v1: projects/{project}/locations/-/endpoints + - monitoring.googleapis.com: aiplatform.googleapis.com/prediction/online/request_count + on aiplatform.googleapis.com/Endpoint +""" + +import warnings from collections import defaultdict from datetime import datetime, timedelta, timezone from typing import Dict, List, Optional, Tuple from google.auth.transport.requests import AuthorizedSession from google.cloud import monitoring_v3 -from google.protobuf import duration_pb2, timestamp_pb2 +from google.protobuf import timestamp_pb2 from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence @@ -18,7 +77,14 @@ "cost_impact": "high", } -# Accelerator types treated as GPU/high-cost +_DEFAULT_IDLE_DAYS = 14 + +# Canonical metric and resource type (spec 8.1, 8.2) +_REQUEST_METRIC_TYPE = "aiplatform.googleapis.com/prediction/online/request_count" +_REQUEST_METRIC_RESOURCE_TYPE = "aiplatform.googleapis.com/Endpoint" + +# Accelerator types for risk classification (spec 10.2). +# Risk is HIGH when any in-scope dedicated model has a nonzero accelerator count/type. 
_GPU_ACCELERATORS = frozenset( { "NVIDIA_TESLA_T4", @@ -35,68 +101,345 @@ } ) -# Monthly cost per node for the machine type alone (on-demand, us-central1, 730 h/month) -# Source: https://cloud.google.com/vertex-ai/pricing (Online Prediction node hours) -_MACHINE_MONTHLY_COST = { - "n1-standard-2": 69.0, - "n1-standard-4": 138.0, - "n1-standard-8": 277.0, - "n1-standard-16": 554.0, - "n1-standard-32": 1_107.0, - "n1-highmem-2": 93.0, - "n1-highmem-4": 187.0, - "n1-highmem-8": 374.0, - "n1-highmem-16": 748.0, - "n2-standard-2": 78.0, - "n2-standard-4": 157.0, - "n2-standard-8": 314.0, - "n2-standard-16": 628.0, - "c2-standard-4": 166.0, - "c2-standard-8": 332.0, - "c2-standard-16": 665.0, - # a2-* and g2-* include accelerator cost — no separate GPU add-on - "a2-highgpu-1g": 2_933.0, - "a2-highgpu-2g": 5_866.0, - "a2-highgpu-4g": 11_732.0, - "a2-highgpu-8g": 23_464.0, - "a2-ultragpu-1g": 5_103.0, - "a2-ultragpu-2g": 10_206.0, - "g2-standard-4": 706.0, - "g2-standard-8": 1_060.0, - "g2-standard-12": 1_414.0, - "g2-standard-16": 2_120.0, - "g2-standard-24": 3_181.0, - "g2-standard-32": 4_241.0, - "g2-standard-48": 6_361.0, - "g2-standard-96": 12_722.0, -} -_DEFAULT_MACHINE_MONTHLY_COST = 150.0 - -# Additional monthly cost per GPU when attached to n1-*/n2-* machines. -# a2-* and g2-* already include GPU cost in the machine price above. -_GPU_MONTHLY_COST_EACH = { - "NVIDIA_TESLA_T4": 311.0, - "NVIDIA_TESLA_V100": 1_385.0, - "NVIDIA_TESLA_P100": 1_022.0, - "NVIDIA_TESLA_K80": 392.0, - "NVIDIA_TESLA_A100": 2_933.0, - "NVIDIA_L4": 680.0, - "NVIDIA_H100_80GB": 8_000.0, -} -_DAYS_IDLE = 14 +def _parse_location(name: str) -> Optional[str]: + """Extract location from endpoint resource name. + + Resolves from the exact 'locations/{location}' segment (spec 7). + Returns None if the segment is absent or empty. 
+ """ + parts = name.split("/") + try: + idx = parts.index("locations") + loc = parts[idx + 1] + return loc if loc else None + except (ValueError, IndexError): + return None + + +def _parse_endpoint_id(name: str) -> str: + """Extract endpoint ID from the final segment of the resource name.""" + return name.rsplit("/", 1)[-1] if name else "" + + +def _parse_rfc3339_utc(ts: str) -> Optional[datetime]: + """Parse an RFC3339 timestamp string into a timezone-aware UTC datetime.""" + if not ts: + return None + try: + dt = datetime.fromisoformat(ts.replace("Z", "+00:00")) + if dt.tzinfo is None: + dt = dt.replace(tzinfo=timezone.utc) + return dt.astimezone(timezone.utc) + except (ValueError, AttributeError): + return None + + +def _classify_deployed_models( + deployed_models: list, + now: datetime, +) -> dict: + """ + Classify deployed models on an endpoint per spec 3.3, 3.4, 7, 9. + + In-scope (spec 3.3, 3.4): + dedicatedResources.minReplicaCount >= 1 + automaticResources.minReplicaCount >= 1 + + Out-of-scope (skip model, not endpoint): + sharedResources (shared pool; not endpoint-attributable; spec 11.4) + any resource mode with minReplicaCount == 0 + + skip=True cases (caller must skip entire endpoint per spec 9): + malformed or missing minReplicaCount on any model + unrecognized prediction-resource union on any model + unusable createTime (missing, future, unparsable) on any in-scope model + + Returns a dict: + skip (bool): True -> caller must skip the endpoint (malformed record). + provisioned_floor (int): sum of minReplicaCount across in-scope models; + 0 means no always-deployed serving floor. + shared_only (bool): True only when all models use sharedResources and + none use dedicatedResources or automaticResources (spec 11.4). + has_accelerator (bool): any in-scope dedicated model has a nonzero + GPU/TPU accelerator count and a recognized accelerator type (spec 10.2). 
+ capacity_floor_start (datetime | None): max createTime across in-scope + models; None when provisioned_floor == 0 (no in-scope models). + resource_modes (str): resource mode types seen on the endpoint. + in_scope_count (int): number of in-scope deployed models. + """ + _skip = { + "skip": True, + "provisioned_floor": 0, + "shared_only": False, + "has_accelerator": False, + "capacity_floor_start": None, + "resource_modes": "malformed", + "in_scope_count": 0, + } + + if not deployed_models: + return { + "skip": False, + "provisioned_floor": 0, + "shared_only": False, + "has_accelerator": False, + "capacity_floor_start": None, + "resource_modes": "none", + "in_scope_count": 0, + } + + provisioned_floor = 0 + has_accelerator = False + in_scope_create_times: List[datetime] = [] + seen_modes: List[str] = [] + in_scope_count = 0 + + for model in deployed_models: + dedicated = model.get("dedicatedResources") + automatic = model.get("automaticResources") + shared = model.get("sharedResources") + + if dedicated is not None: + if "dedicatedResources" not in seen_modes: + seen_modes.append("dedicatedResources") + raw = dedicated.get("minReplicaCount") + if raw is None: + # minReplicaCount is required; missing is malformed (spec 9) + return _skip + try: + min_rep = int(raw) + except (TypeError, ValueError): + return _skip + if min_rep >= 1: + in_scope_count += 1 + provisioned_floor += min_rep + spec = dedicated.get("machineSpec") or {} + at = spec.get("acceleratorType", "ACCELERATOR_TYPE_UNSPECIFIED") + try: + ac = int(spec.get("acceleratorCount") or 0) + except (TypeError, ValueError): + ac = 0 + if ( + at + and at != "ACCELERATOR_TYPE_UNSPECIFIED" + and ac > 0 + and at in _GPU_ACCELERATORS + ): + has_accelerator = True + ct = _parse_rfc3339_utc(model.get("createTime") or "") + if ct is None or ct > now: + # Unusable createTime on in-scope model -> skip endpoint (spec 7, 9) + return _skip + in_scope_create_times.append(ct) + + elif automatic is not None: + if 
"automaticResources" not in seen_modes: + seen_modes.append("automaticResources") + raw = automatic.get("minReplicaCount") + if raw is None: + # minReplicaCount is required; missing is malformed (spec 9) + return _skip + try: + min_rep = int(raw) + except (TypeError, ValueError): + return _skip + if min_rep >= 1: + # automaticResources does not expose machineSpec; no accelerator check (spec 10.2) + in_scope_count += 1 + provisioned_floor += min_rep + ct = _parse_rfc3339_utc(model.get("createTime") or "") + if ct is None or ct > now: + # Unusable createTime on in-scope model -> skip endpoint (spec 7, 9) + return _skip + in_scope_create_times.append(ct) + + elif shared is not None: + if "sharedResources" not in seen_modes: + seen_modes.append("sharedResources") + else: + # Unrecognized prediction-resource union -> malformed record; skip endpoint (spec 9) + return _skip + + resource_modes = ", ".join(seen_modes) if seen_modes else "none" + + # shared_only: endpoint has sharedResources models and no dedicated or automatic models at all + shared_only = ( + provisioned_floor == 0 + and "sharedResources" in seen_modes + and "dedicatedResources" not in seen_modes + and "automaticResources" not in seen_modes + ) + + if provisioned_floor == 0: + return { + "skip": False, + "provisioned_floor": 0, + "shared_only": shared_only, + "has_accelerator": False, + "capacity_floor_start": None, + "resource_modes": resource_modes, + "in_scope_count": 0, + } + + # provisioned_floor >= 1 and all in-scope createTimes are valid (fail-fast above catches bad ones) + return { + "skip": False, + "provisioned_floor": provisioned_floor, + "shared_only": False, + "has_accelerator": has_accelerator, + "capacity_floor_start": max(in_scope_create_times), + "resource_modes": resource_modes, + "in_scope_count": in_scope_count, + } + + +def _query_location_request_counts( + client: monitoring_v3.MetricServiceClient, + project_id: str, + location: str, + window_start: datetime, + window_end: datetime, + 
eligible_endpoint_ids: set, +) -> Optional[Dict[str, List[Tuple[float, datetime]]]]: + """ + Query request-count telemetry for all eligible endpoints in a location. + + Issues a single monitoring call for the location (spec 8.2: batching allowed). + Exact endpoint attribution is enforced from resource.labels.endpoint_id (spec 8.2). + No cross_series_reducer -- per-endpoint series identity is preserved (spec 8.2). + No per-series aligner -- raw request-count values are preserved for zero/nonzero + evaluation without any transform step (spec 8.2.7). + + Returns a dict mapping endpoint_id -> list of (value, timestamp) tuples for + in-window usable datapoints, or None if the query fails (caller must skip all + endpoints in this location; spec 11.2). + + In-window: point.interval.end_time falls within [window_start, window_end]. + Value extraction: int64_value first, else double_value kept as float -- a + positive non-integer double (e.g. 0.7) must not be truncated to zero (spec 8.4.3). + Null, NaN, and unsupported value shapes are ignored. + """ + try: + results = client.list_time_series( + request={ + "name": f"projects/{project_id}", + "filter": ( + f'metric.type="{_REQUEST_METRIC_TYPE}"' + f' AND resource.type="{_REQUEST_METRIC_RESOURCE_TYPE}"' + f' AND resource.labels.location="{location}"' + ), + "interval": monitoring_v3.TimeInterval( + start_time=timestamp_pb2.Timestamp(seconds=int(window_start.timestamp())), + end_time=timestamp_pb2.Timestamp(seconds=int(window_end.timestamp())), + ), + "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, + # No aggregation: preserves raw request-count values per spec 8.2.7. + # No cross_series_reducer: preserves per-endpoint series identity per spec 8.2. 
+ } + ) + + per_endpoint: Dict[str, List[Tuple[float, datetime]]] = {} + + for series in results: + ep_id = series.resource.labels.get("endpoint_id", "") + if not ep_id or ep_id not in eligible_endpoint_ids: + continue # not in eligible set; exact attribution required (spec 8.2) + + if ep_id not in per_endpoint: + per_endpoint[ep_id] = [] + + for point in series.points: + # Get point end timestamp (spec 8.4.1: use monitoring timestamps) + try: + if point.interval and point.interval.end_time: + pt_ts = point.interval.end_time.ToDatetime(tzinfo=timezone.utc) + else: + continue + except (AttributeError, TypeError): + continue + + # Ignore points outside the full observation window (spec 8.4.2) + if pt_ts < window_start or pt_ts > window_end: + continue + + # Extract value: int64_value first, else double_value as float (spec 8.4.3). + # Keep double as float -- do not truncate to int; 0.7 must remain 0.7 > 0. + val: float = float(point.value.int64_value) + if val == 0.0: + try: + dval = point.value.double_value + if dval and dval == dval: # truthy and not NaN + val = float(dval) + except (AttributeError, TypeError): + pass + + per_endpoint[ep_id].append((val, pt_ts)) + + return per_endpoint + + except Exception: + return None # conservative: caller skips all endpoints in this location (spec 11.2) + + +def _evaluate_endpoint_telemetry( + points: List[Tuple[float, datetime]], + window_start: datetime, + window_end: datetime, +) -> Tuple[str, str, float]: + """ + Evaluate telemetry coverage and activity state from collected in-window datapoints. + + Returns (telemetry_coverage_state, telemetry_state, max_observed_rate): + telemetry_coverage_state: "complete" or "unresolved" + telemetry_state: "no_observed_prediction_requests", + "observed_prediction_requests", or "unresolved" + max_observed_rate: maximum usable in-window value; 0.0 when unresolved + + Coverage rules (spec 8.3): + - No in-window points -> unresolved (spec 8.3.1). 
+ - Gap-based coverage checks (spec 8.3.6, 8.3.8): + Threshold: (window_end - window_start).total_seconds() / 2. + Leading gap (window_start to first point): > threshold -> unresolved. + Interior gap (between consecutive points): > threshold -> unresolved. + Trailing gap (last point to window_end): > threshold -> unresolved. + A gap larger than half the observation window cannot be proven to + preserve sufficient observation across the full window. The threshold + is relative to the window -- not a fixed cadence assumption -- consistent + with the spec prohibition on inventing a sampling cadence or mandatory + trailing ingestion buffer (spec 8.3). + - All gaps within threshold -> coverage complete. + + Activity rules (spec 8.4): + - max value > 0 -> observed_prediction_requests (spec 8.4.5) + - max value == 0 -> no_observed_prediction_requests (spec 8.4.6) + """ + if not points: + return "unresolved", "unresolved", 0.0 + + threshold_s = (window_end - window_start).total_seconds() / 2 + + # Sort by timestamp to check gaps in chronological order + sorted_pts = sorted(points, key=lambda x: x[1]) + timestamps = [ts for _, ts in sorted_pts] + + # Leading gap: window_start to first observed point (spec 8.3.6, 8.3.8) + if (timestamps[0] - window_start).total_seconds() > threshold_s: + return "unresolved", "unresolved", 0.0 + + # Interior gaps: between consecutive observed points (spec 8.3.6, 8.3.8) + for i in range(1, len(timestamps)): + if (timestamps[i] - timestamps[i - 1]).total_seconds() > threshold_s: + return "unresolved", "unresolved", 0.0 -# Endpoints with fewer than this many prediction requests in the idle window are -# flagged as "near-idle" (MEDIUM confidence). Zero requests -> fully idle. -# GPU endpoints use a lower threshold — higher cost justifies more aggressive flagging. -# Threshold is then scaled by sqrt(replicas) so large deployments can't hide inefficiency -# behind a linearly growing bar (20 replicas -> ×4.5, not ×20). 
-_LOW_TRAFFIC_THRESHOLD = 10 -_LOW_TRAFFIC_THRESHOLD_GPU = 5 + # Trailing gap: last observed point to window_end (spec 8.3.6, 8.3.8) + if (window_end - timestamps[-1]).total_seconds() > threshold_s: + return "unresolved", "unresolved", 0.0 -# Findings below this estimated cost are suppressed — avoids noise from cheap endpoints -# and builds user trust by keeping findings actionable. -_MIN_MONTHLY_COST_USD = 50 + max_val = max(v for v, _ in points) + if max_val > 0: + return "complete", "observed_prediction_requests", max_val + return "complete", "no_observed_prediction_requests", 0.0 def find_idle_vertex_endpoints( @@ -104,383 +447,254 @@ def find_idle_vertex_endpoints( project_id: str, credentials, region_filter: Optional[str] = None, + idle_days: int = _DEFAULT_IDLE_DAYS, ) -> List[Finding]: """ - Find Vertex AI Online Prediction endpoints with zero or near-zero predictions - for 14 days. + Find Vertex AI endpoints with an always-deployed serving floor and zero observed + prediction requests over the full observation window. - Vertex AI endpoints with dedicatedResources.minReplicaCount > 0 keep instances - running continuously regardless of traffic — billing is per node-hour regardless - of prediction volume. GPU-backed endpoints (T4, V100, A100) cost $300–$8K/month - per GPU plus machine cost. Endpoints created for experiments are frequently - abandoned after the model demo or prototype phase. + Emits a finding only when all of the following are true (spec 9): + 1. at least one deployed model is in scope with provisioned_serving_floor >= 1 + 2. capacity_floor_start_utc <= evaluation_window_start_utc (full window coverable) + 3. telemetry_coverage_state == "complete" + 4. 
max_observed_request_rate_per_replica == 0 - Detection tiers: - - IDLE: Zero prediction requests over the idle window -> HIGH/MEDIUM confidence - - NEAR-IDLE: < 10 requests over the idle window -> MEDIUM confidence + No age-only, traffic-split, or missing-telemetry fallback is performed (spec 8.5). - Endpoints using automaticResources (auto-scaling to zero) are excluded — they - incur no compute cost when idle. - - Monitoring queries are batched per location — one API call per region rather - than one call per endpoint. - - IAM permissions: - - aiplatform.endpoints.list (roles/aiplatform.viewer) - - monitoring.timeSeries.list (roles/monitoring.viewer) + IAM permissions required: + aiplatform.endpoints.list (roles/aiplatform.viewer) + monitoring.timeSeries.list (roles/monitoring.viewer) """ - findings: List[Finding] = [] - now = datetime.now(timezone.utc) + idle_days = max(1, idle_days) + now = datetime.now(timezone.utc).replace(microsecond=0) + window_end = now + window_start = window_end - timedelta(seconds=idle_days * 86400) session = AuthorizedSession(credentials) + endpoints = _list_endpoints(session, project_id) + if not endpoints: + return [] try: monitoring_client: Optional[monitoring_v3.MetricServiceClient] = ( monitoring_v3.MetricServiceClient(credentials=credentials) ) - except Exception: - monitoring_client = None - - try: - endpoints = _list_endpoints(session, project_id) - except PermissionError: - raise + except Exception as exc: + warnings.warn( + f"gcp.vertex.endpoint.idle: monitoring client creation failed " + f"({type(exc).__name__}: {exc}) -- all endpoints will be skipped (no fallback)", + UserWarning, + stacklevel=2, + ) + return [] - # ----------------------------------------------------------------------- - # Phase 1: collect eligible endpoints, grouped by location for batching - # ----------------------------------------------------------------------- + # ------------------------------------------------------------------------- + # Phase 1: 
pre-check each endpoint; group eligible ones by location + # ------------------------------------------------------------------------- eligible_by_location: Dict[str, List[dict]] = defaultdict(list) for endpoint in endpoints: - endpoint_name = endpoint.get("name", "") - display_name = endpoint.get("displayName", "") + name = (endpoint.get("name") or "").strip() + if not name: + continue - # Extract location and numeric ID from resource name: - # projects/{proj}/locations/{loc}/endpoints/{id} - parts = endpoint_name.split("/") - location = parts[3] if len(parts) > 3 else "" - endpoint_id = parts[-1] if parts else "" + endpoint_id = _parse_endpoint_id(name) + if not endpoint_id: + continue - if region_filter and location != region_filter: + location = _parse_location(name) + if not location: continue - # Aggregate dedicated resources across all deployed models - total_min_replicas, machine_type, accel_type, accel_count, is_gpu = _parse_deployed_models( - endpoint.get("deployedModels", []) - ) + if region_filter and location != region_filter: + continue - # Skip endpoints with no always-on dedicated capacity — automaticResources - # scale to zero and incur no idle compute cost - if total_min_replicas == 0: + # Parse endpoint createTime (spec 7) + endpoint_create_time = _parse_rfc3339_utc(endpoint.get("createTime") or "") + if endpoint_create_time is None or endpoint_create_time > now: continue - # Age calculation — use endpoint createTime, not deployed model createTime - age_days: Optional[int] = None - create_time_str = endpoint.get("createTime", "") - if create_time_str: - try: - created_at = datetime.fromisoformat(create_time_str.replace("Z", "+00:00")) - if created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - age_days = (now - created_at).days - # Skip endpoints younger than half the idle threshold — - # too new to reliably classify as abandoned - if age_days < max(_DAYS_IDLE // 2, 7): - continue - except ValueError: - pass + # 
Classify deployed models (spec 3.3, 3.4, 7) + deployed_models = endpoint.get("deployedModels") or [] + mc = _classify_deployed_models(deployed_models, now) + + if mc["skip"]: + continue # malformed minReplicaCount - # Effective window: cap to age so we don't look back before the endpoint existed - effective_window = min(_DAYS_IDLE, age_days) if age_days is not None else _DAYS_IDLE + if mc["provisioned_floor"] < 1: + continue # no always-deployed serving floor (covers shared_only too) - # Accurate cost: sum per deployed model (handles mixed machine types correctly) - deployed_models = endpoint.get("deployedModels", []) - monthly_cost = _compute_multi_model_cost(deployed_models) - num_dedicated_models = sum(1 for m in deployed_models if m.get("dedicatedResources")) + if mc["capacity_floor_start"] is None: + continue # in-scope deployed model createTime unusable (spec 7) + + # capacity_floor_start = max(endpoint createTime, in-scope model createTimes) (spec 7) + capacity_floor_start = max(endpoint_create_time, mc["capacity_floor_start"]) + + if capacity_floor_start > window_start: + continue # full observation window not coverable (spec 9) eligible_by_location[location].append( { - "endpoint_name": endpoint_name, + "name": name, "endpoint_id": endpoint_id, - "display_name": display_name, "location": location, - "total_min_replicas": total_min_replicas, - "machine_type": machine_type, - "accel_type": accel_type, - "accel_count": accel_count, - "is_gpu": is_gpu, - "age_days": age_days, - "effective_window": effective_window, - "monthly_cost": monthly_cost, - "num_dedicated_models": num_dedicated_models, + "display_name": (endpoint.get("displayName") or "").strip(), + "endpoint_create_time": endpoint_create_time, + "capacity_floor_start": capacity_floor_start, + "provisioned_floor": mc["provisioned_floor"], + "has_accelerator": mc["has_accelerator"], + "resource_modes": mc["resource_modes"], + "in_scope_count": mc["in_scope_count"], } ) - # 
----------------------------------------------------------------------- - # Phase 2: batch monitoring query per location, then build findings - # ----------------------------------------------------------------------- - if monitoring_client is None: - return findings # conservative: skip all if monitoring unavailable + # ------------------------------------------------------------------------- + # Phase 2: batch monitoring query per location; evaluate; build findings + # ------------------------------------------------------------------------- + findings: List[Finding] = [] for location, ep_list in eligible_by_location.items(): - # One monitoring call covers all endpoints in this location. - # Pass eligible IDs so results from stale/unrelated series are ignored. eligible_ids = {ep["endpoint_id"] for ep in ep_list} - result = _get_prediction_counts_batch( - monitoring_client, project_id, location, _DAYS_IDLE, eligible_ids + telemetry_data = _query_location_request_counts( + monitoring_client, + project_id, + location, + window_start, + window_end, + eligible_ids, ) - for ep_info in ep_list: - if result is None: - # Monitoring error for this location — assume active (conservative) - continue - - counts, recently_active_ids = result - - # Suppress noise from very cheap endpoints early — avoids wasting - # confidence/signal computation on findings below the noise floor. - if ep_info["monthly_cost"] < _MIN_MONTHLY_COST_USD: - continue - - endpoint_id_key = ep_info["endpoint_id"] - is_recently_active = endpoint_id_key in recently_active_ids - # Endpoints in recently_active_ids have monitoring data — they were detected - # as recently active and excluded from counts intentionally, not because - # metrics are missing. - no_monitoring_data = endpoint_id_key not in counts and not is_recently_active - count = counts.get(endpoint_id_key, 0) - - # GPU endpoints are flagged more aggressively — higher cost warrants it. 
- # Sqrt scaling by replica count prevents large deployments from hiding - # inefficiency behind a linearly growing bar. - is_gpu = ep_info["is_gpu"] - total_min_replicas_ep = ep_info["total_min_replicas"] - # Recency dominates: endpoint with traffic in the last 24h is considered active - # regardless of how low the 14-day total looks. - if is_recently_active: - continue - - base_threshold = _LOW_TRAFFIC_THRESHOLD_GPU if is_gpu else _LOW_TRAFFIC_THRESHOLD - # Sqrt scaling is sublinear so large deployments can't hide inefficiency. - # Cap at 50 to keep the threshold intuitive — 100-node endpoints still only - # need 50+ requests to be considered "active" at the detection level. - effective_threshold = min( - 50, - max(1, int(base_threshold * max(1.0, total_min_replicas_ep**0.5))), + if telemetry_data is None: + # Query failure -- skip all endpoints in this location (spec 11.2) + warnings.warn( + f"gcp.vertex.endpoint.idle: monitoring query failed for location " + f"'{location}' -- all endpoints in this location will be skipped", + UserWarning, + stacklevel=2, ) - if count >= effective_threshold: - continue # genuinely active endpoint - - is_near_idle = count > 0 - age_days = ep_info["age_days"] - effective_window = ep_info["effective_window"] - - # Safety guard: missing monitoring data ≠ zero traffic. - # Monitoring can be absent due to metric delays, permission gaps, or - # misconfiguration. Require 2× the idle window before trusting absence - # as evidence of idleness — CleanCloud biases toward false negatives - # over false positives to maintain enterprise trust. - if no_monitoring_data and (age_days is None or age_days < _DAYS_IDLE * 2): - continue - - # Cron/batch pattern protection: counts at or below 20% of the capacity-adjusted - # threshold suggest periodic (e.g. weekly) usage rather than abandonment. - # Scales with effective_threshold so large deployments are handled consistently. 
- is_low_frequency_use = is_near_idle and count <= max(2, int(effective_threshold * 0.2)) - - # Confidence based on traffic and age. - # HIGH requires: zero traffic, full observation window, established age. - if is_low_frequency_use: - # Very low count — could be weekly/cron job; cap at MEDIUM - confidence = ConfidenceLevel.MEDIUM - elif is_near_idle: - # Some traffic exists (> 2 requests) — cap at MEDIUM regardless of age - confidence = ConfidenceLevel.MEDIUM - elif age_days is not None and age_days >= _DAYS_IDLE and effective_window == _DAYS_IDLE: - confidence = ConfidenceLevel.HIGH - elif age_days is None or age_days >= int(_DAYS_IDLE * 0.75): - confidence = ConfidenceLevel.MEDIUM - else: - continue # too borderline - - endpoint_name = ep_info["endpoint_name"] - endpoint_id = ep_info["endpoint_id"] - display_name = ep_info["display_name"] - total_min_replicas = total_min_replicas_ep - machine_type = ep_info["machine_type"] - accel_type = ep_info["accel_type"] - accel_count = ep_info["accel_count"] - monthly_cost = ep_info["monthly_cost"] - num_dedicated_models = ep_info["num_dedicated_models"] - - risk = RiskLevel.HIGH if is_gpu else RiskLevel.MEDIUM - - # Waste score: full cost when count=0; scales down as traffic approaches threshold. - # Useful for future sorting / prioritization — not yet exposed in UI. 
- waste_fraction = ( - 1.0 - min(count / effective_threshold, 1.0) if effective_threshold > 0 else 1.0 - ) - waste_score = round(monthly_cost * waste_fraction, 2) + continue - is_experiment_pattern = num_dedicated_models > 1 + for ep_info in ep_list: + ep_id = ep_info["endpoint_id"] + points = telemetry_data.get(ep_id, []) - # Context-aware action recommendations - recommendations: List[str] = [] - if total_min_replicas > 1: - recommendations.append( - "Reduce minReplicaCount to 1 if high availability is not required" - ) - recommendations.append( - "Switch to automaticResources (minReplicaCount=0) to eliminate idle compute cost " - "if the workload is not latency-critical — scales to zero when idle" - ) - if is_experiment_pattern: - recommendations.append( - "Consolidate deployed models or delete unused A/B test deployments" - ) - recommendations.append( - f"Delete endpoint if no longer needed: " - f"gcloud ai endpoints delete {endpoint_id} " - f"--region={location} --project=PROJECT_ID" + coverage_state, telemetry_state, max_rate = _evaluate_endpoint_telemetry( + points, window_start, window_end ) - if is_near_idle: - title = ( - f"Near-Idle Vertex AI Endpoint " - f"({count} Prediction{'s' if count != 1 else ''} in {effective_window} Days)" - ) - traffic_signal = ( - f"{count} prediction request(s) in {effective_window} days — " - f"near-idle (capacity-adjusted threshold: {effective_threshold} requests" - f"{', GPU-adjusted' if is_gpu else ''})" - ) - else: - title = f"Idle Vertex AI Endpoint (No Predictions for {effective_window} Days)" - traffic_signal = ( - f"Zero prediction requests for {effective_window} days " - "(Cloud Monitoring: aiplatform.googleapis.com/prediction/online/request_count)" - ) + if coverage_state != "complete": + continue # telemetry not sufficiently observed (spec 8.3, 9) - # Pricing is region-dependent — always flag estimate as approximate - cost_note = f"~${monthly_cost:,.0f}/month (us-central1 baseline)" - gpu_prefix = "GPU-backed 
endpoint — " if is_gpu else "" + if max_rate > 0: + continue # observed prediction requests (spec 9) - if is_near_idle: - summary = ( - f"{gpu_prefix}Vertex AI endpoint '{display_name or endpoint_id}' in '{location}' " - f"had only {count} prediction request(s) in {effective_window} days but keeps " - f"{total_min_replicas} dedicated node(s) running continuously, " - f"incurring an estimated {cost_note} in compute charges." - ) - else: - summary = ( - f"{gpu_prefix}Vertex AI endpoint '{display_name or endpoint_id}' in '{location}' " - f"has received zero predictions for {effective_window} days but keeps " - f"{total_min_replicas} dedicated node(s) running continuously, " - f"incurring an estimated {cost_note} in compute charges." - ) + # All conditions satisfied -- build finding + name = ep_info["name"] + endpoint_id = ep_info["endpoint_id"] + display_name = ep_info["display_name"] + provisioned_floor = ep_info["provisioned_floor"] + has_accelerator = ep_info["has_accelerator"] + resource_modes = ep_info["resource_modes"] + in_scope_count = ep_info["in_scope_count"] + capacity_floor_start = ep_info["capacity_floor_start"] + endpoint_create_time = ep_info["endpoint_create_time"] + + # Confidence always HIGH: emits only on full-window zero request-count + # telemetry with no heuristic fallback (spec 10.2) + confidence = ConfidenceLevel.HIGH + # Risk HIGH if any in-scope dedicated model has nonzero accelerator (spec 10.2) + risk = RiskLevel.HIGH if has_accelerator else RiskLevel.MEDIUM + + node_display = display_name or endpoint_id - requests_per_replica = count / max(total_min_replicas, 1) signals = [ - traffic_signal, - f"Dedicated capacity configured: minReplicaCount={total_min_replicas} " - "(always-on compute — billed continuously regardless of traffic)", - f"Requests per replica: {requests_per_replica:.2f} over {effective_window} days" - + ( - " — effectively unused" - if requests_per_replica < 0.1 - else (" — extremely low utilization" if requests_per_replica < 
1.0 else "") + f"Location: {location}", + f"Endpoint createTime: {endpoint_create_time.isoformat()}", + ( + f"Capacity floor start (max of endpoint createTime and in-scope " + f"deployed model createTimes): {capacity_floor_start.isoformat()}" + ), + ( + f"Observation window: {window_start.isoformat()} to " + f"{window_end.isoformat()} ({idle_days}d)" + ), + ( + f"Provisioned serving floor: {provisioned_floor} total min replica(s) " + f"across {in_scope_count} in-scope deployed model(s)" + ), + f"Resource modes present on endpoint: {resource_modes}", + ( + f"Request metric: {_REQUEST_METRIC_TYPE} " + f"(resource: {_REQUEST_METRIC_RESOURCE_TYPE})" + ), + ( + f"Max observed request-count value over full window: {max_rate} " + f"(telemetry_coverage_state: {coverage_state})" + ), + ( + "Endpoint-scoped request-count telemetry showed no datapoint above 0 " + f"over the full {idle_days}d observation window" ), ] - if no_monitoring_data and not is_near_idle: - signals.append( - "No prediction request data found in Cloud Monitoring — " - "may indicate metrics are not enabled; classification less reliable. " - "Verify roles/monitoring.viewer and metrics ingestion before acting." 
- ) - if age_days is not None: - signals.append(f"Endpoint age: {age_days} days") - if machine_type: - signals.append(f"Machine type: {machine_type}") - if accel_type and accel_type != "ACCELERATOR_TYPE_UNSPECIFIED": - signals.append(f"Accelerator: {accel_type} × {accel_count}") - if is_gpu: - signals.append(f"GPU-backed endpoint — high continuous cost ({cost_note})") - if num_dedicated_models > 1: + if has_accelerator: signals.append( - f"{num_dedicated_models} deployed models with low aggregate traffic " - "— possible abandoned A/B test or failed experiment" + "Accelerator-backed in-scope dedicated model detected -- " + "risk is HIGH (nonzero accelerator count and recognized type)" ) - if total_min_replicas > 1: - signals.append( - f"{total_min_replicas} replicas configured — stronger waste signal " - "than single warm-endpoint pattern" - ) - signals.append( - f"Traffic threshold scaled sublinearly with replica count " - f"(sqrt({total_min_replicas}) × {base_threshold} = {effective_threshold} requests) " - "— prevents large deployments from masking inefficiency behind a linearly growing bar" - ) - if display_name and display_name != endpoint_id: - signals.append(f"Display name: {display_name}") - - evidence = Evidence( - signals_used=signals, - signals_not_checked=[ - "Scheduled or batch prediction requests outside the observation window", - "Internal health-check or canary traffic not tracked by Cloud Monitoring", - "Planned future usage or upcoming model promotion", - "Shadow mode or A/B test routing with low traffic share", - "Endpoints kept warm for latency-sensitive production traffic", - ], - time_window=f"{effective_window} days", - ) + + not_checked = [ + "Explain-only, health-check, or non-prediction endpoint usage", + "Shared DeploymentResourcePool cost attributable to this endpoint", + "Scheduled or batch traffic outside the observation window", + "Planned future usage or upcoming model promotion", + "Endpoints intentionally kept warm for 
latency-sensitive production traffic", + ] findings.append( Finding( provider="gcp", rule_id="gcp.vertex.endpoint.idle", resource_type="gcp.vertex.endpoint", - resource_id=endpoint_name, + resource_id=name, region=location, - estimated_monthly_cost_usd=monthly_cost, - title=title, - summary=summary, + title=( + f"Idle Vertex AI Endpoint " + f"({provisioned_floor} replica(s) always on, zero requests)" + ), + summary=( + f"Vertex AI endpoint '{node_display}' in '{location}' has " + f"{provisioned_floor} always-deployed replica(s) but " + f"endpoint-scoped request-count telemetry showed no prediction " + f"activity over the full {idle_days}d observation window." + ), reason=( - f"Vertex AI endpoint has {count} prediction(s) in {effective_window} days " - f"with dedicated capacity (minReplicaCount={total_min_replicas})" + f"Endpoint has provisioned serving floor of {provisioned_floor} " + f"replica(s); endpoint-scoped request-count telemetry " + f"(coverage: complete) shows max observed rate == 0 over " + f"{idle_days}d window" ), risk=risk, confidence=confidence, detected_at=now, - evidence=evidence, + evidence=Evidence( + signals_used=signals, + signals_not_checked=not_checked, + time_window=f"{idle_days}d", + ), + estimated_monthly_cost_usd=None, # spec 6.4: pricing varies; no flat estimate details={ "endpoint_id": endpoint_id, - "display_name": display_name, + "display_name": display_name or None, "location": location, - "machine_type": machine_type, - "accelerator_type": accel_type, - "accelerator_count": accel_count, - "is_gpu": is_gpu, - "min_replica_count": total_min_replicas, - "age_days": age_days if age_days is not None else "unknown", - "idle_window_days": effective_window, - "idle_days_threshold": _DAYS_IDLE, - "request_count": count, - "effective_threshold": effective_threshold, - "threshold_strategy": "sqrt_replica_scaling", - "no_monitoring_data": no_monitoring_data, - "waste_score": waste_score, - "requests_per_replica": round(requests_per_replica, 4), 
- "pattern": ("abandoned_experiment" if is_experiment_pattern else None), - "cost_confidence": "estimate", - "cost_basis": "us-central1 baseline estimate", - "cost_variance": ( - "Estimated based on us-central1 on-demand pricing; " - "varies by region and discounts." - ), - "estimated_monthly_cost": f"~${monthly_cost:,.0f}/month", - "recommendations": recommendations, + "provisioned_serving_floor": provisioned_floor, + "in_scope_model_count": in_scope_count, + "resource_modes": resource_modes, + "has_accelerator": has_accelerator, + "capacity_floor_start": capacity_floor_start.isoformat(), + "idle_days_threshold": idle_days, + "max_observed_request_rate_per_replica": max_rate, + "telemetry_coverage_state": coverage_state, + "telemetry_state": telemetry_state, }, ) ) @@ -521,7 +735,7 @@ def _list_endpoints(session: AuthorizedSession, project_id: str) -> list: """ List all Vertex AI Online Prediction endpoints across all locations. - Attempts the locations/- wildcard (AIP-131) first — a single paginated call + Attempts the locations/- wildcard (AIP-131) first -- a single paginated call covering every region. Falls back to querying each known location individually when the wildcard returns 400 (some projects only support specific locations such as 'global'). 
@@ -531,7 +745,6 @@ def _list_endpoints(session: AuthorizedSession, project_id: str) -> list: base_url = f"https://aiplatform.googleapis.com/v1/projects/{project_id}/locations" def _paginate(url: str) -> list: - """Paginate a single location URL and return all endpoints.""" results = [] params: dict = {"pageSize": 100} while True: @@ -541,9 +754,9 @@ def _paginate(url: str) -> list: "aiplatform.endpoints.list permission required (roles/aiplatform.viewer)" ) if resp.status_code == 404: - return [] # Vertex AI API not enabled for this project + return [] if resp.status_code == 400: - return None # signal to caller to try fallback + return None # signal fallback to per-location queries resp.raise_for_status() data = resp.json() results.extend(data.get("endpoints", [])) @@ -553,223 +766,19 @@ def _paginate(url: str) -> list: params["pageToken"] = next_token return results - # Fast path: wildcard covers all regions in one call sequence result = _paginate(f"{base_url}/-/endpoints") if result is not None: return result - # Fallback: wildcard not supported (e.g. project only has 'global' endpoints). - # Query each known location; skip 400s for unsupported regions. all_endpoints = [] seen_names: set = set() for location in _VERTEX_LOCATIONS: loc_result = _paginate(f"{base_url}/{location}/endpoints") if loc_result is None: - continue # 400 = unsupported location, skip + continue for ep in loc_result: - name = ep.get("name", "") - if name and name not in seen_names: - seen_names.add(name) + ep_name = ep.get("name", "") + if ep_name and ep_name not in seen_names: + seen_names.add(ep_name) all_endpoints.append(ep) return all_endpoints - - -def _parse_deployed_models( - deployed_models: list, -) -> Tuple[int, Optional[str], Optional[str], int, bool]: - """ - Aggregate dedicated resources across all deployed models on an endpoint. - - Only models with dedicatedResources are counted — automaticResources scale - to zero and do not incur idle compute cost. 
- - Returns (total_min_replicas, machine_type, accel_type, accel_count, is_gpu). - machine_type / accel_type are taken from the first dedicated model found - (used for display/reporting; cost is computed separately by - _compute_multi_model_cost for accuracy). - """ - total_min_replicas = 0 - machine_type: Optional[str] = None - accel_type: Optional[str] = None - accel_count = 0 - is_gpu = False - - for model in deployed_models: - dr = model.get("dedicatedResources") - if not dr: - continue # automaticResources / sharedResources — scales to zero - - min_replicas = dr.get("minReplicaCount", 0) or 0 - total_min_replicas += min_replicas - - spec = dr.get("machineSpec", {}) - if machine_type is None: - machine_type = spec.get("machineType") - - at = spec.get("acceleratorType", "ACCELERATOR_TYPE_UNSPECIFIED") - ac = int(spec.get("acceleratorCount", 0) or 0) - - if at and at != "ACCELERATOR_TYPE_UNSPECIFIED": - if accel_type is None: - accel_type = at - accel_count = ac - if at in _GPU_ACCELERATORS: - is_gpu = True - - return total_min_replicas, machine_type, accel_type, accel_count, is_gpu - - -def _compute_multi_model_cost(deployed_models: list) -> float: - """ - Compute total monthly cost by summing cost per deployed model accurately. - - Unlike the single-model estimate, this handles endpoints with multiple deployed - models of different machine types — each model's replicas are costed at their - own machine/GPU rate and summed. 
- """ - total = 0.0 - for model in deployed_models: - dr = model.get("dedicatedResources") - if not dr: - continue - min_replicas = dr.get("minReplicaCount", 0) or 0 - if min_replicas == 0: - continue - spec = dr.get("machineSpec", {}) - machine_type = spec.get("machineType") - at = spec.get("acceleratorType", "ACCELERATOR_TYPE_UNSPECIFIED") - ac = int(spec.get("acceleratorCount", 0) or 0) - accel_type = at if at and at != "ACCELERATOR_TYPE_UNSPECIFIED" else None - total += _estimate_cost(machine_type, accel_type, ac, min_replicas) - return total - - -def _estimate_cost( - machine_type: Optional[str], - accel_type: Optional[str], - accel_count: int, - min_replicas: int, -) -> float: - """ - Estimate total monthly cost for min_replicas always-on dedicated nodes. - - For a2-* and g2-* machines the GPU cost is already included in the machine price. - For n1-*/n2-* machines with attached GPUs, add the per-GPU cost separately. - """ - machine_cost = _MACHINE_MONTHLY_COST.get(machine_type or "", _DEFAULT_MACHINE_MONTHLY_COST) - - gpu_addon_cost = 0.0 - if accel_type and accel_type in _GPU_MONTHLY_COST_EACH: - # a2-* and g2-* bundle GPU cost — don't double-count - is_gpu_machine = (machine_type or "").startswith(("a2-", "g2-")) - if not is_gpu_machine: - gpu_addon_cost = _GPU_MONTHLY_COST_EACH[accel_type] * max(accel_count, 1) - - return (machine_cost + gpu_addon_cost) * min_replicas - - -def _get_prediction_counts_batch( - monitoring_client: monitoring_v3.MetricServiceClient, - project_id: str, - location: str, - days: int, - eligible_endpoint_ids: Optional[set] = None, -) -> Optional[Tuple[Dict[str, int], set]]: - """ - Batch query prediction counts for all Vertex AI endpoints in a location. - - Issues a single Cloud Monitoring call for the entire location (filtered by - metric type and location label) rather than one call per endpoint. 
- - eligible_endpoint_ids: if provided, series for endpoint IDs not in this set - are ignored — guards against stale or misattributed series from Cloud Monitoring. - - Returns (counts, recently_active_ids): - - counts: {endpoint_id: total_request_count} — endpoints with no data points - are absent from the dict (caller treats absence as "no monitoring data"). - - recently_active_ids: set of endpoint_ids that had traffic in the last 24 hours - — kept separate from counts to preserve clean signal semantics. - - Returns None on any error; callers should skip all endpoints in the location - (conservative fallback). - """ - try: - now = datetime.now(timezone.utc) - start = now - timedelta(days=max(days, 1)) - - end_ts = timestamp_pb2.Timestamp() - end_ts.FromDatetime(now) - start_ts = timestamp_pb2.Timestamp() - start_ts.FromDatetime(start) - - interval = monitoring_v3.TimeInterval(start_time=start_ts, end_time=end_ts) - - # ALIGN_SUM over the full window collapses all data points into one per - # series, preventing double-counting from overlapping metric intervals. 
- aggregation = monitoring_v3.Aggregation( - alignment_period=duration_pb2.Duration(seconds=max(days, 1) * 86400), - per_series_aligner=monitoring_v3.Aggregation.Aligner.ALIGN_SUM, - ) - - results = monitoring_client.list_time_series( - request={ - "name": f"projects/{project_id}", - "filter": ( - 'metric.type="aiplatform.googleapis.com/prediction/online/request_count"' - f' AND resource.labels.location="{location}"' - ), - "interval": interval, - "aggregation": aggregation, - "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL, - } - ) - - now_ts = datetime.now(timezone.utc) - counts: Dict[str, int] = {} - recently_active_ids: set = set() - for series in results: - ep_id = series.resource.labels.get("endpoint_id", "") - if not ep_id: - continue - # Ignore series for endpoints not in our eligible set — guards against - # stale metrics or partial aggregation from unrelated endpoints - if eligible_endpoint_ids is not None and ep_id not in eligible_endpoint_ids: - continue - if not series.points: - # No data points — treat as absent (caller handles no_monitoring_data) - continue - # Sanity guard: ALIGN_SUM should yield ≤1 point per series; >5 suggests - # unexpected partial windows or aggregation edge cases — skip to be safe - if len(series.points) > 5: - continue - # Recency guard: if any traffic landed in the last 24 hours, the endpoint - # is recently active — tracked separately to keep count semantics clean - try: - timestamps = [ - p.interval.end_time.ToDatetime(tzinfo=timezone.utc) - for p in series.points - if p.interval and p.interval.end_time - ] - if timestamps: - latest_ts = max(timestamps) - # Arithmetic raises TypeError if latest_ts is not a real datetime - # (e.g. 
an unexpected protobuf type) — caught below - if now_ts - latest_ts < timedelta(hours=24): - recently_active_ids.add(ep_id) - continue - except (TypeError, AttributeError): - pass # no usable timestamp — fall through to normal count - # ALIGN_SUM over the full window should produce exactly one point per - # series. Sum across points defensively; duplicate/split points are - # accumulated rather than double-counted because each represents a - # distinct aligned window (non-overlapping by GCP guarantee). - series_total = sum( - point.value.int64_value or int(point.value.double_value or 0) - for point in series.points - ) - counts[ep_id] = counts.get(ep_id, 0) + series_total - - return counts, recently_active_ids - - except Exception: - return None # conservative: caller skips all endpoints in this location diff --git a/docs/rules/gcp.md b/docs/rules/gcp.md index be5dd89..2e0ae3d 100644 --- a/docs/rules/gcp.md +++ b/docs/rules/gcp.md @@ -11,11 +11,11 @@ | `gcp.compute.snapshot.old` | Storage | Disk snapshots older than 90 days | | `gcp.compute.ip.unused` | Network | Reserved static IPs in RESERVED state | | `gcp.sql.instance.idle` | Platform | Cloud SQL instances with zero connections 14+ days | -| `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI endpoints with dedicated capacity and zero predictions 14+ days | +| `gcp.vertex.endpoint.idle` | AI/ML | Vertex AI endpoints with an always-deployed serving floor and zero observed request activity 14+ days | | `gcp.vertex.workbench.idle` | AI/ML | Vertex AI Workbench instances with no activity 14+ days | | `gcp.vertex.training_job.long_running` | AI/ML | Vertex AI jobs running beyond threshold | -| `gcp.tpu.idle` | AI/ML | Cloud TPU nodes with near-zero utilization 7+ days | -| `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Stores with zero serving requests 30+ days | +| `gcp.tpu.idle` | AI/ML | Standalone Cloud TPU nodes in READY state with monitoring-based idle detection; currently no findings emit until 
worker-to-node join is documented | +| `gcp.vertex.featurestore.idle` | AI/ML | Vertex AI Feature Stores (legacy) and Bigtable-backed Feature Online Stores with zero serving requests 30+ days (Monitoring-confirmed only) | --- @@ -151,17 +151,34 @@ ## AI/ML *(opt-in: `--category ai`)* #### `gcp.vertex.endpoint.idle` -**Detects:** Vertex AI Online Prediction endpoints with `dedicatedResources` and zero predictions for `idle_days` +**Detects:** Vertex AI Online Prediction endpoints with an always-deployed serving floor (`dedicatedResources.minReplicaCount >= 1` or `automaticResources.minReplicaCount >= 1`) and no usable endpoint-scoped request-count datapoint above `0` across the full `idle_days` observation window, confirmed by Cloud Monitoring telemetry with proven gap-free coverage -**Confidence / Risk:** HIGH (zero predictions confirmed + age ≥ `idle_days`); MEDIUM (zero predictions, age ≥ 75% of threshold or age unknown) / HIGH (GPU-backed: T4, V100, A100, L4, H100, TPU); MEDIUM (CPU-only) +**Confidence / Risk:** HIGH (sole emit path: full-window zero request-count telemetry with no heuristic fallback; no MEDIUM tier) / HIGH (any in-scope dedicated model with nonzero accelerator count and recognized GPU/TPU type); MEDIUM (CPU-only or automatic-resources-only endpoints) + +**Cost:** `estimated_monthly_cost_usd = None` -- pricing varies by machine type, accelerator, region, and usage option; no flat estimate is appropriate **Permissions:** `aiplatform.endpoints.list` (roles/aiplatform.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer) **Params:** `idle_days` (default: 14) -**Exclusions:** endpoints using `automaticResources` (scale-to-zero); only `dedicatedResources` with `minReplicaCount > 0` - -**Spec:** — +**Exclusions:** +- endpoint name or location malformed or absent +- location filter set and location does not exactly match +- endpoint `createTime` absent, unparsable, or future +- no in-scope deployed models; `provisioned_serving_floor < 1` +- 
shared-resource-only endpoint (`sharedResources` only; spec 11.4) +- any in-scope deployed model `createTime` absent, unparsable, or future +- `capacity_floor_start > evaluation_window_start` (full window not coverable) +- malformed `minReplicaCount` or unrecognized prediction-resource union on any deployed model +- monitoring client creation failure -- all endpoints skip; no fallback +- monitoring query failure for a location -- all endpoints in that location skip +- telemetry coverage unresolved: no series, leading gap > `idle_days * 86400s / 2`, any interior gap > `idle_days * 86400s / 2`, or trailing gap > `idle_days * 86400s / 2` +- any usable request-count datapoint > `0` in the observation window +- `dedicatedResources.minReplicaCount == 0` (scale-to-zero preview; no always-deployed floor) +- `automaticResources.minReplicaCount == 0` (scale-to-zero; no always-deployed floor) +- near-idle, low-traffic, age-only, trafficSplit, or missing-telemetry-as-zero fallbacks are explicitly forbidden + +**Spec:** [docs/specs/gcp/ai/vertex_endpoint_idle.md](../specs/gcp/ai/vertex_endpoint_idle.md) #### `gcp.vertex.workbench.idle` **Detects:** Vertex AI Workbench instances `ACTIVE` with no control-plane activity (`updateTime`) for `idle_days` @@ -190,27 +207,55 @@ **Spec:** — #### `gcp.tpu.idle` -**Detects:** Cloud TPU nodes in `READY` state with max `duty_cycle ≤ 2%` across all workers for `idle_days` +**Detects:** Standalone Cloud TPU nodes in exact `READY` state where complete worker-joined duty-cycle telemetry (`tpu.googleapis.com/accelerator/duty_cycle` on `tpu.googleapis.com/GceTpuWorker`) confirms max observed duty cycle <= 2% across all joined workers and accelerators over the full buffered `idle_days` window; monitoring is required — no age-only, partial-join, or cadence-assumed fallback + +**Confidence / Risk:** HIGH / HIGH (when emitting — requires monitoring-confirmed complete join; no tiered fallback) + +**Current emission status:** No findings are emitted. 
The `GceTpuWorker` monitored resource labels (`resource_container`, `location`, `worker_id`) do not include a TPU Node name. No documented first-party Google Cloud surface maps `worker_id` to the owning TPU Node, so `telemetry_join_state` cannot be proven `complete` (spec 8.3). Emission requires `telemetry_join_state == complete` (spec 9, condition 7). The monitoring query is issued per zone to surface permission errors. When Google publishes a documented worker-to-node identity surface, implement the join in `_run_zone_diagnostic`. -**Confidence / Risk:** HIGH (Cloud Monitoring confirms near-zero duty cycle); LOW (Monitoring unavailable — age-only heuristic) / CRITICAL (HIGH confidence + hourly cost ≥ $10/hr); HIGH (HIGH confidence + < $10/hr); MEDIUM (LOW confidence) +**Cost:** `estimated_monthly_cost_usd = None` — pricing varies by TPU type, region, and usage option; no flat estimate is appropriate -**Permissions:** `tpu.nodes.list` (roles/tpu.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer, optional — falls back to age-based) +**Permissions:** `tpu.nodes.list` (roles/tpu.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer) **Params:** `idle_days` (default: 7) -**Exclusions:** nodes not in `READY` state; nodes younger than `idle_days` +**Exclusions (pre-checks applied before monitoring):** +- node name malformed, node ID or zone absent/unresolvable +- region filter set and derived region does not exactly match +- state not exactly `READY` +- `createTime` absent, unparsable, future, or node younger than full buffered window (`now - 180s - idle_days * 86400s`) +- `queuedResource` non-empty string (queued-resource-managed node) +- `multisliceNode == true` (multislice node) +- malformed `queuedResource` (non-string/non-null) or `multisliceNode` (non-bool/non-null) +- monitoring client creation failure (all nodes skip — no age-only fallback) +- monitoring query failure for a node (that node skips, warning issued) +- `telemetry_join_state` 
not `complete` — currently always the case (see above) -**Spec:** — +**Spec:** [docs/specs/gcp/ai/tpu_idle.md](../specs/gcp/ai/tpu_idle.md) #### `gcp.vertex.featurestore.idle` -**Detects:** Vertex AI Feature Stores (legacy and new-gen) with zero `online_serving/request_count` for `idle_days`; Bigtable-backed stores bill ~$197/node/month regardless of utilization +**Detects:** Vertex AI Feature Stores (legacy) and Bigtable-backed Feature Online Stores with provisioned online-serving capacity and zero `online_serving/request_count` confirmed by Cloud Monitoring for `idle_days`; no age-only or monitoring-absent fallback -**Confidence / Risk:** HIGH (Cloud Monitoring confirms zero requests); LOW (Monitoring unavailable — age-only) / HIGH (HIGH confidence); MEDIUM (LOW confidence) +**Confidence / Risk:** HIGH (Cloud Monitoring confirms zero requests for full aligned window) / HIGH -**Permissions:** `aiplatform.featurestores.list`, `aiplatform.featureOnlineStores.list` (roles/aiplatform.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer, optional) +**Cost:** `estimated_monthly_cost_usd = None` — pricing varies by backing, region, node count, and commitment model; no flat estimate is appropriate -**Params:** `idle_days` (default: 30) +**Permissions:** `aiplatform.featurestores.list`, `aiplatform.featureOnlineStores.list` (roles/aiplatform.viewer), `monitoring.timeSeries.list` (roles/monitoring.viewer) -**Exclusions:** legacy featurestores with `fixedNodeCount == 0` and `scaling.minNodeCount == 0`; stores not in `STABLE` state +**Params:** `idle_days` (default: 30) -**Spec:** — +**Exclusions:** +- resource name malformed or store ID / region absent +- region filter set and region does not exactly match +- state not exactly `STABLE` +- `reference_time` (`max(createTime, updateTime)`) absent, unparsable, or in the future +- store younger than full `idle_days` observation window +- legacy: `fixedNodeCount == 0` and `scaling.minNodeCount == 0` (no provisioned 
online-serving capacity) +- legacy: both `fixedNodeCount > 0` and `scaling.minNodeCount > 0` simultaneously — invalid serving mode +- FeatureOnlineStore: storage type not exactly Bigtable (`optimized` stores are out of scope) +- FeatureOnlineStore: `bigtable.autoScaling` absent, or `maxNodeCount < minNodeCount` +- monitoring client unavailable (no age-only fallback) +- metric coverage unresolved — not exactly `idle_days` aligned daily buckets, query failure, future timestamp, or gap > 86 400 s between adjacent points +- aggregate request count > 0 over the full window + +**Spec:** [docs/specs/gcp/ai/featurestore_idle.md](../specs/gcp/ai/featurestore_idle.md) diff --git a/docs/specs/gcp/ai/featurestore_idle.md b/docs/specs/gcp/ai/featurestore_idle.md new file mode 100644 index 0000000..d7e5142 --- /dev/null +++ b/docs/specs/gcp/ai/featurestore_idle.md @@ -0,0 +1,496 @@ +# GCP Rule Spec - `gcp.vertex.featurestore.idle` + +## 1. Rule Identity + +- **Rule ID:** `gcp.vertex.featurestore.idle` +- **Provider:** GCP +- **Covered resource families:** + - Vertex AI Feature Store (Legacy) `Featurestore` + - Vertex AI Feature Store `FeatureOnlineStore` with Bigtable online serving +- **Finding resource_type:** + - `gcp.vertex.featurestore` for legacy `Featurestore` + - `gcp.vertex.feature_online_store` for `FeatureOnlineStore` + +--- + +## 2. Intent + +Detect **Vertex AI feature serving stores with documented, provisioned online-serving capacity** that show **no documented online-serving request-count telemetry** over a conservative review window. + +This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that a store is safe to delete, **not** proof that offline feature workflows are unused, and **not** a license to infer a fixed monthly dollar saving. 
+ +### 2.1 Canonical definitions + +| Term | Definition | +|---|---| +| legacy feature store | `projects.locations.featurestores` resource family | +| feature online store | `projects.locations.featureOnlineStores` resource family | +| provisioned online serving | A documented control-plane configuration that proves persistent online-serving capacity exists: legacy `fixedNodeCount > 0`, legacy `scaling.minNodeCount > 0`, or `FeatureOnlineStore.bigtable.autoScaling.minNodeCount >= 1` | +| reference time | `max(createTime, updateTime)` | +| evaluation window start | inclusive UTC instant `now_utc - idle_days × 86400 seconds` | +| evaluation window end | exclusive UTC instant `now_utc` | +| full observation window | `[evaluation_window_start_utc, evaluation_window_end_utc)`, usable only when `reference_time_utc <= evaluation_window_start_utc` | +| daily aligned bucket | bucket `n` covers `[evaluation_window_start_utc + (n-1) × 86400s, evaluation_window_start_utc + n × 86400s)` for `n = 1..idle_days` | +| expected aligned bucket count | `idle_days` daily buckets after canonical alignment | + +--- + +## 3. GCP Documentation Grounding + +### 3.1 Vertex AI has distinct current and legacy feature-store families + +Google documents two feature-store offerings: + +1. Vertex AI Feature Store +2. Vertex AI Feature Store (Legacy), which is deprecated + +Google also documents that the newer Vertex AI Feature Store uses BigQuery-backed feature data sources with online serving options, while the legacy product is a separate older resource family. + +Sources: + +- *Introduction to feature management and feature stores* +- *Online serving types* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/featurestore +- https://docs.cloud.google.com/vertex-ai/docs/featurestore/latest/online-serving-types + +Rule consequence: + +1. This rule may cover both legacy `Featurestore` and newer `FeatureOnlineStore`, but they must be evaluated with separate documented contracts. +2. 
The rule must not blur these families into a single control-plane shape. +3. Feature groups, feature views, registries, and offline BigQuery sources are out of scope for this rule. + +### 3.2 Legacy `Featurestore` exposes the documented online-serving configuration + +Google documents the legacy `Featurestore` resource with fields including: + +1. `name` +2. `createTime` +3. `updateTime` +4. `state` +5. `onlineServingConfig.fixedNodeCount` +6. `onlineServingConfig.scaling.minNodeCount` + +Google also documents: + +1. `fixedNodeCount = 0` means the feature store will not have an online store and cannot be used for online serving +2. only one of `fixedNodeCount` and `scaling` can be set +3. `STABLE` means the feature store configuration reflects current state and is usable +4. `UPDATING` means configuration is in progress and fields can reflect either original or updated values + +Source: + +- *REST Resource: projects.locations.featurestores* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.featurestores + +Rule consequence: + +1. Legacy stores are in scope only when online serving is explicitly provisioned. +2. Legacy stores with no online-serving capacity must skip. +3. Legacy stores in `UPDATING` must skip because, although the store can still be usable for serving, the documented configuration can reflect either the original or updated state and is therefore not stable enough for precision-first idle evaluation. + +### 3.3 `FeatureOnlineStore` exposes storage type and Bigtable minimum node floor + +Google documents the `FeatureOnlineStore` resource with fields including: + +1. `name` +2. `createTime` +3. `updateTime` +4. `state` +5. `bigtable.autoScaling.minNodeCount` +6. `optimized` + +Google also documents: + +1. `FeatureOnlineStore` storage type is a union of `bigtable` or `optimized` +2. Bigtable autoscaling requires `minNodeCount >= 1` +3. `STABLE` means the store reflects current configuration and is usable +4. 
`UPDATING` means the store is still usable, but configuration is being changed + +Sources: + +- *REST Resource: projects.locations.featureOnlineStores* +- *Online serving types* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.featureOnlineStores +- https://docs.cloud.google.com/vertex-ai/docs/featurestore/latest/online-serving-types + +Rule consequence: + +1. `FeatureOnlineStore` is in scope only when the storage type is documented Bigtable online serving and `bigtable.autoScaling.minNodeCount >= 1`. +2. `FeatureOnlineStore` resources using `optimized` serving are out of scope for this rule. +3. `UPDATING` stores must skip for the same evaluation-stability reason as legacy stores: the store can still serve traffic, but the configuration is not stable enough for a precision-first idle decision. + +### 3.4 Cloud Monitoring documents the canonical request-count metrics and monitored resources + +Google documents the monitored resources: + +1. `aiplatform.googleapis.com/Featurestore` with resource labels including `location` and `featurestore_id` +2. `aiplatform.googleapis.com/FeatureOnlineStore` with resource labels including `location` and `feature_online_store_id` + +Google also documents the request-count metrics: + +1. `aiplatform.googleapis.com/featurestore/online_serving/request_count` +2. `aiplatform.googleapis.com/featureonlinestore/online_serving/request_count` + +For the legacy metric, Google documents additional metric labels including: + +- `entity_type_id` +- `method` +- `error_code` + +For the `FeatureOnlineStore` metric, Google documents additional metric labels including: + +- `method` +- `feature_view_id` +- `error_code` +- `storage_type` + +Sources: + +- *Google Cloud monitored resource list* +- *Google Cloud metrics list* + +URLs: + +- https://docs.cloud.google.com/monitoring/api/resources +- https://docs.cloud.google.com/monitoring/api/metrics_gcp_a_b + +Rule consequence: + +1. 
These request-count metrics are the **sole canonical telemetry source** for this rule. +2. Telemetry must be evaluated per store ID on the documented monitored resource. +3. Aggregation must sum across all other label combinations for the store, not only a single method, entity type, feature view, or error code. +4. Because these metrics are documented as `DELTA` metrics on the monitored resource, the rule must use explicit alignment and reduction rather than relying on raw sparse series output. + +### 3.5 Legacy Feature Store uses Bigtable online serving and carries product-specific management overhead + +Google documents that Vertex AI Feature Store (Legacy): + +1. uses Bigtable for its online serving layer +2. can often be migrated to direct Bigtable for faster speeds and reduced costs +3. has a Vertex AI Feature Store (Legacy)-specific node-management premium + +Source: + +- *Migrate from Vertex AI Feature Store (Legacy) to Bigtable* + +URL: + +- https://docs.cloud.google.com/bigtable/docs/migrate-vertex-ai-legacy-bigtable + +Rule consequence: + +1. This rule is correctly framed as a review rule for persistent online-serving cost surfaces. +2. The rule must not hardcode a universal hourly or monthly cost estimate for either family. +3. `estimated_monthly_cost_usd` should remain `None` unless a future implementation computes current backend-specific pricing from authoritative pricing inputs. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. a stable resource name and ID are present +2. the resource family resolves to either legacy `Featurestore` or Bigtable-backed `FeatureOnlineStore` +3. the normalized region is parseable from the documented resource name +4. if a region filter is set, it matches the normalized region exactly +5. the normalized state is exactly `STABLE` +6. `reference_time_utc` is present, parseable, and not in the future +7. 
the store is old enough that the full observation window is coverable +8. documented provisioned online-serving capacity is present +9. the canonical request-count metric query succeeds for that resource family +10. usable metric data exists for the store over the full window +11. aggregate request count over the full window is exactly `0` + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that offline feature generation, sync, or BigQuery-based workflows are unused +- that the store is abandoned +- that the store is safe to delete or reconfigure +- that optimized online serving is wasteful when idle +- that a specific monthly saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| Vertex AI `Featurestore` list inventory | enumerate legacy stores and their online-serving configuration | +| Vertex AI `FeatureOnlineStore` list inventory | enumerate newer stores and their storage type / Bigtable autoscaling floor | +| Cloud Monitoring time-series query | determine whether documented online-serving request activity occurred during the full window | + +### 6.2 Permissions + +Minimum permissions: + +- `aiplatform.featurestores.list` +- `aiplatform.featureOnlineStores.list` +- `monitoring.timeSeries.list` + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `30` +- Minimum effective value: `1` + +Reason: + +- Feature-serving systems can legitimately have lower-frequency access than interactive notebooks or online endpoints. +- A 30-day default is conservative enough to reduce false positives for periodic usage. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `resource_family` | Resolve from the exact resource-name collection segment: `featurestores` => legacy `Featurestore`; `featureOnlineStores` => `FeatureOnlineStore`. Any other shape is unusable. 
| +| `resource_name` | Must be a non-empty string in the documented form `projects/{project}/locations/{location}/featurestores/{id}` or `projects/{project}/locations/{location}/featureOnlineStores/{id}`. Malformed names skip. | +| `store_id` | Final resource-name segment. Empty result skips. | +| `region` | Resolve from the exact `locations/{location}` segment in the documented resource name. If unresolved, skip. Compare using exact lowercase string equality. | +| `state` | Compare case-sensitively to exact documented enum value `STABLE`. Any other value, including `UPDATING`, is out of scope. | +| `create_time_utc` | Parse from documented RFC3339 `createTime` into a timezone-aware UTC instant. If present but unparsable, skip. Future timestamps skip. | +| `update_time_utc` | Parse from documented RFC3339 `updateTime` into a timezone-aware UTC instant. If present but unparsable, skip. Future timestamps skip. | +| `reference_time_utc` | `max(create_time_utc, update_time_utc)` when both exist; otherwise whichever documented timestamp exists. If neither exists, skip. If the resolved reference time is in the future, skip. | +| `evaluation_window_start_utc` | Inclusive UTC instant `now_utc - idle_days × 86400 seconds`. | +| `evaluation_window_end_utc` | Exclusive UTC instant `now_utc`. | +| `full_window_coverable` | True only when `reference_time_utc <= evaluation_window_start_utc`. Otherwise skip. | +| `legacy_fixed_node_count` | From `onlineServingConfig.fixedNodeCount`. Must be integer `> 0` to prove fixed online-serving capacity. `0` means no online store. | +| `legacy_scaling_min_node_count` | From `onlineServingConfig.scaling.minNodeCount`. Must be integer `> 0` to prove autoscaled online-serving capacity. 
| +| `legacy_online_serving_mode` | `fixed` when `fixedNodeCount > 0` and `scaling` is absent; `autoscaled` when `fixedNodeCount` is absent or `0` and `scaling.minNodeCount > 0`; `none` when neither proves capacity; `invalid` when both `fixedNodeCount` and `scaling` are materially present or when malformed shapes make the capacity unusable. | +| `feature_online_store_storage_type` | Exactly one union branch must be present: `bigtable` or `optimized`. If both or neither are present, treat as unusable and skip. | +| `bigtable_min_node_count` | From `bigtable.autoScaling.minNodeCount`. `bigtable.autoScaling` must be present and structurally usable for Bigtable-backed stores. The value must be integer `>= 1` to prove provisioned Bigtable serving capacity. | +| `bigtable_max_node_count` | From `bigtable.autoScaling.maxNodeCount`. For Bigtable-backed stores, this must be present and integer `>= bigtable_min_node_count`; otherwise the autoscaling block is unusable and the store must skip. | +| `provisioned_capacity_units` | Legacy: fixed nodes or scaling min nodes. New: Bigtable min nodes. Optimized stores do not produce this field for this rule. | +| `activity_metric_type` | Legacy: `aiplatform.googleapis.com/featurestore/online_serving/request_count`; new Bigtable-backed store: `aiplatform.googleapis.com/featureonlinestore/online_serving/request_count`. | +| `activity_metric_kind` | Must resolve to documented `DELTA`. If the descriptor or returned query surface contradicts this expectation, skip rather than silently reinterpret. | +| `activity_resource_label` | Legacy: `featurestore_id`; new store: `feature_online_store_id`. | +| `monitoring_filter` | Must constrain exact `metric.type`, exact `resource.type`, exact normalized `resource.labels.location`, and exact store ID label for the candidate store. | +| `alignment_period_seconds` | Fixed at `86400`. | +| `aligned_bucket_count_expected` | `idle_days`. 
| +| `request_count_total` | Sum of all aligned daily datapoints for the store over the full observation window after reducing across all additional series labels. | +| `metric_coverage_state` | `full_window`, `partial_window`, or `none`. | +| `telemetry_state` | `confirmed_zero`, `positive_activity`, or `unresolved`. No age-only fallback state is allowed. | + +Normalization requirements: + +1. Empty strings normalize to unusable, not meaningful values. +2. Timestamp parsing must preserve valid RFC3339 fractional seconds. +3. All timestamps used for comparison must be normalized to timezone-aware UTC before comparison. +4. If a field chosen for evaluation is present but unparsable, skip rather than silently falling back to weaker heuristics. + +--- + +## 8. Activity Determination Contract + +Cloud Monitoring request-count telemetry is the **sole trusted telemetry source** for this rule. + +### 8.1 Required metrics + +| Resource family | Metric type | Monitored resource | Resource ID label | +|---|---|---|---| +| Legacy `Featurestore` | `aiplatform.googleapis.com/featurestore/online_serving/request_count` | `aiplatform.googleapis.com/Featurestore` | `featurestore_id` | +| Bigtable-backed `FeatureOnlineStore` | `aiplatform.googleapis.com/featureonlinestore/online_serving/request_count` | `aiplatform.googleapis.com/FeatureOnlineStore` | `feature_online_store_id` | + +### 8.2 Required query shape + +The time-series query must: + +1. specify exactly one `metric.type` +2. specify the exact documented `resource.type` +3. constrain `resource.labels.location` to the candidate store's normalized region by exact equality +4. constrain the family-specific store ID label to the candidate store's exact normalized store ID +5. evaluate the interval `[evaluation_window_start_utc, evaluation_window_end_utc)` +6. request aligned data rather than raw sparse output + +This exact filter scoping is required to prevent cross-store and cross-region metric bleed. 
+ +### 8.3 Alignment and reduction rules + +The query must use the following canonical aggregation: + +1. `alignment_period = 86400s` +2. `per_series_aligner = ALIGN_SUM` +3. `cross_series_reducer = REDUCE_SUM` +4. `group_by_fields = []` (empty — reduce across all remaining label combinations so exactly one per-store series remains) +5. if the metric kind does not resolve to `DELTA`, skip rather than reinterpreting the metric contract + +Reason: + +- the request-count metrics are documented as `DELTA` +- this rule's idle contract is defined in whole UTC days, so the canonical alignment period is one day by design +- multiple raw series can exist for one store because of additional labels such as `entity_type_id`, `method`, `feature_view_id`, `error_code`, and `storage_type` +- idle determination must use the full sum for the store, not any single raw or partially aggregated series + +The rule must **not** restrict evaluation to a single: + +- legacy `entity_type_id` +- serving `method` +- `feature_view_id` +- `error_code` +- `storage_type` + +### 8.4 Coverage requirement + +`usable metric data` means **all** of the following are true: + +1. after the canonical filter and reduction, exactly one reduced time series must remain for the candidate store; if the result count is not exactly `1`, the store is unresolved +2. the reduced series contains exactly `aligned_bucket_count_expected = idle_days` aligned daily datapoints for the full window +3. each aligned datapoint must belong to exactly one documented daily aligned bucket, and no aligned datapoint may extend beyond `evaluation_window_end_utc` +4. no aligned datapoint timestamp is in the future +5. each aligned datapoint must carry at least one valid numeric request value; null, empty, or non-numeric datapoints are unusable +6. the spacing between adjacent aligned datapoints must not exceed one alignment period (`86400` seconds) + +If any of the above fails, `metric_coverage_state` is not `full_window` and the store is unresolved. + +The exact daily-bucket requirement is intentional. 
A missing bucket is treated as missing telemetry coverage, not as proof of zero activity. + +Interpretation: + +1. zero returned time series -> `metric_coverage_state = none` -> `telemetry_state = unresolved` +2. a reduced time-series count greater than `1` -> `metric_coverage_state = partial_window` -> `telemetry_state = unresolved` +3. any aligned datapoint count other than `idle_days` -> `metric_coverage_state = partial_window` -> `telemetry_state = unresolved` +4. exactly `idle_days` aligned datapoints with aggregate request total `> 0` -> `telemetry_state = positive_activity` +5. exactly `idle_days` aligned datapoints with aggregate request total `== 0` -> `metric_coverage_state = full_window` and `telemetry_state = confirmed_zero` +6. any null, empty, non-numeric, or discontinuously spaced aligned bucket -> `metric_coverage_state = partial_window` -> `telemetry_state = unresolved` + +### 8.5 Forbidden fallbacks + +The following must **not** be used to prove idleness: + +- store age alone +- `createTime` alone +- `updateTime` alone +- legacy node count alone +- Bigtable `minNodeCount` alone +- absence of request metrics treated as equivalent to zero traffic +- a single zero datapoint without full-window bucket coverage + +--- + +## 9. Unified Decision Rule + +### 9.1 Legacy `Featurestore` + +Emit only when **all** of the following are true: + +1. resource family is legacy `Featurestore` +2. `state == "STABLE"` +3. region is parseable and matches the optional region filter exactly +4. `reference_time_utc` is valid and the full window is coverable +5. `legacy_online_serving_mode` is `fixed` or `autoscaled` +6. `metric_coverage_state == "full_window"` +7. canonical legacy request-count telemetry is `confirmed_zero` + +### 9.2 `FeatureOnlineStore` + +Emit only when **all** of the following are true: + +1. resource family is `FeatureOnlineStore` +2. `state == "STABLE"` +3. region is parseable and matches the optional region filter exactly +4. 
`reference_time_utc` is valid and the full window is coverable +5. `feature_online_store_storage_type == "bigtable"` +6. `bigtable_min_node_count >= 1` +7. `metric_coverage_state == "full_window"` +8. canonical `FeatureOnlineStore` request-count telemetry is `confirmed_zero` + +### 9.3 Explicit exclusions + +Always skip: + +- legacy stores with no online-serving config +- legacy stores with `fixedNodeCount == 0` and no valid `scaling.minNodeCount` +- legacy stores with `scaling.minNodeCount == 0` +- legacy stores with both materially present `fixedNodeCount` and `scaling` +- legacy stores in `UPDATING` +- `FeatureOnlineStore` resources in `UPDATING` +- `FeatureOnlineStore` resources using `optimized` +- `FeatureOnlineStore` resources with malformed storage union or unusable `bigtable.autoScaling` +- stores younger than the full observation window +- stores with unresolved telemetry + +--- + +## 10. Finding Shape + +### 10.1 Core fields + +| Field | Value | +|---|---| +| `provider` | `gcp` | +| `rule_id` | `gcp.vertex.featurestore.idle` | +| `resource_type` | `gcp.vertex.featurestore` for legacy; `gcp.vertex.feature_online_store` for new | +| `resource_id` | full resource name when available, otherwise normalized store ID | +| `region` | normalized location | +| `detected_at` | rule evaluation time | +| `estimated_monthly_cost_usd` | `None` | + +### 10.2 Confidence / risk + +| Field | Value | +|---|---| +| `confidence` | `HIGH` | +| `risk` | `HIGH` | + +Reason: + +- The rule emits only when documented provisioned online-serving capacity exists and documented request-count telemetry confirms zero observed requests over the full window. + +### 10.3 Required evidence content + +Evidence should include factual signals only, such as: + +1. resource family +2. normalized state +3. normalized region +4. reference time and idle window +5. serving mode / storage type +6. provisioned baseline node floor +7. canonical metric type used +8. 
aggregate request count total of `0` + +Evidence must **not**: + +- claim the store is abandoned +- claim offline data paths are unused +- include a flat cost estimate masquerading as authoritative pricing + +--- + +## 11. Failure Behavior + +### 11.1 Permission failures + +Permission failures on required listing or monitoring surfaces must be surfaced explicitly. They must not be silently converted into heuristic findings. + +### 11.2 Monitoring failures + +If monitoring request-count telemetry for a resource family cannot be queried reliably, findings that depend on that family must not be emitted from age-only or config-only fallback logic. + +### 11.3 Malformed records + +Malformed individual resources should be skipped item-by-item when required identity, location, timestamp, state, or provisioning signals are unusable. + +### 11.4 Partial coverage + +If an implementation can continue after a family-specific inventory or telemetry failure, it must preserve that incompleteness as operational visibility. It must not present the project as fully evaluated for the failed family. + +Partial metric coverage is unresolved, not weak evidence. This includes zero reduced series, more than one reduced series, partial bucket counts, invalid bucket values, and discontinuous bucket spacing. diff --git a/docs/specs/gcp/ai/tpu_idle.md b/docs/specs/gcp/ai/tpu_idle.md new file mode 100644 index 0000000..3257da7 --- /dev/null +++ b/docs/specs/gcp/ai/tpu_idle.md @@ -0,0 +1,526 @@ +# GCP Rule Spec - `gcp.tpu.idle` + +## 1. Rule Identity + +- **Rule ID:** `gcp.tpu.idle` +- **Provider:** GCP +- **Resource type:** Cloud TPU Node +- **Finding resource_type:** `gcp.tpu.node` + +--- + +## 2. Intent + +Detect **standalone Cloud TPU Nodes that are currently in the documented billable `READY` state** and show **no observed accelerator-processing activity above a conservative threshold** over a buffered review window, using documented Cloud Monitoring duty-cycle telemetry. 
+ +This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that a TPU-backed job is abandoned, **not** proof that the node is safe to delete or stop, and **not** proof of a specific monthly dollar saving. + +### 2.1 Canonical definitions + +| Term | Definition | +|---|---| +| standalone TPU node | A TPU Node that is not clearly part of queued-resource or multislice orchestration for this rule: `queuedResource` absent/empty and `multisliceNode != true` | +| billable TPU node | A TPU Node in exact documented `READY` state | +| duty-cycle telemetry | Cloud Monitoring metric `tpu.googleapis.com/accelerator/duty_cycle` on monitored resource `tpu.googleapis.com/GceTpuWorker` | +| duty-cycle percent | Raw metric value in documented percent units with values in the range `[0,100]` | +| idle threshold percent | Product threshold for this rule: `2.0` percent maximum observed duty cycle; this threshold is a CleanCloud review threshold, not a Google-defined idle contract | +| evaluation window end | Inclusive buffered end instant `now_utc - 180 seconds` | +| evaluation window start | `evaluation_window_end_utc - idle_days × 86400 seconds` | +| full observation window | `[evaluation_window_start_utc, evaluation_window_end_utc]`, usable only when `create_time_utc <= evaluation_window_start_utc` | +| expected worker set | The full set of TPU workers that documented first-party identity surfaces prove belong to the TPU Node | +| joined worker set | The subset of the expected worker set for which worker-scoped duty-cycle telemetry is also proven to belong to the TPU Node | + +--- + +## 3. GCP Documentation Grounding + +### 3.1 TPU Node resource exposes the control-plane fields used by this rule + +Google documents the TPU `Node` resource with fields including: + +1. `name` +2. `description` +3. `acceleratorType` +4. `acceleratorConfig` +5. `state` +6. `runtimeVersion` +7. `createTime` +8. `schedulingConfig` +9. `health` +10. 
`queuedResource` +11. `multisliceNode` + +Google also documents TPU Node lifecycle states including: + +- `CREATING` +- `READY` +- `RESTARTING` +- `REIMAGING` +- `DELETING` +- `REPAIRING` +- `STOPPED` +- `STOPPING` +- `STARTING` +- `PREEMPTED` +- `TERMINATED` +- `HIDING` +- `HIDDEN` +- `UNHIDING` +- `UNKNOWN` + +Source: + +- *REST Resource: projects.locations.nodes* + +URL: + +- https://cloud.google.com/tpu/docs/reference/rest/v2/projects.locations.nodes + +Rule consequence: + +1. Eligibility must be based on documented TPU Node control-plane fields only. +2. Exact state `READY` is the only canonical in-scope billable lifecycle state for this rule. +3. Transitional or stopped-like states such as `STOPPING`, `STOPPED`, `STARTING`, `PREEMPTED`, and `TERMINATED` must skip. +4. `acceleratorConfig`, `acceleratorType`, `runtimeVersion`, `health`, and `schedulingConfig` are valid enrichment/context fields. + +### 3.2 Billing accrues while a TPU Node is `READY` + +Google documents Cloud TPU pricing as follows: + +1. charges for Cloud TPU accrue while a TPU node is in `READY` state +2. prices are listed per chip-hour in USD +3. pricing varies by TPU version, region, and usage option +4. On Demand, Spot/Preemptible, and commitment-based usage options have different prices + +Source: + +- *Cloud TPU pricing* + +URL: + +- https://cloud.google.com/tpu/pricing + +Rule consequence: + +1. This rule must evaluate only nodes in exact `READY` state. +2. Nodes outside `READY` are out of scope for idle-cost findings. +3. The rule must not hardcode a universal hourly or monthly cost estimate across regions, TPU versions, and usage options. +4. `estimated_monthly_cost_usd` should remain `None` unless a future implementation computes current pricing from authoritative current region- and usage-specific pricing inputs. + +### 3.3 Duty-cycle telemetry is documented on worker-scoped monitoring resources + +Google documents Cloud TPU monitoring metrics including: + +1. 
metric type `tpu.googleapis.com/accelerator/duty_cycle` +2. display name *Accelerator Duty Cycle* +3. kind `GAUGE` +4. type `DOUBLE` +5. unit `%` +6. monitored resource `tpu.googleapis.com/GceTpuWorker` +7. value semantics: percentage of time over the sample period during which the accelerator was actively processing, with values in the range `[0,100]` +8. metric label `accelerator_id` + +Google also documents the `tpu.googleapis.com/GceTpuWorker` monitored resource with labels including: + +1. `resource_container` +2. `location` +3. `worker_id` + +Sources: + +- *Monitor Cloud TPU VMs* +- *Google Cloud monitored resource list* + +URLs: + +- https://cloud.google.com/tpu/docs/troubleshooting/tpu-vm-monitoring +- https://docs.cloud.google.com/monitoring/api/resources + +Rule consequence: + +1. The canonical activity signal for this rule is `tpu.googleapis.com/accelerator/duty_cycle`. +2. This telemetry is documented at **worker/accelerator scope**, not directly at TPU Node scope. +3. Node-level idle determination therefore requires a **documented first-party join** from worker-scoped telemetry back to the TPU Node being evaluated. +4. If the implementation cannot prove the complete expected worker set and then prove that all evaluated worker/accelerator series belong to the TPU Node using documented first-party surfaces, the node must skip rather than guess. + +### 3.4 Google documents a monitoring visibility delay buffer + +Google documents: + +1. it can take up to `180 seconds` between the time a Cloud TPU metric value is generated and when it is displayed in Metrics Explorer + +Source: + +- *Monitor Cloud TPU VMs* + +URL: + +- https://cloud.google.com/tpu/docs/troubleshooting/tpu-vm-monitoring + +Rule consequence: + +1. The trailing `180 seconds` before `now` must be excluded from the evaluation window. +2. The rule must not treat missing very-recent telemetry as proof of inactivity. 
+ +### 3.5 Queued resources and multislice TPU deployments are operationally different + +Google documents that: + +1. best practice is to create TPUs using queued resources rather than the direct Create Node API +2. multislice environments should use queued resources +3. TPU Node surfaces expose `queuedResource` and `multisliceNode` + +Sources: + +- *Manage TPU resources* +- *REST Resource: projects.locations.nodes* + +URLs: + +- https://cloud.google.com/tpu/docs/managing-tpus-tpu-vm +- https://cloud.google.com/tpu/docs/reference/rest/v2/projects.locations.nodes + +Rule consequence: + +1. Queued-resource-managed and multislice nodes are operationally different from standalone TPU nodes. +2. This rule should exclude `queuedResource`-backed and `multisliceNode == true` nodes rather than presenting them as ordinary standalone cleanup candidates. + +### 3.6 Other TPU VM metrics are not canonical substitutes for duty-cycle telemetry + +Google also documents TPU VM metrics such as: + +- `cpu/utilization` +- `memory/usage` +- `network/received_bytes_count` +- `network/sent_bytes_count` +- `tpu/tensorcore/idle_duration` + +These metrics support general monitoring and troubleshooting, but Google documents a distinct accelerator-processing signal via `accelerator/duty_cycle`. + +Source: + +- *Monitor Cloud TPU VMs* + +URL: + +- https://cloud.google.com/tpu/docs/troubleshooting/tpu-vm-monitoring + +Rule consequence: + +1. This rule must not substitute worker CPU, memory, network, or tensorcore-idle metrics for the canonical duty-cycle telemetry without a separately documented contract. +2. The rule is about **observed accelerator processing activity**, not generic TPU VM host utilization. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. the TPU Node is in exact documented `READY` state +2. the TPU Node is standalone for this rule, not queued-resource-managed or multislice +3. 
canonical duty-cycle telemetry is provably joined to the TPU Node and sufficiently observed across the full buffered observation window +4. no joined duty-cycle datapoint above `2.0` exists anywhere in the full buffered observation window, including the earliest valid joined datapoint in that window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that the TPU-backed job is abandoned +- that the TPU node is safe to stop or delete +- that queued-resource-managed or multislice capacity is safe to dismantle +- that a specific monthly saving exists +- that host CPU, memory, or network inactivity is enough to prove TPU accelerator idleness + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| TPU Nodes list (`projects.locations.nodes.list`) | enumerate candidate TPU Nodes and their lifecycle, creation, scheduling, and orchestration context | +| Cloud Monitoring `tpu.googleapis.com/accelerator/duty_cycle` | determine observed accelerator-processing activity | +| Additional first-party documented worker-identity surface, if needed | prove that worker-scoped duty-cycle telemetry belongs to the TPU Node being evaluated | + +### 6.2 Permissions + +Minimum permissions: + +- `tpu.nodes.list` +- `monitoring.timeSeries.list` + +If the implementation uses an additional documented first-party join surface, that read permission is also required. + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `7` +- Minimum effective value: `1` + +Reason: + +- TPU workloads are expensive enough that a one-week zero-processing window is a conservative review threshold. +- The threshold is intentionally longer than short experiment pauses but still surfaces likely idle accelerators. 
+ +### 6.4 Idle threshold + +- Fixed rule threshold: `max_observed_duty_cycle_percent <= 2.0` + +Reason: + +- Google documents `duty_cycle` in percent units but does not define an idle threshold. +- `2.0%` is a conservative product threshold chosen to tolerate tiny background fluctuation without masking meaningful accelerator activity. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `resource_name` | Must be a non-empty string in documented TPU Node name form `projects/{project}/locations/{zone}/nodes/{node_id}`. Malformed names skip. | +| `node_id` | Final node-name segment. Empty result skips. | +| `zone` | Resolve from exact `locations/{zone}` segment in the node name. If unresolved, skip. | +| `region` | Derive from the zone by removing the final hyphen-delimited zone suffix. If the zone is unusable for region derivation, skip. | +| `state` | Compare case-sensitively to exact documented enum value `READY`. Any other value is out of scope. | +| `create_time_utc` | Parse documented RFC3339 `createTime` into a timezone-aware UTC instant. If present but unparsable, skip. Future timestamps skip. | +| `evaluation_window_end_utc` | `now_utc - 180 seconds`. If this buffered end would be before `create_time_utc`, the node is too young to evaluate. | +| `evaluation_window_start_utc` | `evaluation_window_end_utc - idle_days × 86400 seconds`. | +| `full_window_coverable` | True only when `create_time_utc <= evaluation_window_start_utc`. Otherwise skip. | +| `queued_resource_name` | Preserve raw `queuedResource`; a non-empty value means the node is out of scope. Malformed non-string/non-null values skip rather than assume standalone. | +| `multislice_node` | Treat exact boolean `true` as out of scope. Malformed non-boolean/non-null values skip rather than assume standalone. | +| `accelerator_type_context` | Prefer documented `acceleratorConfig.type` when present; otherwise preserve raw legacy `acceleratorType` as context only. 
| +| `topology_context` | Preserve documented `acceleratorConfig.topology` when present; context only. | +| `preemptible` | Preserve documented `schedulingConfig.preemptible` as context only. | +| `spot` | Preserve documented `schedulingConfig.spot` as context only. | +| `reserved` | Preserve documented `schedulingConfig.reserved` as context only. | +| `duty_cycle_metric_type` | Exact `tpu.googleapis.com/accelerator/duty_cycle`. | +| `duty_cycle_resource_type` | Exact `tpu.googleapis.com/GceTpuWorker`. | +| `duty_cycle_percent` | Preserve raw metric value in percent units `[0,100]`. Do not reinterpret as fraction. | +| `expected_worker_count` | Cardinality of the documented expected worker set for the TPU Node. If unknown, `telemetry_join_state` is not `complete`. | +| `joined_worker_count` | Cardinality of the joined worker set with proven telemetry ownership. Must equal `expected_worker_count`. | +| `max_observed_duty_cycle_percent` | Maximum observed `duty_cycle_percent` across all joined worker/accelerator series over the buffered observation window. | +| `telemetry_join_state` | `complete`, `incomplete`, or `unresolved`. `complete` means provably complete via documented first-party linkage. Only `complete` is eligible. | +| `telemetry_coverage_state` | `complete` or `unresolved`. `complete` means the joined telemetry is sufficiently observed across the full buffered window with no unresolved gaps. | +| `telemetry_state` | `no_observed_activity_above_threshold`, `observed_activity_above_threshold`, or `unresolved`. No age-only fallback state is allowed. | + +Normalization requirements: + +1. All timestamps used for comparison must be timezone-aware UTC. +2. Empty strings normalize to unusable, not meaningful values. +3. If a chosen field is present but unparsable, skip rather than silently falling back. +4. Duty-cycle thresholds and comparisons must be performed in **percent units**, not fractional units. +5. 
Threshold comparison is exact: `<= 2.0` qualifies, `> 2.0` does not. Do not round values before comparison. +6. Monitoring timestamps are the source of truth for telemetry coverage. Small clock skew may be tolerated only for coverage-boundary interpretation; it must never convert missing telemetry into zero activity or suppress a datapoint above `2.0`. +7. Join completeness and telemetry completeness are separate proof obligations. Ownership proof does not imply coverage proof, and coverage proof does not imply ownership proof. + +--- + +## 8. Activity Determination Contract + +Cloud Monitoring duty-cycle telemetry is the **sole trusted telemetry source** for this rule. + +### 8.1 Required metric + +| Field | Value | +|---|---| +| Metric type | `tpu.googleapis.com/accelerator/duty_cycle` | +| Kind | `GAUGE` | +| Value type | `DOUBLE` | +| Unit | `%` | +| Monitored resource | `tpu.googleapis.com/GceTpuWorker` | +| Resource labels | `resource_container`, `location`, `worker_id` | +| Metric label | `accelerator_id` | + +### 8.2 Required query shape + +The monitoring query must: + +1. specify exactly one `metric.type` +2. specify exact resource type `tpu.googleapis.com/GceTpuWorker` +3. constrain resource location to the TPU Node zone or equivalent documented worker location +4. evaluate the buffered interval `[evaluation_window_start_utc, evaluation_window_end_utc]` +5. return all worker/accelerator series that can be proven to belong to the TPU Node +6. preserve worker/accelerator identity so the final decision can take the maximum across all workers, accelerators, and timestamps +7. avoid averaging across workers, accelerators, or timestamps + +### 8.3 Worker-to-node join requirement + +Because the documented duty-cycle metric is worker-scoped, the rule must establish a documented first-party join from the returned `GceTpuWorker` telemetry to the TPU Node under evaluation. + +Allowed join principles: + +1. use only documented first-party Google Cloud surfaces +2. 
join by explicit documented worker or VM identity, not names guessed from conventions +3. determine the complete expected worker set for the TPU Node from a documented identity surface only +4. prove that all worker/accelerator series used in evaluation belong to the TPU Node + +Forbidden join strategies: + +- guessing from prefixes, suffixes, or free-text descriptions +- inferring expected workers from topology, accelerator shape, or worker-count conventions unless Google explicitly documents that identity linkage +- treating all workers in a zone as belonging to one TPU Node +- silently ignoring workers or accelerators whose ownership cannot be proven + +Join completeness requirements: + +1. `expected_worker_count` must be known from documented first-party identity linkage +2. `joined_worker_count` must equal `expected_worker_count` +3. if the total worker count is unknown, partial, or contradictory, `telemetry_join_state` is not `complete` + +If the join cannot be fully established, `telemetry_join_state` is not `complete` and the node must skip. + +### 8.4 Telemetry coverage requirement + +Telemetry coverage is a separate requirement from join completeness. + +Coverage requirements: + +1. no-series-returned is unresolved, not zero activity +2. `0.0` values are valid observed signals +3. missing datapoints must never be treated as zero +4. telemetry must be sufficiently observed across the full buffered window for every joined worker/accelerator series +5. coverage must be established from actual datapoint timestamps, not assumed sampling behavior +6. if the implementation cannot prove from monitoring timestamps that coverage across the full buffered window is sufficient, `telemetry_coverage_state = unresolved` and the node must skip +7. any gap that cannot be proven from datapoint timestamps to preserve sufficient observation is unresolved and must skip +8. 
the rule must not emit unless the full buffered window has no joined duty-cycle datapoint above `2.0`, including at the earliest valid joined datapoint in that window + +Because Google does not publish a duty-cycle sampling cadence in the cited metric contract, the rule must not invent an age-only or heuristic fallback when telemetry completeness cannot be proven. + +### 8.5 Interpretation rules + +For a TPU Node with `telemetry_join_state == complete`: + +1. use monitoring timestamps as the source of truth for telemetry timing +2. compute `max_observed_duty_cycle_percent` as the maximum across all joined worker/accelerator datapoints in the buffered window +3. if any joined datapoint exceeds `2.0`, `telemetry_state = observed_activity_above_threshold` and the node must skip +4. if all usable joined datapoints are less than or equal to `2.0`, the node is eligible only if `telemetry_coverage_state == complete` +5. do not average duty-cycle values across workers, accelerators, or timestamps +6. do not round duty-cycle values before comparison + +Missing telemetry, missing joined workers, sparse coverage, or query failures are unresolved and must skip. + +### 8.6 Forbidden fallbacks + +The following must **not** be used to prove idleness: + +- node age alone +- `createTime` alone +- host CPU, host memory, or host network metrics +- `tpu/tensorcore/idle_duration` or other TPU metrics as undocumented substitutes for `accelerator/duty_cycle` +- pricing level, TPU type, or topology alone +- missing monitoring telemetry treated as equivalent to zero activity +- incomplete join or sparse telemetry treated as equivalent to no joined duty-cycle datapoint above threshold + +--- + +## 9. Unified Decision Rule + +Emit only when **all** of the following are true: + +1. the node identity and zone are parseable +2. if a region filter is set, the derived region matches exactly +3. exact node state is `READY` +4. 
`create_time_utc` is valid and the full buffered window is coverable +5. `queuedResource` is absent/empty and not malformed +6. `multisliceNode != true` and is not malformed +7. `telemetry_join_state == "complete"` +8. `joined_worker_count == expected_worker_count` +9. `telemetry_coverage_state == "complete"` +10. `max_observed_duty_cycle_percent <= 2.0` + +If canonical duty-cycle telemetry is not both provably joined and sufficiently observed across the full buffered window, the rule **MUST NOT** emit. + +Always skip: + +- nodes not in `READY` +- nodes younger than the full buffered window +- nodes with malformed identity, timestamps, or orchestration fields +- queued-resource-managed or multislice nodes +- telemetry query failures +- incomplete or unresolved worker-to-node joins +- missing or sparse joined telemetry treated as unresolved +- no joined series returned +- any joined duty-cycle observation above `2.0` + +--- + +## 10. Finding Shape + +### 10.1 Core fields + +| Field | Value | +|---|---| +| `provider` | `gcp` | +| `rule_id` | `gcp.tpu.idle` | +| `resource_type` | `gcp.tpu.node` | +| `resource_id` | full TPU Node resource name when available, otherwise normalized `node_id` | +| `region` | normalized region derived from node zone | +| `detected_at` | evaluation time | +| `estimated_monthly_cost_usd` | `None` | + +### 10.2 Confidence / risk + +| Field | Value | +|---|---| +| `confidence` | `HIGH` | +| `risk` | `HIGH` | + +Reason: + +- The rule emits only when the node is in documented billable `READY` state, is not queued/multislice-managed, and complete documented duty-cycle telemetry joined to the node shows **no joined duty-cycle datapoint above the conservative threshold** over the buffered window. + +### 10.3 Required evidence content + +Evidence should include factual signals only, such as: + +1. exact state `READY` +2. node zone and derived region +3. `createTime` +4. buffered idle window +5. 
standalone/orchestration context (`queuedResource`, `multisliceNode`) +6. accelerator type/topology context when available +7. scheduling context (`preemptible`, `spot`, `reserved`) when available +8. canonical metric type used +9. expected vs joined worker counts +10. maximum observed duty-cycle percent +11. statement that complete joined telemetry showed no joined duty-cycle datapoint above threshold over the buffered window + +Evidence must **not**: + +- claim the TPU-backed job is abandoned +- claim the node is safe to stop or delete +- present a flat price estimate as authoritative current spend + +--- + +## 11. Failure Behavior + +### 11.1 Permission failures + +Permission failures on required TPU inventory or monitoring surfaces must be surfaced explicitly. They must not be silently converted into heuristic findings. + +### 11.2 Monitoring failures + +If duty-cycle telemetry cannot be queried reliably, findings must not be emitted from age-only, host-metric, partial-join, or sparse-coverage fallback logic. + +### 11.3 Malformed records + +Malformed individual TPU Nodes should be skipped item-by-item when required identity, state, timestamp, or orchestration fields are unusable. + +### 11.4 Join or telemetry incompleteness + +Partial worker-to-node joins or incomplete duty-cycle telemetry are unresolved, not weak evidence. + +Examples: + +- some worker/accelerator series can be joined but not all +- the complete expected worker set cannot be proven +- no worker-scoped telemetry can be proven to belong to the node +- no joined series are returned for the node +- query succeeds but returns no usable joined telemetry +- telemetry exists only for a subset of proven workers or accelerators diff --git a/docs/specs/gcp/ai/vertex_endpoint_idle.md b/docs/specs/gcp/ai/vertex_endpoint_idle.md new file mode 100644 index 0000000..9e31a6e --- /dev/null +++ b/docs/specs/gcp/ai/vertex_endpoint_idle.md @@ -0,0 +1,513 @@ +# GCP Rule Spec - `gcp.vertex.endpoint.idle` + +## 1. 
Rule Identity + +- **Rule ID:** `gcp.vertex.endpoint.idle` +- **Provider:** GCP +- **Resource type:** Vertex AI Endpoint +- **Finding resource_type:** `gcp.vertex.endpoint` + +--- + +## 2. Intent + +Detect **Vertex AI Endpoints with a documented always-deployed serving floor** and **no observed online prediction request activity above zero** over a conservative review window, using documented Cloud Monitoring request-count telemetry. + +This rule is deliberately **precision-first**. It is a **review-candidate** rule only. It is **not** proof that the endpoint is safe to delete, **not** proof that all endpoint verbs are unused, and **not** proof of a specific monthly dollar saving. + +### 2.1 Canonical definitions + +| Term | Definition | +|---|---| +| Vertex AI Endpoint | `projects/{project}/locations/{location}/endpoints/{endpoint_id}` resource into which one or more models are deployed | +| in-scope deployed model | A `DeployedModel` whose prediction resource mode has an always-deployed serving floor for this rule: `dedicatedResources.minReplicaCount >= 1` or `automaticResources.minReplicaCount >= 1` | +| out-of-scope deployed model | A `DeployedModel` using only `sharedResources`, or a deployed model whose serving-floor minimum is `0` | +| provisioned serving floor | Sum of `minReplicaCount` across in-scope deployed models on the endpoint | +| shared-resource-only endpoint | An endpoint with deployed models, but none of them have an in-scope provisioned serving floor | +| capacity floor start | The latest documented creation timestamp that proves the endpoint’s current provisioned serving floor existed: `max(endpoint.createTime, all in-scope deployedModel.createTime)` | +| evaluation window end | `now_utc` | +| evaluation window start | `evaluation_window_end_utc - idle_days × 86400 seconds` | +| full observation window | `[evaluation_window_start_utc, evaluation_window_end_utc]`, usable only when `capacity_floor_start_utc <= evaluation_window_start_utc` | +| 
request-count telemetry | Cloud Monitoring metric `aiplatform.googleapis.com/prediction/online/request_count` | +| zero-activity threshold | Exact threshold for this rule: **no usable endpoint-scoped request-count datapoint above `0`** anywhere in the full observation window | + +--- + +## 3. GCP Documentation Grounding + +### 3.1 Endpoint is the control-plane resource for online prediction traffic + +Google documents the Vertex AI `Endpoint` resource with fields including: + +1. `name` +2. `displayName` +3. `description` +4. `deployedModels` +5. `trafficSplit` +6. `labels` +7. `createTime` +8. `updateTime` +9. `network` +10. `privateServiceConnectConfig` +11. `modelDeploymentMonitoringJob` +12. `dedicatedEndpointEnabled` +13. `dedicatedEndpointDns` + +Google also documents that: + +1. models are deployed into an `Endpoint` +2. the `Endpoint` is afterwards called to obtain predictions and explanations + +Source: + +- *REST Resource: projects.locations.endpoints* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints + +Rule consequence: + +1. Eligibility must be based on documented Endpoint and DeployedModel control-plane fields only. +2. The canonical resource identity is the full Endpoint resource name. +3. `trafficSplit`, `network`, private networking fields, and logging fields are context only; they are not independent proof of activity or inactivity. + +### 3.2 DeployedModel exposes a union of prediction resource modes + +Google documents `DeployedModel` as containing the prediction resource union: + +1. `dedicatedResources` +2. `automaticResources` +3. `sharedResources` + +Google also documents: + +1. `createTime` for `DeployedModel` +2. `privateEndpoints` +3. `status` + +Source: + +- *REST Resource: projects.locations.endpoints* (`DeployedModel`) + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.endpoints + +Rule consequence: + +1. 
Endpoint idle evaluation must consider the prediction resource mode of each deployed model. +2. `sharedResources` deployments are operationally distinct because serving cost is attached to a shared pool, not directly to the endpoint. +3. `DeployedModel.createTime` is relevant for proving how long the current serving floor has existed. + +### 3.3 DedicatedResources provides an always-deployed serving floor + +Google documents `DedicatedResources` as follows: + +1. `machineSpec` is required +2. `minReplicaCount` is required +3. `minReplicaCount` is the minimum number of machine replicas that will be **always deployed on** +4. `maxReplicaCount` may be higher for autoscaling +5. for online prediction, supported autoscaling metrics include: + - `aiplatform.googleapis.com/prediction/online/accelerator/duty_cycle` + - `aiplatform.googleapis.com/prediction/online/cpu/utilization` + - `aiplatform.googleapis.com/prediction/online/request_count` + +Google also documents in the autoscaling guide: + +1. when configuring a standard `DeployedModel`, `dedicatedResources.minReplicaCount` must be at least `1` +2. Vertex AI normally cannot scale a standard dedicated deployment to zero inference nodes +3. a **Scale To Zero** preview path exists where `dedicatedResources.minReplicaCount` may be set to `0` + +Sources: + +- *DedicatedResources* +- *Scale inference nodes for Vertex AI Inference* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/DedicatedResources +- https://cloud.google.com/vertex-ai/docs/predictions/autoscaling + +Rule consequence: + +1. `dedicatedResources.minReplicaCount >= 1` is a documented always-deployed serving floor and is in scope. +2. `dedicatedResources.minReplicaCount == 0` is out of scope for this rule because there is no documented always-deployed floor to review. +3. `maxReplicaCount`, autoscaling thresholds, and target metrics are context only; they do not themselves prove current activity. 
+ +### 3.4 AutomaticResources can also provide an always-deployed serving floor + +Google documents `AutomaticResources` as follows: + +1. `minReplicaCount` is the minimum number of replicas that will be **always deployed on** +2. `maxReplicaCount` is the maximum number of replicas that may be deployed on as traffic increases + +Source: + +- *AutomaticResources* + +URL: + +- https://cloud.google.com/vertex-ai/docs/reference/rest/v1/AutomaticResources + +Rule consequence: + +1. `automaticResources.minReplicaCount >= 1` is an always-deployed serving floor and is in scope for idle review. +2. `automaticResources.minReplicaCount == 0` is out of scope because there is no always-deployed floor to review. +3. The rule must **not** assume that all `automaticResources` endpoints scale to zero. + +### 3.5 Online inference nodes and deployed models incur cost even without traffic + +Google documents that: + +1. Vertex AI allocates **nodes** to handle online inference +2. when serving online inference, machine type is specified in a deployed model’s prediction resources +3. for AutoML models, you pay for each model deployed to an endpoint **even if no prediction is made** +4. you must undeploy the model to stop incurring further charges +5. pricing varies by machine type, accelerator, region, and usage option + +Sources: + +- *Configure compute resources for inference* +- *Vertex AI pricing* + +URLs: + +- https://cloud.google.com/vertex-ai/docs/predictions/configure-compute +- https://cloud.google.com/vertex-ai/pricing + +Rule consequence: + +1. An endpoint with an always-deployed serving floor and zero observed prediction activity is a valid idle-cost review candidate. +2. The rule must not hardcode a flat monthly estimate from one region’s pricing table. +3. `estimated_monthly_cost_usd` should remain `None` unless a future implementation computes current pricing from authoritative current pricing inputs. 
+ +### 3.6 Endpoint is a documented Cloud Monitoring monitored resource + +Google documents the monitored resource type: + +1. `aiplatform.googleapis.com/Endpoint` +2. labels: + - `resource_container` + - `location` + - `endpoint_id` + +Source: + +- *Google Cloud monitored resource list* + +URL: + +- https://cloud.google.com/monitoring/api/resources + +Rule consequence: + +1. Endpoint activity telemetry must be attributed using the documented Endpoint monitored resource. +2. Exact endpoint identity must be established from `location` and `endpoint_id`, not inferred from display name or traffic split. + +### 3.7 Online prediction request_count is a documented autoscaling metric + +Google documents the online prediction autoscaling metric: + +1. metric name `aiplatform.googleapis.com/prediction/online/request_count` +2. it scales based on the **number of requests** +3. its unit is **requests per minute per replica** +4. the target value is an integer + +Source: + +- *Scale inference nodes for Vertex AI Inference* + +URL: + +- https://cloud.google.com/vertex-ai/docs/predictions/autoscaling + +Rule consequence: + +1. The canonical activity signal for this rule is `aiplatform.googleapis.com/prediction/online/request_count`. +2. Because the metric is request-based, **any usable datapoint above `0`** is observed endpoint activity for this rule. +3. Near-idle heuristics such as “few requests” or replica-scaled thresholds are out of scope for this rule. + +--- + +## 4. Detection Goal + +Emit a finding only when **all** of the following are true: + +1. the Endpoint has at least one in-scope deployed model with a documented always-deployed serving floor +2. the documented serving floor has existed for the full observation window +3. canonical endpoint-scoped request-count telemetry is sufficiently observed across the full observation window +4. 
no usable endpoint-scoped request-count datapoint above `0` is observed anywhere in the full observation window + +If any required signal cannot be established reliably, skip rather than emit. + +--- + +## 5. Non-Goals + +This rule does **not** attempt to prove: + +- that the endpoint is safe to delete or undeploy +- that explain-only, health-only, or non-prediction endpoint usage is absent +- that shared DeploymentResourcePool cost is attributable to a specific endpoint +- that a low-but-nonzero request volume is “near-idle” +- that a specific monthly saving exists + +--- + +## 6. Canonical Inputs + +### 6.1 Required surfaces + +| Surface | Purpose | +|---|---| +| Endpoint list (`projects.locations.endpoints.list`) | enumerate endpoints, deployed models, resource modes, timestamps, traffic split, and networking context | +| Cloud Monitoring `aiplatform.googleapis.com/prediction/online/request_count` | determine observed online prediction request activity | + +### 6.2 Permissions + +Minimum permissions: + +- `aiplatform.endpoints.list` +- `monitoring.timeSeries.list` + +### 6.3 Idle window + +- Configurable parameter: `idle_days` +- Default: `14` +- Minimum effective value: `1` + +Reason: + +- Vertex endpoints are frequently created for demos, experiments, and staged launches. +- A two-week zero-request window is conservative enough to reduce false positives while still surfacing abandoned always-on capacity. + +### 6.4 Cost field + +- `estimated_monthly_cost_usd = None` + +Reason: + +- Dedicated, automatic, and shared serving modes have materially different pricing and cost attribution semantics. +- Official pricing is region- and configuration-specific, so a flat estimate would be misleading. + +--- + +## 7. Normalization Contract + +| Field | Normalization | +|---|---| +| `resource_name` | Must be a non-empty string in documented Endpoint name form `projects/{project}/locations/{location}/endpoints/{endpoint_id}`. Malformed names skip. 
| +| `endpoint_id` | Final endpoint-name segment. Empty result skips. | +| `location` | Resolve from exact `locations/{location}` segment in the resource name. If unresolved, skip. | +| `endpoint_create_time_utc` | Parse documented RFC3339 `Endpoint.createTime` into timezone-aware UTC. If present but unparsable, skip. Future timestamps skip. | +| `in_scope_deployed_models` | Deployed models with `dedicatedResources.minReplicaCount >= 1` or `automaticResources.minReplicaCount >= 1`. | +| `shared_resource_only` | True when deployed models exist but none are in scope because they use only `sharedResources` or have serving-floor minimum `0`. | +| `provisioned_serving_floor` | Sum of `minReplicaCount` across in-scope deployed models. Must be `>= 1` to be eligible. | +| `deployed_model_create_time_utc` | Parse each in-scope deployed model’s `createTime` into timezone-aware UTC. If a chosen timestamp is missing, future, or unparsable, skip. | +| `capacity_floor_start_utc` | `max(endpoint_create_time_utc, all in-scope deployed_model_create_time_utc)` | +| `evaluation_window_end_utc` | `now_utc` | +| `evaluation_window_start_utc` | `evaluation_window_end_utc - idle_days × 86400 seconds` | +| `full_window_coverable` | True only when `capacity_floor_start_utc <= evaluation_window_start_utc`. Otherwise skip. | +| `request_metric_type` | Exact `aiplatform.googleapis.com/prediction/online/request_count`. | +| `request_metric_resource_type` | Exact `aiplatform.googleapis.com/Endpoint`. | +| `usable_request_datapoint` | A request-count datapoint whose timestamp falls inside the full observation window and whose value is numeric: use `int64Value` first, else `doubleValue`; ignore null, missing, NaN, or unsupported value shapes. | +| `max_observed_request_rate_per_replica` | Maximum usable `request_count` datapoint value across all endpoint-scoped series/points in the observation window. | +| `telemetry_coverage_state` | `complete` or `unresolved`. 
`complete` means endpoint-scoped request telemetry is sufficiently observed across the full window. | +| `telemetry_state` | `no_observed_prediction_requests`, `observed_prediction_requests`, or `unresolved`. No age-only fallback state is allowed. | + +Normalization requirements: + +1. All timestamps used for comparison must be timezone-aware UTC. +2. Empty strings normalize to unusable, not meaningful values. +3. If a chosen field is present but unparsable, skip rather than silently falling back. +4. Resource-mode interpretation must follow the documented prediction resource union only. +5. The rule must not reinterpret `automaticResources` as automatically scale-to-zero. +6. Endpoint activity is endpoint-level for this rule: any usable endpoint-scoped request-count datapoint above `0` counts as endpoint activity regardless of which deployed model received traffic. + +--- + +## 8. Activity Determination Contract + +Cloud Monitoring request-count telemetry is the **sole trusted activity signal** for this rule. + +### 8.1 Required metric + +| Field | Value | +|---|---| +| Metric type | `aiplatform.googleapis.com/prediction/online/request_count` | +| Meaning | Number of requests | +| Unit | Requests per minute per replica | +| Monitored resource | `aiplatform.googleapis.com/Endpoint` | +| Resource labels | `resource_container`, `location`, `endpoint_id` | + +### 8.2 Required query shape + +The monitoring query must: + +1. specify exact `metric.type = "aiplatform.googleapis.com/prediction/online/request_count"` +2. specify exact resource type `aiplatform.googleapis.com/Endpoint` +3. attribute data to the endpoint using exact `location` and `endpoint_id` +4. evaluate the full interval `[evaluation_window_start_utc, evaluation_window_end_utc]` +5. preserve enough timestamp information to prove telemetry coverage from actual datapoint timestamps +6. 
consider all endpoint-scoped series returned for that endpoint; any positive datapoint on any returned series is observed activity +7. preserve raw request-count values for zero/nonzero evaluation + +The query/evaluation path must **not**: + +- use a cross-series reducer +- use `ALIGN_RATE` +- transform raw request-count datapoints into derived rate, threshold, or utilization signals before zero/nonzero evaluation + +The implementation **may** batch by location for efficiency, provided it still enforces exact endpoint attribution from the returned resource labels and does not treat location-only matches as sufficient. + +### 8.3 Telemetry coverage requirement + +Coverage requirements: + +1. no-series-returned is unresolved, not zero activity +2. `0` values are valid observed signals +3. missing datapoints must never be treated as zero +4. coverage must be established from actual datapoint timestamps, not assumed sampling behavior +5. if the implementation cannot prove from monitoring timestamps that endpoint-scoped telemetry sufficiently covers the full observation window, `telemetry_coverage_state = unresolved` and the endpoint must skip +6. any gap that cannot be proven from datapoint timestamps to preserve sufficient observation is unresolved and must skip +7. datapoints outside the full observation window must be ignored for both activity and coverage evaluation +8. unexplained large gaps in endpoint-scoped telemetry are unresolved and must skip + +Because the cited docs define metric name and unit but do **not** publish a definitive sampling cadence or visibility-delay contract for this metric, the rule must not invent: + +- an exact sample-count requirement +- a mandatory trailing ingestion buffer +- a heuristic fallback that treats missing recent data as zero + +### 8.4 Interpretation rules + +For an endpoint with `telemetry_coverage_state == complete`: + +1. use monitoring timestamps as the source of truth for telemetry timing +2. 
ignore endpoint-scoped datapoints whose timestamps fall outside the full observation window +3. extract each usable datapoint value by reading `int64Value` first, else `doubleValue`; ignore null, NaN, or unsupported value shapes +4. compute `max_observed_request_rate_per_replica` as the maximum usable endpoint-scoped request-count datapoint over the full observation window +5. if any usable datapoint is greater than `0`, `telemetry_state = observed_prediction_requests` and the endpoint must skip +6. if all usable datapoints are exactly `0`, `telemetry_state = no_observed_prediction_requests` + +### 8.5 Forbidden fallbacks + +The following must **not** be used to prove endpoint idleness: + +- endpoint age alone +- trafficSplit alone +- `updateTime` alone +- Cloud Logging access or request/response logging alone +- CPU utilization, accelerator duty cycle, latency, or other metrics as substitutes for request-count telemetry +- “near-idle”, low-traffic, or replica-scaled request thresholds +- missing monitoring telemetry treated as equivalent to zero activity + +--- + +## 9. Unified Decision Rule + +Emit only when **all** of the following are true: + +1. the endpoint identity and location are parseable +2. if a location filter is set, the endpoint location matches exactly +3. at least one deployed model is in scope and `provisioned_serving_floor >= 1` +4. the endpoint is not shared-resource-only +5. `capacity_floor_start_utc` is valid and the full observation window is coverable +6. `telemetry_coverage_state == "complete"` +7. `max_observed_request_rate_per_replica == 0` + +If canonical request-count telemetry is not sufficiently observed across the full observation window, the rule **MUST NOT** emit. 
+ +Always skip: + +- malformed endpoint names or locations +- endpoints with unusable chosen timestamps +- endpoints whose serving floor is too new for the full window +- endpoints with no deployed models +- endpoints with only `sharedResources` +- endpoints whose only deployed models have serving-floor minimum `0` +- endpoints with malformed prediction-resource unions or malformed chosen `minReplicaCount` fields +- monitoring query failures +- missing or sparse telemetry treated as unresolved +- any observed request-count datapoint above `0` + +--- + +## 10. Finding Shape + +### 10.1 Core fields + +| Field | Value | +|---|---| +| `provider` | `gcp` | +| `rule_id` | `gcp.vertex.endpoint.idle` | +| `resource_type` | `gcp.vertex.endpoint` | +| `resource_id` | full Endpoint resource name when available, otherwise normalized `endpoint_id` | +| `region` | exact endpoint `location` | +| `detected_at` | evaluation time | +| `estimated_monthly_cost_usd` | `None` | + +### 10.2 Confidence / risk + +| Field | Value | +|---|---| +| `confidence` | `HIGH` | +| `risk` | `HIGH` if any in-scope dedicated model exposes a nonzero accelerator count or accelerator type; otherwise `MEDIUM` | + +Reason: + +- Confidence is HIGH only because the rule emits solely on full-window zero request-count telemetry with no heuristic fallback. +- Risk reflects that accelerator-backed endpoints are typically costlier and more operationally sensitive than CPU-only endpoints. + +### 10.3 Required evidence content + +Evidence should include factual signals only, such as: + +1. endpoint location +2. endpoint `createTime` +3. capacity floor start timestamp +4. observation window +5. deployed model resource modes present on the endpoint +6. provisioned serving floor +7. whether any shared-resource deployments are also present +8. canonical request-count metric type used +9. maximum observed request-count datapoint value over the window +10. 
statement that endpoint-scoped request-count telemetry showed no datapoint above `0` + +Evidence must **not**: + +- claim the endpoint is safe to delete or undeploy +- claim that explain-only or non-prediction endpoint traffic is impossible +- present a flat price estimate as authoritative current spend + +--- + +## 11. Failure Behavior + +### 11.1 Permission failures + +Permission failures on required Endpoint inventory or Monitoring surfaces must be surfaced explicitly. They must not be silently converted into heuristic findings. + +### 11.2 Monitoring failures + +If request-count telemetry cannot be queried reliably, findings must not be emitted from age-only, traffic-split, or low-traffic fallback logic. + +### 11.3 Malformed records + +Malformed individual endpoints should be skipped item-by-item when required identity, location, timestamp, or prediction-resource fields are unusable. + +### 11.4 Shared-resource ambiguity + +Endpoints deployed only on `sharedResources` are unresolved for endpoint-attributed idle-cost findings because shared pool cost is not directly attributable to one endpoint. + +### 11.5 Telemetry incompleteness + +Partial or sparse endpoint telemetry is unresolved, not weak evidence. + +Examples: + +- query succeeds but no endpoint-scoped series are returned +- series are returned but timestamps do not prove sufficient full-window observation +- endpoint-scoped telemetry contains gaps that cannot be resolved from datapoint timestamps +- only age or trafficSplit suggests inactivity while canonical request telemetry is absent diff --git a/pyproject.toml b/pyproject.toml index 6d85d6b..b50a9dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.28.0" +version = "1.29.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." 
readme = "README.md" requires-python = ">=3.10" diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_featurestore_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_featurestore_idle.py index 004603b..a03d266 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_featurestore_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_featurestore_idle.py @@ -2,43 +2,56 @@ Tests for gcp.vertex.featurestore.idle rule. Coverage: -- Legacy featurestore: monitoring confirms zero requests → HIGH confidence -- Legacy featurestore: age-based fallback → MEDIUM confidence -- Legacy featurestore: active (requests > 0) → no finding -- Legacy featurestore: no online serving (fixedNodeCount=0) → no finding +- Legacy featurestore: monitoring confirmed_zero → HIGH confidence / HIGH risk +- Legacy featurestore: positive_activity → no finding +- Legacy featurestore: unresolved coverage → no finding (no age fallback) +- Legacy featurestore: no online serving capacity → no finding - Legacy featurestore: non-STABLE state → no finding -- New featureOnlineStore (Bigtable): monitoring idle → HIGH confidence -- New featureOnlineStore (Optimized): idle → finding with estimated cost +- Legacy featurestore: UPDATING state → no finding +- Legacy featurestore: invalid mode (both fixedNodeCount + scaling present) → no finding +- Legacy featurestore: autoscaled (scaling.minNodeCount) → finding +- Legacy featurestore: too young for full window → no finding +- Legacy featurestore: future reference_time → no finding +- FeatureOnlineStore (Bigtable): confirmed_zero → HIGH confidence / HIGH risk +- FeatureOnlineStore (Bigtable): positive_activity → no finding +- FeatureOnlineStore (Optimized): skipped (out of scope) +- FeatureOnlineStore (Bigtable): maxNodeCount < minNodeCount → skipped +- FeatureOnlineStore (Bigtable): missing autoScaling → skipped +- FeatureOnlineStore: minNodeCount == 0 → skipped +- Region filter: exact equality (not prefix) +- estimated_monthly_cost_usd always None
- Permission error (403) → raises PermissionError - API not enabled (404) → returns [] -- Region filter: stores in other regions skipped -- Cost: fixedNodeCount × $0.27/hr × 730 h/month -- Monitoring failure → age fallback (no exception raised) -- Node too young + no monitoring → no finding +- Monitoring client creation failure → no findings (no age fallback) - Both resource types produce findings independently -- estimated_monthly_cost_usd is always set +- reference_time = max(createTime, updateTime) +- _query_store_activity coverage unit tests """ +import warnings from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock, patch import pytest +from google.api import metric_pb2 from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.risk import RiskLevel from cleancloud.providers.gcp.rules.ai.featurestore_idle import ( - _BIGTABLE_NODE_HOURLY_COST, _DEFAULT_IDLE_DAYS, - _HOURS_PER_MONTH, - _OPTIMIZED_STORE_MONTHLY_COST, - _age_days, + _LEGACY_METRIC, + _METRIC_KIND_DELTA, + _NEW_METRIC, _parse_location, _parse_resource_id, + _parse_rfc3339, + _query_store_activity, + _resolve_reference_time, find_idle_featurestores, ) # --------------------------------------------------------------------------- -# Constants +# Test constants # --------------------------------------------------------------------------- NOW = datetime(2025, 7, 1, 12, 0, 0, tzinfo=timezone.utc) @@ -61,21 +74,24 @@ def _make_legacy_store( state: str = "STABLE", node_count: int = 1, age_days: float = 60.0, - display_name: str = "", autoscaled: bool = False, + update_age_days: float = None, ) -> dict: create_dt = NOW - timedelta(days=age_days) - if autoscaled: - serving_config = {"scaling": {"minNodeCount": node_count, "maxNodeCount": node_count * 2}} - else: - serving_config = {"fixedNodeCount": node_count} - return { + store: dict = { "name": f"projects/{_PROJECT}/locations/{region}/featurestores/{store_id}", - "displayName": display_name, "state": 
state, - "onlineServingConfig": serving_config, "createTime": _iso(create_dt), } + if update_age_days is not None: + store["updateTime"] = _iso(NOW - timedelta(days=update_age_days)) + if autoscaled: + store["onlineServingConfig"] = { + "scaling": {"minNodeCount": node_count, "maxNodeCount": node_count * 2} + } + else: + store["onlineServingConfig"] = {"fixedNodeCount": node_count} + return store def _make_new_store( @@ -83,54 +99,56 @@ def _make_new_store( region: str = "us-central1", state: str = "STABLE", min_nodes: int = 1, + max_nodes: int = None, is_optimized: bool = False, age_days: float = 60.0, - display_name: str = "", + update_age_days: float = None, + missing_autoscaling: bool = False, ) -> dict: create_dt = NOW - timedelta(days=age_days) store: dict = { - "name": (f"projects/{_PROJECT}/locations/{region}" f"/featureOnlineStores/{store_id}"), - "displayName": display_name, + "name": (f"projects/{_PROJECT}/locations/{region}/featureOnlineStores/{store_id}"), "state": state, "createTime": _iso(create_dt), } + if update_age_days is not None: + store["updateTime"] = _iso(NOW - timedelta(days=update_age_days)) if is_optimized: store["optimized"] = {} - else: + elif not missing_autoscaling: + effective_max = max_nodes if max_nodes is not None else min_nodes * 2 store["bigtable"] = { - "autoScaling": {"minNodeCount": min_nodes, "maxNodeCount": min_nodes * 2} + "autoScaling": {"minNodeCount": min_nodes, "maxNodeCount": effective_max} } + else: + store["bigtable"] = {} # autoScaling absent return store -def _make_monitoring_ts(store_id: str, label_key: str, total_count: int): - """Build a mock monitoring time-series.""" - point = MagicMock() - point.value.int64_value = total_count - ts = MagicMock() - ts.resource.labels = {label_key: store_id} - ts.points = [point] - return ts - - def _run( legacy_stores: list = (), new_stores: list = (), - legacy_counts: dict[str, int] | None = None, - new_counts: dict[str, int] | None = None, + legacy_activities: dict = None, 
# store_id → "confirmed_zero"|"positive_activity"|"unresolved" + new_activities: dict = None, region_filter=None, idle_days: int = _IDLE_DAYS, legacy_list_status: int = 200, new_list_status: int = 200, - monitoring_raises: Exception | None = None, + monitoring_client_fails: bool = False, ): - """Run find_idle_featurestores with mocked HTTP and monitoring.""" + """ + Run find_idle_featurestores with mocked HTTP, mocked _query_store_activity, + and a fixed 'now'. + """ + legacy_activities = legacy_activities or {} + new_activities = new_activities or {} credentials = MagicMock() def _make_list_resp(status: int, data_key: str, items: list) -> MagicMock: resp = MagicMock() resp.status_code = status resp.json.return_value = {data_key: list(items)} + resp.raise_for_status = MagicMock() return resp legacy_resp = _make_list_resp(legacy_list_status, "featurestores", legacy_stores) @@ -144,34 +162,51 @@ def _get_side_effect(url, **kwargs): mock_session = MagicMock() mock_session.get.side_effect = _get_side_effect - def _monitoring_side_effect(request=None, **kwargs): - if monitoring_raises: - raise monitoring_raises - metric = (request or {}).get("filter", "") - if "featureonlinestore" in metric: - store_counts = new_counts or {} - label_key = "feature_online_store_id" - else: - store_counts = legacy_counts or {} - label_key = "featurestore_id" - return [_make_monitoring_ts(sid, label_key, count) for sid, count in store_counts.items()] - - mock_monitoring = MagicMock() - mock_monitoring.list_time_series.side_effect = _monitoring_side_effect + def _mock_query( + client, + project_id, + store_id, + region, + metric_type, + resource_type, + id_label, + window_start, + window_end, + idle_days_arg, + ): + if "featureonlinestore" in metric_type: + return new_activities.get(store_id, "unresolved") + return legacy_activities.get(store_id, "unresolved") + + monitoring_patch = ( + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle." 
+ "monitoring_v3.MetricServiceClient", + side_effect=Exception("monitoring unavailable"), + ) + if monitoring_client_fails + else patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle." + "monitoring_v3.MetricServiceClient", + return_value=MagicMock(), + ) + ) with ( patch( "cleancloud.providers.gcp.rules.ai.featurestore_idle.AuthorizedSession", return_value=mock_session, ), + monitoring_patch, patch( - "cleancloud.providers.gcp.rules.ai.featurestore_idle.monitoring_v3.MetricServiceClient", - return_value=mock_monitoring, + "cleancloud.providers.gcp.rules.ai.featurestore_idle._query_store_activity", + side_effect=_mock_query, ), patch("cleancloud.providers.gcp.rules.ai.featurestore_idle.datetime") as mock_dt, ): mock_dt.now.return_value = NOW mock_dt.fromisoformat = datetime.fromisoformat + mock_dt.fromtimestamp = datetime.fromtimestamp findings = find_idle_featurestores( project_id=_PROJECT, credentials=credentials, @@ -191,33 +226,220 @@ def test_parse_location(self): name = f"projects/{_PROJECT}/locations/us-central1/featurestores/s1" assert _parse_location(name) == "us-central1" + def test_parse_location_missing(self): + assert _parse_location("bad-name") is None + assert _parse_location("") is None + def test_parse_resource_id(self): name = f"projects/{_PROJECT}/locations/us-central1/featurestores/my-store" assert _parse_resource_id(name) == "my-store" - def test_age_days_valid(self): - create_dt = NOW - timedelta(days=45) - age = _age_days(_iso(create_dt), NOW) - assert age == pytest.approx(45.0, abs=0.01) + def test_parse_rfc3339_valid(self): + ts = "2025-05-01T12:00:00Z" + dt = _parse_rfc3339(ts) + assert dt is not None + assert dt.tzinfo is not None + assert dt.year == 2025 and dt.month == 5 and dt.day == 1 + + def test_parse_rfc3339_offset_normalized_to_utc(self): + """spec 7: non-UTC offsets must be normalized to UTC before comparison.""" + # +05:30 offset — value = 2025-05-01T06:30:00Z in UTC + ts = "2025-05-01T12:00:00+05:30" + dt = 
_parse_rfc3339(ts) + assert dt is not None + assert dt.tzinfo == timezone.utc + assert dt.hour == 6 and dt.minute == 30 + + def test_parse_rfc3339_invalid(self): + assert _parse_rfc3339("not-a-date") is None + + def test_parse_rfc3339_empty(self): + assert _parse_rfc3339("") is None + + def test_resolve_reference_time_both_present(self): + create = _iso(NOW - timedelta(days=60)) + update = _iso(NOW - timedelta(days=10)) + ref = _resolve_reference_time(create, update, NOW) + # max(60 days ago, 10 days ago) = 10 days ago + assert ref == NOW - timedelta(days=10) + + def test_resolve_reference_time_only_create(self): + create = _iso(NOW - timedelta(days=45)) + ref = _resolve_reference_time(create, "", NOW) + assert ref == NOW - timedelta(days=45) + + def test_resolve_reference_time_future_discarded(self): + create = _iso(NOW - timedelta(days=60)) + update = _iso(NOW + timedelta(days=1)) # future + ref = _resolve_reference_time(create, update, NOW) + # future updateTime is discarded, falls back to createTime + assert ref == NOW - timedelta(days=60) + + def test_resolve_reference_time_both_future(self): + create = _iso(NOW + timedelta(days=1)) + update = _iso(NOW + timedelta(days=2)) + assert _resolve_reference_time(create, update, NOW) is None + + def test_resolve_reference_time_both_missing(self): + assert _resolve_reference_time("", "", NOW) is None + - def test_age_days_invalid(self): - assert _age_days("not-a-date", NOW) is None +# --------------------------------------------------------------------------- +# Unit tests — _query_store_activity +# --------------------------------------------------------------------------- - def test_age_days_empty(self): - assert _age_days("", NOW) is None + +def _make_mock_point(val: int, seconds_offset: int, window_end: datetime): + """Build a mock monitoring point with a specific value and timestamp.""" + p = MagicMock() + p.value.WhichOneof.return_value = "int64_value" + p.value.int64_value = val + ts_dt = window_end - 
timedelta(seconds=seconds_offset) + p.interval.end_time.seconds = int(ts_dt.timestamp()) + p.interval.end_time.nanos = 0 + return p + + +class TestQueryStoreActivity: + _WINDOW_START = NOW - timedelta(days=3) + _WINDOW_END = NOW + + def _run_query(self, series_list): + client = MagicMock() + client.list_time_series.return_value = series_list + return _query_store_activity( + client, + _PROJECT, + "store1", + "us-central1", + _LEGACY_METRIC, + "aiplatform.googleapis.com/Featurestore", + "featurestore_id", + self._WINDOW_START, + self._WINDOW_END, + 3, + ) + + def _make_series(self, vals): + """Make a series with one point per aligned bucket, DELTA kind.""" + points = [_make_mock_point(v, i * 86400, self._WINDOW_END) for i, v in enumerate(vals)] + series = MagicMock() + series.points = points + series.metric_kind = _METRIC_KIND_DELTA + return series + + def test_confirmed_zero_returns_correct(self): + series = self._make_series([0, 0, 0]) + assert self._run_query([series]) == "confirmed_zero" + + def test_positive_activity_detected(self): + series = self._make_series([0, 5, 0]) + assert self._run_query([series]) == "positive_activity" + + def test_zero_series_returns_unresolved(self): + assert self._run_query([]) == "unresolved" + + def test_two_series_returns_unresolved(self): + s1 = self._make_series([0, 0, 0]) + s2 = self._make_series([0, 0, 0]) + assert self._run_query([s1, s2]) == "unresolved" + + def test_wrong_point_count_returns_unresolved(self): + series = self._make_series([0, 0]) # 2 points, expected 3 + assert self._run_query([series]) == "unresolved" + + def test_unrecognized_value_type_returns_unresolved(self): + p = MagicMock() + p.value.WhichOneof.return_value = "string_value" + p.interval.end_time.seconds = int(self._WINDOW_END.timestamp()) + p.interval.end_time.nanos = 0 + series = MagicMock() + series.points = [p, p, p] + series.metric_kind = _METRIC_KIND_DELTA + assert self._run_query([series]) == "unresolved" + + def 
test_future_timestamp_returns_unresolved(self): + # One point falls outside the expected bucket boundaries + p_ok = _make_mock_point(0, 86400, self._WINDOW_END) + p_future = _make_mock_point(0, 0, self._WINDOW_END) + p_future.interval.end_time.seconds = int( + (self._WINDOW_END + timedelta(hours=1)).timestamp() + ) + series = MagicMock() + series.points = [p_ok, p_ok, p_future] + series.metric_kind = _METRIC_KIND_DELTA + assert self._run_query([series]) == "unresolved" + + def test_gap_exceeding_alignment_period_returns_unresolved(self): + # Points not aligned to expected bucket boundaries (off by 1 second) + p1 = _make_mock_point(0, 0, self._WINDOW_END) + p2 = _make_mock_point(0, 86401 + 86400, self._WINDOW_END) # off by 1s from bucket + p3 = _make_mock_point(0, 86401 + 86400 * 2, self._WINDOW_END) + series = MagicMock() + series.points = [p1, p2, p3] + series.metric_kind = _METRIC_KIND_DELTA + assert self._run_query([series]) == "unresolved" + + def test_double_value_type_accepted(self): + """double_value metric kind is accepted alongside int64_value.""" + + # Use distinct aligned bucket timestamps (one point per bucket) + def _double_point(offset_seconds): + p = MagicMock() + p.value.WhichOneof.return_value = "double_value" + p.value.double_value = 0.0 + ts_dt = self._WINDOW_END - timedelta(seconds=offset_seconds) + p.interval.end_time.seconds = int(ts_dt.timestamp()) + p.interval.end_time.nanos = 0 + return p + + series = MagicMock() + series.points = [_double_point(0), _double_point(86400), _double_point(2 * 86400)] + series.metric_kind = _METRIC_KIND_DELTA + assert self._run_query([series]) == "confirmed_zero" + + def test_metric_kind_non_delta_returns_unresolved(self): + """spec 8.3 point 5: GAUGE or CUMULATIVE metric kind must return unresolved.""" + series = self._make_series([0, 0, 0]) + series.metric_kind = int(metric_pb2.MetricDescriptor.MetricKind.GAUGE) + assert self._run_query([series]) == "unresolved" + + def 
test_shifted_buckets_return_unresolved(self): + """spec 8.4 point 3: points not aligned to expected bucket ends are rejected.""" + # Shift all points by +1 second — evenly spaced but wrong boundaries + series = self._make_series([0, 0, 0]) + for p in series.points: + p.interval.end_time.seconds += 1 + assert self._run_query([series]) == "unresolved" + + def test_query_exception_propagates(self): + """spec 11.4: RPC failures propagate rather than silently returning 'unresolved'.""" + client = MagicMock() + client.list_time_series.side_effect = RuntimeError("network failure") + with pytest.raises(RuntimeError, match="network failure"): + _query_store_activity( + client, + _PROJECT, + "store1", + "us-central1", + _LEGACY_METRIC, + "aiplatform.googleapis.com/Featurestore", + "featurestore_id", + self._WINDOW_START, + self._WINDOW_END, + 3, + ) # --------------------------------------------------------------------------- -# Integration tests — legacy featurestores +# Legacy featurestore tests # --------------------------------------------------------------------------- class TestLegacyFeaturestore: def test_idle_high_confidence(self): - """Monitoring confirms 0 requests → HIGH confidence.""" store = _make_legacy_store(store_id="s1", node_count=2, age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) - + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) assert len(findings) == 1 f = findings[0] assert f.confidence == ConfidenceLevel.HIGH @@ -226,209 +448,425 @@ def test_idle_high_confidence(self): assert f.resource_type == "gcp.vertex.featurestore" def test_active_store_skipped(self): - """Monitoring shows non-zero requests → no finding.""" store = _make_legacy_store(store_id="s1", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 500}) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "positive_activity"}) + assert findings == [] + + def 
test_unresolved_coverage_skips_not_falls_back(self): + """Unresolved monitoring must skip — no age-only fallback (spec 8.5).""" + store = _make_legacy_store(store_id="s1", age_days=60) + findings = _run(legacy_stores=[store]) # no activity entry → unresolved assert findings == [] def test_no_online_serving_skipped(self): - """fixedNodeCount=0 → no online serving cost, skip.""" store = _make_legacy_store(store_id="s1", node_count=0, age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) assert findings == [] def test_non_stable_state_skipped(self): - """UPDATING store → not stably billable, skip.""" store = _make_legacy_store(store_id="s1", state="UPDATING", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) assert findings == [] - def test_age_based_fallback(self): - """No monitoring data + old store → LOW confidence (heuristic: age only).""" + def test_invalid_mode_both_present_skipped(self): + """Both fixedNodeCount and scaling.minNodeCount materially present → invalid.""" store = _make_legacy_store(store_id="s1", age_days=60) - findings = _run(legacy_stores=[store]) # no legacy_counts → no monitoring data + store["onlineServingConfig"] = { + "fixedNodeCount": 2, + "scaling": {"minNodeCount": 1}, + } + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert findings == [] + def test_autoscaled_store_included(self): + store = _make_legacy_store(store_id="s1", node_count=2, age_days=60, autoscaled=True) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.LOW - assert findings[0].risk == RiskLevel.MEDIUM + assert findings[0].details["legacy_serving_mode"] == "autoscaled" + assert 
findings[0].details["provisioned_node_floor"] == 2 - def test_store_too_young_no_monitoring(self): - """No monitoring data + store younger than threshold → no finding.""" - store = _make_legacy_store(store_id="s1", age_days=10) - findings = _run(legacy_stores=[store]) + def test_autoscaled_zero_min_nodes_excluded(self): + store = _make_legacy_store(store_id="s1", node_count=0, age_days=60, autoscaled=True) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) assert findings == [] - def test_cost_single_node(self): - """1-node store: $0.27/hr × 730 h/month.""" - store = _make_legacy_store(store_id="s1", node_count=1, age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + def test_too_young_for_full_window_skipped(self): + """Store created 20 days ago cannot cover a 30-day window.""" + store = _make_legacy_store(store_id="s1", age_days=20) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert findings == [] - expected_monthly = _BIGTABLE_NODE_HOURLY_COST * 1 * _HOURS_PER_MONTH - assert findings[0].estimated_monthly_cost_usd == pytest.approx(expected_monthly, rel=1e-3) - assert findings[0].details["bigtable_node_count"] == 1 + def test_old_enough_for_window_included(self): + store = _make_legacy_store(store_id="s1", age_days=45) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert len(findings) == 1 - def test_cost_three_nodes(self): - """3-node HA store: $0.27 × 3 × 730 ≈ $591/mo.""" - store = _make_legacy_store(store_id="s1", node_count=3, age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + def test_reference_time_uses_update_time_when_newer(self): + """updateTime newer than createTime → reference_time = updateTime.""" + # Store created 90 days ago, updated 20 days ago — too recent for 30-day window + store = _make_legacy_store(store_id="s1", age_days=90, update_age_days=20) + findings = 
_run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert findings == [] + + def test_reference_time_create_used_when_no_update(self): + """No updateTime → reference_time = createTime; 60-day-old store passes.""" + store = _make_legacy_store(store_id="s1", age_days=60) + assert "updateTime" not in store + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert len(findings) == 1 + + def test_monitoring_client_failure_skips_not_fallback(self): + """Monitoring client creation failure → skip with operational warning (spec 11.4).""" + store = _make_legacy_store(store_id="s1", age_days=60) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + findings = _run( + legacy_stores=[store], + legacy_activities={"s1": "confirmed_zero"}, + monitoring_client_fails=True, + ) + assert findings == [] + assert any("monitoring client creation failed" in str(w.message) for w in caught) + + def test_malformed_legacy_config_skipped(self): + """Malformed onlineServingConfig must skip item, not abort the rule (spec 11.3).""" + bad_store = _make_legacy_store(store_id="bad", age_days=60) + bad_store["onlineServingConfig"] = {"fixedNodeCount": "not-an-int"} + good_store = _make_legacy_store(store_id="good", age_days=60) + findings = _run( + legacy_stores=[bad_store, good_store], + legacy_activities={"bad": "confirmed_zero", "good": "confirmed_zero"}, + ) + assert len(findings) == 1 + assert findings[0].details["store_id"] == "good" + + def test_monitoring_query_exception_skips_with_warning(self): + """Per-store RPC failure → skip store + emit UserWarning (spec 11.4).""" + store = _make_legacy_store(store_id="s1", age_days=60) + credentials = MagicMock() + + legacy_resp = MagicMock() + legacy_resp.status_code = 200 + legacy_resp.json.return_value = {"featurestores": [store]} + legacy_resp.raise_for_status = MagicMock() + new_resp = MagicMock() + new_resp.status_code = 200 + 
new_resp.json.return_value = {"featureOnlineStores": []} + new_resp.raise_for_status = MagicMock() + mock_session = MagicMock() + mock_session.get.side_effect = lambda url, **kw: ( + new_resp if "featureOnlineStores" in url else legacy_resp + ) - expected_monthly = _BIGTABLE_NODE_HOURLY_COST * 3 * _HOURS_PER_MONTH - assert findings[0].estimated_monthly_cost_usd == pytest.approx(expected_monthly, rel=1e-3) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + with ( + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle.AuthorizedSession", + return_value=mock_session, + ), + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle" + ".monitoring_v3.MetricServiceClient", + return_value=MagicMock(), + ), + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle._query_store_activity", + side_effect=RuntimeError("simulated RPC failure"), + ), + patch("cleancloud.providers.gcp.rules.ai.featurestore_idle.datetime") as mock_dt, + ): + mock_dt.now.return_value = NOW + mock_dt.fromisoformat = datetime.fromisoformat + mock_dt.fromtimestamp = datetime.fromtimestamp + findings = find_idle_featurestores(project_id=_PROJECT, credentials=credentials) + + assert findings == [] + assert any( + "monitoring query failed" in str(w.message) and "s1" in str(w.message) for w in caught + ) def test_permission_error_on_403(self): - """403 on legacy list → PermissionError.""" with pytest.raises(PermissionError, match="aiplatform.featurestores.list"): _run(legacy_list_status=403) def test_api_not_enabled_returns_empty(self): - """404 on legacy list → no findings (not an error).""" findings = _run(legacy_list_status=404) assert findings == [] - def test_region_filter_matches(self): + def test_region_filter_exact_match(self): store = _make_legacy_store(store_id="s1", region="us-central1", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}, region_filter="us-central1") + findings = _run( + 
legacy_stores=[store], + legacy_activities={"s1": "confirmed_zero"}, + region_filter="us-central1", + ) assert len(findings) == 1 - def test_region_filter_excludes(self): + def test_region_filter_excludes_non_matching(self): store = _make_legacy_store(store_id="s1", region="europe-west4", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}, region_filter="us-central1") + findings = _run( + legacy_stores=[store], + legacy_activities={"s1": "confirmed_zero"}, + region_filter="us-central1", + ) assert findings == [] + def test_region_filter_is_exact_not_prefix(self): + """region_filter='us' must NOT match 'us-central1' (spec 7: exact equality).""" + store = _make_legacy_store(store_id="s1", region="us-central1", age_days=60) + findings = _run( + legacy_stores=[store], + legacy_activities={"s1": "confirmed_zero"}, + region_filter="us", + ) + assert findings == [] + + def test_estimated_monthly_cost_always_none(self): + store = _make_legacy_store(store_id="s1", node_count=3, age_days=60) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + assert findings[0].estimated_monthly_cost_usd is None + def test_details_fields(self): store = _make_legacy_store(store_id="s1", region="us-central1", node_count=2, age_days=45) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) d = findings[0].details assert d["store_id"] == "s1" - assert d["store_type"] == "legacy_featurestore" + assert d["store_family"] == "legacy_featurestore" + assert d["state"] == "STABLE" assert d["region"] == "us-central1" - assert d["bigtable_node_count"] == 2 - assert d["request_count"] == 0 + assert d["provisioned_node_floor"] == 2 + assert d["metric_type"] == _LEGACY_METRIC + assert d["metric_coverage_state"] == "full_window" + assert d["telemetry_state"] == "confirmed_zero" + assert d["request_count_total"] == 0 assert d["idle_days_threshold"] == 
_IDLE_DAYS - assert d["pricing_confidence"] == "published" - def test_monitoring_error_age_fallback(self): - """Monitoring raises exception → falls back to age-based detection (LOW confidence).""" - store = _make_legacy_store(store_id="s1", age_days=60) - findings = _run( - legacy_stores=[store], - monitoring_raises=Exception("monitoring down"), - ) - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.LOW - - def test_estimated_monthly_cost_set(self): - store = _make_legacy_store(store_id="s1", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) - assert findings[0].estimated_monthly_cost_usd is not None - assert findings[0].estimated_monthly_cost_usd > 0 - - def test_display_name_in_summary(self): - store = _make_legacy_store(store_id="s1", display_name="prod-features", age_days=60) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) - assert "prod-features" in findings[0].summary - - def test_custom_idle_days(self): - """Custom idle_days is respected for age-based fallback (LOW confidence).""" - store = _make_legacy_store(store_id="s1", age_days=15) - findings = _run(legacy_stores=[store], idle_days=10) - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.LOW + def test_details_fixed_node_count_present(self): + store = _make_legacy_store(store_id="s1", node_count=1, age_days=60, autoscaled=False) + findings = _run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + d = findings[0].details + assert d["legacy_serving_mode"] == "fixed" + assert "fixed_node_count" in d + assert "scaling_min_node_count" not in d - def test_autoscaled_store_included(self): - """Autoscaled legacy store (scaling.minNodeCount) is in scope.""" + def test_details_scaling_min_node_count_present_for_autoscaled(self): store = _make_legacy_store(store_id="s1", node_count=2, age_days=60, autoscaled=True) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) + findings = 
_run(legacy_stores=[store], legacy_activities={"s1": "confirmed_zero"}) + d = findings[0].details + assert d["legacy_serving_mode"] == "autoscaled" + assert "scaling_min_node_count" in d + assert "fixed_node_count" not in d + + def test_custom_idle_days_respected(self): + """idle_days=10: store 15 days old is old enough; store 8 days old is not.""" + old_enough = _make_legacy_store(store_id="s1", age_days=15) + too_young = _make_legacy_store(store_id="s2", age_days=8) + findings = _run( + legacy_stores=[old_enough, too_young], + legacy_activities={"s1": "confirmed_zero", "s2": "confirmed_zero"}, + idle_days=10, + ) assert len(findings) == 1 - f = findings[0] - assert f.confidence == ConfidenceLevel.HIGH - assert f.details["bigtable_node_count"] == 2 - assert f.details["bigtable_scaling"] == "autoscaled" - expected_monthly = _BIGTABLE_NODE_HOURLY_COST * 2 * _HOURS_PER_MONTH - assert f.estimated_monthly_cost_usd == pytest.approx(expected_monthly, rel=1e-3) - - def test_autoscaled_store_zero_min_nodes_excluded(self): - """Autoscaled store with minNodeCount=0 has no online serving cost — skip.""" - store = _make_legacy_store(store_id="s1", node_count=0, age_days=60, autoscaled=True) - findings = _run(legacy_stores=[store], legacy_counts={"s1": 0}) - assert findings == [] + assert findings[0].details["store_id"] == "s1" # --------------------------------------------------------------------------- -# Integration tests — new featureOnlineStores +# FeatureOnlineStore tests # --------------------------------------------------------------------------- class TestFeatureOnlineStore: def test_bigtable_store_idle_high_confidence(self): - """New Bigtable-backed store: monitoring idle → HIGH confidence.""" store = _make_new_store(store_id="fos1", min_nodes=2, age_days=45) - findings = _run(new_stores=[store], new_counts={"fos1": 0}) - + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) assert len(findings) == 1 f = findings[0] assert f.confidence == 
ConfidenceLevel.HIGH + assert f.risk == RiskLevel.HIGH assert f.resource_type == "gcp.vertex.feature_online_store" - assert f.details["store_type"] == "feature_online_store" - assert f.details["backing"] == "bigtable" - def test_optimized_store_idle(self): - """Optimized (BigQuery-backed) store: idle → estimated cost.""" + def test_active_new_store_skipped(self): + store = _make_new_store(store_id="fos1", age_days=45) + findings = _run(new_stores=[store], new_activities={"fos1": "positive_activity"}) + assert findings == [] + + def test_unresolved_coverage_skips(self): + store = _make_new_store(store_id="fos1", age_days=45) + findings = _run(new_stores=[store]) # no activity entry → unresolved + assert findings == [] + + def test_optimized_store_skipped(self): + """Optimized (BigQuery-backed) stores are out of scope (spec 9.3).""" store = _make_new_store(store_id="fos1", is_optimized=True, age_days=45) - findings = _run(new_stores=[store], new_counts={"fos1": 0}) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings == [] + + def test_non_stable_state_skipped(self): + store = _make_new_store(store_id="fos1", state="UPDATING", age_days=60) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings == [] + + def test_too_young_for_window_skipped(self): + store = _make_new_store(store_id="fos1", age_days=20) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings == [] + + def test_missing_autoscaling_skipped(self): + """bigtable.autoScaling absent → unusable → skip (spec 7).""" + store = _make_new_store(store_id="fos1", age_days=60, missing_autoscaling=True) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings == [] + + def test_min_nodes_zero_skipped(self): + store = _make_new_store(store_id="fos1", min_nodes=0, age_days=60) + findings = _run(new_stores=[store], new_activities={"fos1": 
"confirmed_zero"}) + assert findings == [] + def test_max_nodes_less_than_min_skipped(self): + """maxNodeCount < minNodeCount → unusable autoscaling block (spec 7).""" + store = _make_new_store(store_id="fos1", min_nodes=3, max_nodes=1, age_days=60) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings == [] + + def test_max_nodes_equal_to_min_accepted(self): + """maxNodeCount == minNodeCount is valid (single-node floor).""" + store = _make_new_store(store_id="fos1", min_nodes=1, max_nodes=1, age_days=60) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) assert len(findings) == 1 - f = findings[0] - assert f.details["backing"] == "optimized" - assert f.details["pricing_confidence"] == "estimated" - assert f.estimated_monthly_cost_usd == pytest.approx(_OPTIMIZED_STORE_MONTHLY_COST) - def test_active_new_store_skipped(self): - """New store with non-zero requests → no finding.""" - store = _make_new_store(store_id="fos1", age_days=45) - findings = _run(new_stores=[store], new_counts={"fos1": 1000}) + def test_reference_time_uses_update_time_when_newer(self): + """updateTime newer than createTime → reference_time = updateTime → too young.""" + store = _make_new_store(store_id="fos1", age_days=90, update_age_days=20) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) assert findings == [] - def test_new_store_permission_error(self): - """403 on featureOnlineStores list → PermissionError.""" + def test_permission_error_on_403(self): with pytest.raises(PermissionError, match="aiplatform.featureOnlineStores.list"): _run(new_list_status=403) - def test_new_store_not_enabled(self): - """404 on featureOnlineStores list → no findings.""" + def test_api_not_enabled_returns_empty(self): findings = _run(new_list_status=404) assert findings == [] - def test_bigtable_store_cost(self): - """Bigtable store: minNodeCount × $0.27/hr × 730 h/month.""" - store = 
_make_new_store(store_id="fos1", min_nodes=3, age_days=45) - findings = _run(new_stores=[store], new_counts={"fos1": 0}) - - expected_monthly = _BIGTABLE_NODE_HOURLY_COST * 3 * _HOURS_PER_MONTH - assert findings[0].estimated_monthly_cost_usd == pytest.approx(expected_monthly, rel=1e-3) - - def test_new_store_age_fallback(self): - """No monitoring data + old new store → LOW confidence (heuristic: age only).""" - store = _make_new_store(store_id="fos1", age_days=45) - findings = _run(new_stores=[store]) # no new_counts → no monitoring data + def test_region_filter_exact_match(self): + store = _make_new_store(store_id="fos1", region="us-central1", age_days=60) + findings = _run( + new_stores=[store], + new_activities={"fos1": "confirmed_zero"}, + region_filter="us-central1", + ) assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.LOW - def test_new_store_too_young(self): - """New store younger than threshold + no monitoring → no finding.""" - store = _make_new_store(store_id="fos1", age_days=5) - findings = _run(new_stores=[store]) + def test_region_filter_excludes(self): + store = _make_new_store(store_id="fos1", region="europe-west4", age_days=60) + findings = _run( + new_stores=[store], + new_activities={"fos1": "confirmed_zero"}, + region_filter="us-central1", + ) assert findings == [] - def test_non_stable_new_store_skipped(self): - store = _make_new_store(store_id="fos1", state="UPDATING", age_days=60) - findings = _run(new_stores=[store], new_counts={"fos1": 0}) + def test_region_filter_is_exact_not_prefix(self): + store = _make_new_store(store_id="fos1", region="us-central1", age_days=60) + findings = _run( + new_stores=[store], + new_activities={"fos1": "confirmed_zero"}, + region_filter="us", + ) assert findings == [] - def test_new_store_region_filter(self): - store = _make_new_store(store_id="fos1", region="europe-west4", age_days=45) - findings = _run(new_stores=[store], new_counts={"fos1": 0}, region_filter="us-central1") + def 
test_estimated_monthly_cost_always_none(self): + store = _make_new_store(store_id="fos1", min_nodes=3, age_days=60) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + assert findings[0].estimated_monthly_cost_usd is None + + def test_details_fields(self): + store = _make_new_store(store_id="fos1", region="us-central1", min_nodes=2, age_days=45) + findings = _run(new_stores=[store], new_activities={"fos1": "confirmed_zero"}) + d = findings[0].details + assert d["store_id"] == "fos1" + assert d["store_family"] == "feature_online_store" + assert d["state"] == "STABLE" + assert d["region"] == "us-central1" + assert d["storage_type"] == "bigtable" + assert d["bigtable_min_node_count"] == 2 + assert d["bigtable_max_node_count"] == 4 # helper sets max = min * 2 + assert d["metric_type"] == _NEW_METRIC + assert d["metric_coverage_state"] == "full_window" + assert d["telemetry_state"] == "confirmed_zero" + assert d["request_count_total"] == 0 + + def test_monitoring_client_failure_skips(self): + """Monitoring client creation failure → skip with operational warning (spec 11.4).""" + store = _make_new_store(store_id="fos1", age_days=60) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + findings = _run( + new_stores=[store], + new_activities={"fos1": "confirmed_zero"}, + monitoring_client_fails=True, + ) assert findings == [] + assert any("monitoring client creation failed" in str(w.message) for w in caught) + + def test_malformed_bigtable_config_skipped(self): + """Malformed bigtable.autoScaling must skip item, not abort the rule (spec 11.3).""" + bad_store = _make_new_store(store_id="bad", age_days=60) + bad_store["bigtable"] = {"autoScaling": {"minNodeCount": "not-an-int"}} + good_store = _make_new_store(store_id="good", min_nodes=1, age_days=60) + findings = _run( + new_stores=[bad_store, good_store], + new_activities={"bad": "confirmed_zero", "good": "confirmed_zero"}, + ) + assert len(findings) == 1 
+ assert findings[0].details["store_id"] == "good" + + def test_monitoring_query_exception_skips_with_warning(self): + """Per-store RPC failure → skip store + emit UserWarning (spec 11.4).""" + store = _make_new_store(store_id="fos1", age_days=60) + credentials = MagicMock() + + legacy_resp = MagicMock() + legacy_resp.status_code = 200 + legacy_resp.json.return_value = {"featurestores": []} + legacy_resp.raise_for_status = MagicMock() + new_resp = MagicMock() + new_resp.status_code = 200 + new_resp.json.return_value = {"featureOnlineStores": [store]} + new_resp.raise_for_status = MagicMock() + mock_session = MagicMock() + mock_session.get.side_effect = lambda url, **kw: ( + new_resp if "featureOnlineStores" in url else legacy_resp + ) + + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + with ( + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle.AuthorizedSession", + return_value=mock_session, + ), + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle" + ".monitoring_v3.MetricServiceClient", + return_value=MagicMock(), + ), + patch( + "cleancloud.providers.gcp.rules.ai.featurestore_idle._query_store_activity", + side_effect=RuntimeError("simulated RPC failure"), + ), + patch("cleancloud.providers.gcp.rules.ai.featurestore_idle.datetime") as mock_dt, + ): + mock_dt.now.return_value = NOW + mock_dt.fromisoformat = datetime.fromisoformat + mock_dt.fromtimestamp = datetime.fromtimestamp + findings = find_idle_featurestores(project_id=_PROJECT, credentials=credentials) + + assert findings == [] + assert any( + "monitoring query failed" in str(w.message) and "fos1" in str(w.message) for w in caught + ) # --------------------------------------------------------------------------- @@ -438,32 +876,32 @@ def test_new_store_region_filter(self): class TestCombined: def test_both_types_independent(self): - """Legacy and new stores both produce findings independently.""" - legacy = 
_make_legacy_store(store_id="legacy1", age_days=60) - new = _make_new_store(store_id="new1", age_days=45) + legacy = _make_legacy_store(store_id="l1", age_days=60) + new = _make_new_store(store_id="n1", age_days=45) findings = _run( legacy_stores=[legacy], new_stores=[new], - legacy_counts={"legacy1": 0}, - new_counts={"new1": 0}, + legacy_activities={"l1": "confirmed_zero"}, + new_activities={"n1": "confirmed_zero"}, ) assert len(findings) == 2 types = {f.resource_type for f in findings} assert types == {"gcp.vertex.featurestore", "gcp.vertex.feature_online_store"} def test_no_stores_returns_empty(self): - findings = _run() - assert findings == [] + assert _run() == [] def test_one_active_one_idle(self): - """Active legacy + idle new → only one finding.""" legacy = _make_legacy_store(store_id="l1", age_days=60) new = _make_new_store(store_id="n1", age_days=45) findings = _run( legacy_stores=[legacy], new_stores=[new], - legacy_counts={"l1": 1000}, # active - new_counts={"n1": 0}, # idle + legacy_activities={"l1": "positive_activity"}, + new_activities={"n1": "confirmed_zero"}, ) assert len(findings) == 1 assert findings[0].resource_type == "gcp.vertex.feature_online_store" + + def test_rule_id_attribute(self): + assert find_idle_featurestores.RULE_ID == "gcp.vertex.featurestore.idle" diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_tpu_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_tpu_idle.py index 10f7aba..7f1fae0 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_tpu_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_tpu_idle.py @@ -2,40 +2,35 @@ Tests for gcp.tpu.idle rule. 
Coverage: -- Core detection: monitoring confirms idle → HIGH confidence, HIGH/CRITICAL risk -- Age-based fallback: no monitoring data, old node → LOW confidence -- Active node (duty_cycle > threshold) → no finding -- STOPPED node → no finding (not billable) -- Permission error on 403 → raises PermissionError -- TPU API not enabled (404) → returns [] -- Region filter: nodes in other zones skipped -- Cost: per-chip × chip_count; risk CRITICAL for hourly >= $10 -- Different TPU types: V2, V4, V5LITE_POD, V5P -- acceleratorConfig preferred over legacy acceleratorType -- topology-based chip count -- Preemptible flag surfaced in signals -- Monitoring failure → age fallback (no exception raised) -- Node too young + no monitoring → no finding -- estimated_monthly_cost_usd is set (TPU is a standing resource) +- Pre-checks: state, region filter, createTime, standalone (queuedResource, multisliceNode) +- Monitoring client creation failure -> warning, all nodes skip +- Monitoring query exception -> warning, node skips +- _run_zone_diagnostic: returns None (side-effect / diagnostic only), RPC exceptions propagate, + monitoring IS queried (to surface permission errors per spec 11.1) +- No findings emitted: join (spec 8.3) cannot be proven with documented surfaces +- Permission error on 403 -> raises PermissionError +- TPU API not enabled (404) -> returns [] +- RULE_ID attribute set correctly +- Helper functions: _parse_location, _parse_node_id, _zone_to_region, _parse_rfc3339_utc, + _tpu_type_from_legacy """ +import warnings from datetime import datetime, timedelta, timezone +from typing import Optional from unittest.mock import MagicMock, patch import pytest -from cleancloud.core.confidence import ConfidenceLevel -from cleancloud.core.risk import RiskLevel from cleancloud.providers.gcp.rules.ai.tpu_idle import ( - _CHIP_HOURLY_COST, _DEFAULT_IDLE_DAYS, - _DUTY_CYCLE_IDLE_THRESHOLD, - _HOURS_PER_MONTH, - _chip_count, - _hourly_cost, + _MONITORING_BUFFER_SECONDS, _parse_location, 
_parse_node_id, + _parse_rfc3339_utc, + _run_zone_diagnostic, _tpu_type_from_legacy, + _zone_to_region, find_idle_tpu_nodes, ) @@ -47,6 +42,10 @@ _PROJECT = "my-project" _IDLE_DAYS = _DEFAULT_IDLE_DAYS # 7 +# Derived window boundaries matching find_idle_tpu_nodes logic for NOW +_WINDOW_END = NOW - timedelta(seconds=_MONITORING_BUFFER_SECONDS) +_WINDOW_START = _WINDOW_END - timedelta(seconds=_IDLE_DAYS * 86400) + # --------------------------------------------------------------------------- # Helpers @@ -64,11 +63,12 @@ def _make_node( tpu_type: str = "V4", topology: str = "2x2x1", accel_type_legacy: str = "v4-8", - chips: int = 4, age_days: float = 14.0, runtime: str = "tpu-vm-tf-2.16.1", preemptible: bool = False, description: str = "", + queued_resource: Optional[str] = None, + multislice_node: Optional[bool] = None, ) -> dict: create_dt = NOW - timedelta(days=age_days) node: dict = { @@ -81,68 +81,58 @@ def _make_node( "schedulingConfig": {"preemptible": preemptible}, "description": description, } + if queued_resource is not None: + node["queuedResource"] = queued_resource + if multislice_node is not None: + node["multisliceNode"] = multislice_node return node -def _make_ts(node_id: str, duty_cycle: float, n_points: int = _IDLE_DAYS): - """Build a mock monitoring time-series for a TPU node. - - n_points defaults to _IDLE_DAYS to satisfy the minimum-coverage check in - _fetch_duty_cycles (which requires ≥ idle_days data points). 
- """ - - def _point(v: float): - p = MagicMock() - p.value.double_value = v - return p - - ts = MagicMock() - ts.resource.labels = {"node_id": node_id} - ts.points = [_point(duty_cycle) for _ in range(n_points)] - return ts +def _make_mock_monitoring_client() -> MagicMock: + """Return a mock monitoring client that returns an empty series list.""" + client = MagicMock() + client.list_time_series.return_value = [] + return client def _run( nodes: list, - duty_cycles: dict[str, float] | None = None, region_filter=None, idle_days: int = _IDLE_DAYS, - monitoring_raises: Exception | None = None, -): - """Run find_idle_tpu_nodes with mocked HTTP and monitoring.""" - # Build mock HTTP session for node listing + monitoring_client_fails: bool = False, +) -> list: + """Run find_idle_tpu_nodes with mocked HTTP session and monitoring client.""" list_resp = MagicMock() list_resp.status_code = 200 list_resp.json.return_value = {"nodes": nodes} - mock_session_inst = MagicMock() - mock_session_inst.get.return_value = list_resp + mock_session = MagicMock() + mock_session.get.return_value = list_resp - # Build mock monitoring client - mock_monitoring_inst = MagicMock() - if monitoring_raises: - mock_monitoring_inst.list_time_series.side_effect = monitoring_raises + if monitoring_client_fails: + client_patch = patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", + side_effect=RuntimeError("client creation failed"), + ) else: - time_series = [_make_ts(nid, dc) for nid, dc in (duty_cycles or {}).items()] - mock_monitoring_inst.list_time_series.return_value = time_series - - credentials = MagicMock() + mock_client = _make_mock_monitoring_client() + client_patch = patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", + return_value=mock_client, + ) with ( patch( "cleancloud.providers.gcp.rules.ai.tpu_idle.AuthorizedSession", - return_value=mock_session_inst, - ), - patch( - 
"cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", - return_value=mock_monitoring_inst, + return_value=mock_session, ), + client_patch, patch("cleancloud.providers.gcp.rules.ai.tpu_idle.datetime") as mock_dt, ): mock_dt.now.return_value = NOW mock_dt.fromisoformat = datetime.fromisoformat findings = find_idle_tpu_nodes( project_id=_PROJECT, - credentials=credentials, + credentials=MagicMock(), region_filter=region_filter, idle_days=idle_days, ) @@ -175,6 +165,50 @@ def test_empty(self): assert _parse_node_id("") == "" +class TestZoneToRegion: + @pytest.mark.parametrize( + "zone,expected", + [ + ("us-central1-f", "us-central1"), + ("europe-west4-a", "europe-west4"), + ("northamerica-northeast1-a", "northamerica-northeast1"), + ("us-east1-b", "us-east1"), + ], + ) + def test_valid_zones(self, zone, expected): + assert _zone_to_region(zone) == expected + + def test_no_hyphen_returns_none(self): + assert _zone_to_region("somezonewithouthyphen") is None + + def test_empty_returns_none(self): + assert _zone_to_region("") is None + + +class TestParseRfc3339Utc: + def test_z_suffix(self): + dt = _parse_rfc3339_utc("2025-05-01T10:00:00Z") + assert dt is not None + assert dt.tzinfo == timezone.utc + assert dt.hour == 10 + + def test_offset_normalized_to_utc(self): + dt = _parse_rfc3339_utc("2025-05-01T12:00:00+05:30") + assert dt is not None + assert dt.tzinfo == timezone.utc + assert dt.hour == 6 + assert dt.minute == 30 + + def test_empty_returns_none(self): + assert _parse_rfc3339_utc("") is None + + def test_none_input_returns_none(self): + assert _parse_rfc3339_utc(None) is None # type: ignore[arg-type] + + def test_unparsable_returns_none(self): + assert _parse_rfc3339_utc("not-a-date") is None + + class TestTpuTypeFromLegacy: @pytest.mark.parametrize( "accel_type,expected", @@ -193,44 +227,68 @@ def test_mapping(self, accel_type, expected): assert _tpu_type_from_legacy(accel_type) == expected -class TestChipCount: - 
@pytest.mark.parametrize( - "accel_type,topology,expected", - [ - ("v2-8", None, 8), - ("v4-8", None, 8), - ("v5litepod-4", None, 4), - ("", "2x2", 4), - ("", "2x2x2", 8), - ("", "4x4", 16), - ("v4-8", "2x2x1", 4), # topology wins - ("", "", 1), - ], - ) - def test_chip_count(self, accel_type, topology, expected): - chips, _ = _chip_count(accel_type, topology) - assert chips == expected - - -class TestHourlyCost: - def test_v4_4chips(self): - cost, confidence = _hourly_cost("V4", 4) - assert cost == pytest.approx(_CHIP_HOURLY_COST["V4"] * 4) - assert confidence == "published" +# --------------------------------------------------------------------------- +# Unit tests — _run_zone_diagnostic +# --------------------------------------------------------------------------- - def test_v5p_8chips(self): - cost, confidence = _hourly_cost("V5P", 8) - assert cost == pytest.approx(_CHIP_HOURLY_COST["V5P"] * 8) - assert confidence == "published" - def test_unknown_type(self): - cost, confidence = _hourly_cost("UNKNOWN", 4) - assert cost == pytest.approx(2.00 * 4) - assert confidence == "estimated" +class TestQueryNodeActivity: + _ZONE = "us-central1-f" - def test_v6e_estimated(self): - _, confidence = _hourly_cost("V6E", 8) - assert confidence == "estimated" + def test_returns_none_side_effect_only(self): + """Function is diagnostic only (spec 11.1): returns None, not a telemetry verdict.""" + client = _make_mock_monitoring_client() + result = _run_zone_diagnostic( + client, + _PROJECT, + self._ZONE, + _WINDOW_START, + _WINDOW_END, + ) + assert result is None + + def test_monitoring_is_queried(self): + """Monitoring IS queried to surface permission and availability errors (spec 11.1).""" + client = _make_mock_monitoring_client() + _run_zone_diagnostic( + client, + _PROJECT, + self._ZONE, + _WINDOW_START, + _WINDOW_END, + ) + assert client.list_time_series.called + + def test_rpc_exception_propagates(self): + """RPC errors propagate to the caller — no swallowing.""" + client = 
MagicMock() + client.list_time_series.side_effect = RuntimeError("network error") + with pytest.raises(RuntimeError, match="network error"): + _run_zone_diagnostic( + client, + _PROJECT, + self._ZONE, + _WINDOW_START, + _WINDOW_END, + ) + + def test_returns_none_regardless_of_series_content(self): + """Returns None regardless of series content — result is never used for decisions.""" + series = MagicMock() + series.resource.labels = {"worker_id": "0"} + pt = MagicMock() + pt.value.double_value = 0.0 + series.points = [pt] + client = MagicMock() + client.list_time_series.return_value = [series] + result = _run_zone_diagnostic( + client, + _PROJECT, + self._ZONE, + _WINDOW_START, + _WINDOW_END, + ) + assert result is None # --------------------------------------------------------------------------- @@ -239,48 +297,37 @@ def test_v6e_estimated(self): class TestFindIdleTpuNodes: - def test_idle_high_confidence(self): - """Monitoring confirms idle → HIGH confidence.""" - node = _make_node(node_id="tpu-1", tpu_type="V4", topology="2x2x1", age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.005}) - - assert len(findings) == 1 - f = findings[0] - assert f.confidence == ConfidenceLevel.HIGH - assert f.rule_id == "gcp.tpu.idle" - assert f.resource_type == "gcp.tpu.node" - assert "tpu-1" in f.resource_id - - def test_active_node_skipped(self): - """Duty cycle above threshold → no finding.""" - node = _make_node(node_id="tpu-active") - findings = _run([node], duty_cycles={"tpu-active": 0.85}) + # --- No findings emitted --- + + def test_ready_node_no_finding_join_unprovable(self): + """READY node passes all pre-checks but no finding: join is unprovable (spec 8.3).""" + node = _make_node(node_id="tpu-1", age_days=14) + findings = _run([node]) + assert findings == [] + + def test_no_nodes_returns_empty(self): + findings = _run([]) assert findings == [] - def test_exactly_at_threshold_skipped(self): - """Duty cycle exactly at threshold (0.02) is NOT idle — must be 
strictly above.""" - node = _make_node(node_id="tpu-border") - # duty_cycle == threshold → active (not <=, but >) - # 0.02 is NOT > 0.02 → idle path fires — this checks the boundary - # Rule: if duty_cycle > threshold → skip. 0.02 is not > 0.02. - findings = _run([node], duty_cycles={"tpu-border": _DUTY_CYCLE_IDLE_THRESHOLD}) - assert len(findings) == 1 # At exactly threshold → flagged as idle + # --- State pre-check --- def test_stopped_node_skipped(self): - """STOPPED nodes do not incur charges and should not be flagged.""" node = _make_node(state="STOPPED", age_days=30) - findings = _run([node], duty_cycles={}) + findings = _run([node]) assert findings == [] - def test_no_nodes_returns_empty(self): - """No TPU nodes → no findings.""" - findings = _run([], duty_cycles={}) + def test_creating_state_skipped(self): + node = _make_node(state="CREATING", age_days=30) + findings = _run([node]) assert findings == [] + # --- Permission / API availability --- + def test_permission_error_on_403(self): - """403 from the TPU API → PermissionError propagated.""" + """403 from TPU API -> PermissionError propagated.""" list_resp = MagicMock() list_resp.status_code = 403 + list_resp.json.return_value = {"error": {"details": [{"reason": "PERMISSION_DENIED"}]}} mock_session = MagicMock() mock_session.get.return_value = list_resp @@ -295,7 +342,7 @@ def test_permission_error_on_403(self): find_idle_tpu_nodes(project_id=_PROJECT, credentials=MagicMock()) def test_tpu_api_not_enabled_returns_empty(self): - """404 from the TPU API (API not enabled) → empty list, no exception.""" + """404 from TPU API (API not enabled) -> empty list, no exception.""" list_resp = MagicMock() list_resp.status_code = 404 mock_session = MagicMock() @@ -311,201 +358,212 @@ def test_tpu_api_not_enabled_returns_empty(self): findings = find_idle_tpu_nodes(project_id=_PROJECT, credentials=MagicMock()) assert findings == [] - def test_age_based_fallback_no_monitoring(self): - """No monitoring data + old node → 
LOW confidence (age is a weak proxy).""" - node = _make_node(node_id="tpu-old", age_days=30) - findings = _run([node], duty_cycles={}) # no monitoring data + # --- Region filter --- + + def test_region_filter_exact_match_included_still_no_finding(self): + """Node in filtered region passes filter but still no finding (join unprovable).""" + node = _make_node(node_id="tpu-1", zone="us-central1-f", age_days=14) + findings = _run([node], region_filter="us-central1") + assert findings == [] - assert len(findings) == 1 - f = findings[0] - assert f.confidence == ConfidenceLevel.LOW - assert f.risk == RiskLevel.MEDIUM + def test_region_filter_exact_match_excluded(self): + """Node in different region is excluded at region-filter pre-check.""" + node = _make_node(node_id="tpu-1", zone="europe-west4-a", age_days=14) + findings = _run([node], region_filter="us-central1") + assert findings == [] - def test_node_too_young_no_monitoring(self): - """No monitoring data + node younger than threshold → no finding.""" - node = _make_node(node_id="tpu-new", age_days=3) - findings = _run([node], duty_cycles={}) + def test_region_filter_prefix_not_matched(self): + """Region filter is exact: 'us-central1' does not match 'us-central10-a' derived region.""" + node = _make_node(node_id="tpu-1", zone="us-central10-a", age_days=14) + findings = _run([node], region_filter="us-central1") assert findings == [] - def test_monitoring_error_falls_back_to_age(self): - """Monitoring API raises an exception → falls back to age-based detection.""" - node = _make_node(node_id="tpu-1", age_days=20) - findings = _run( - [node], - duty_cycles={}, - monitoring_raises=Exception("monitoring unavailable"), + # --- Standalone pre-checks --- + + def test_queued_resource_non_empty_skipped(self): + node = _make_node( + node_id="tpu-1", age_days=14, queued_resource="projects/p/queuedResources/q" ) - # Age-based fallback fires: 20d >= 7d → LOW (weak signal) - assert len(findings) == 1 - assert findings[0].confidence 
== ConfidenceLevel.LOW + findings = _run([node]) + assert findings == [] - def test_region_filter_matches(self): - """Node in the filtered region is included.""" - node = _make_node(node_id="tpu-1", zone="us-central1-f", age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.0}, region_filter="us-central1") - assert len(findings) == 1 + def test_queued_resource_empty_passes_precheck(self): + """Empty queuedResource string is standalone — still no finding due to join.""" + node = _make_node(node_id="tpu-1", age_days=14, queued_resource="") + findings = _run([node]) + assert findings == [] - def test_region_filter_excludes(self): - """Node in a different region is excluded.""" - node = _make_node(node_id="tpu-1", zone="europe-west4-a", age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.0}, region_filter="us-central1") + def test_multislice_true_skipped(self): + node = _make_node(node_id="tpu-1", age_days=14, multislice_node=True) + findings = _run([node]) assert findings == [] - def test_cost_v4_4chips(self): - """V4 node with 4 chips: cost = _CHIP_HOURLY_COST['V4'] × 4.""" - node = _make_node( - node_id="tpu-v4", - tpu_type="V4", - topology="2x2x1", - accel_type_legacy="v4-8", - age_days=14, - ) - # topology "2x2x1" = 4 chips - findings = _run([node], duty_cycles={"tpu-v4": 0.0}) - assert len(findings) == 1 - expected_hourly = _CHIP_HOURLY_COST["V4"] * 4 - assert findings[0].estimated_monthly_cost_usd == pytest.approx( - expected_hourly * _HOURS_PER_MONTH, rel=1e-3 - ) - assert findings[0].details["chip_count"] == 4 - - def test_risk_critical_for_expensive_node(self): - """HIGH confidence + hourly >= $10 → CRITICAL risk.""" - # V4, 4 chips: 3.22 * 4 = $12.88/hr >= $10 → CRITICAL - node = _make_node(node_id="tpu-v4", tpu_type="V4", topology="2x2x1", age_days=14) - findings = _run([node], duty_cycles={"tpu-v4": 0.0}) - assert findings[0].risk == RiskLevel.CRITICAL - - def test_risk_high_for_cheap_node(self): - """HIGH confidence + hourly < $10 → 
HIGH risk.""" - # V5LITE_POD, 4 chips: $1.20 * 4 = $4.80/hr < $10 → HIGH - node = _make_node( - node_id="tpu-v5e", - tpu_type="V5LITE_POD", - topology="2x2", - accel_type_legacy="v5litepod-4", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-v5e": 0.0}) - assert findings[0].risk == RiskLevel.HIGH + def test_multislice_false_passes_precheck(self): + """multisliceNode=False is standalone — still no finding due to join.""" + node = _make_node(node_id="tpu-1", age_days=14, multislice_node=False) + findings = _run([node]) + assert findings == [] - def test_risk_critical_for_v2_8chip(self): - """V2 8-chip node: $1.50/chip × 8 = $12/hr ≥ $10 → CRITICAL.""" - node = _make_node( - node_id="tpu-v2", - tpu_type="V2", - topology="2x4", - accel_type_legacy="v2-8", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-v2": 0.0}) - assert findings[0].risk == RiskLevel.CRITICAL + def test_malformed_queued_resource_skipped(self): + """Non-string/non-null queuedResource -> skip.""" + node = _make_node(node_id="tpu-1", age_days=14) + node["queuedResource"] = 12345 + findings = _run([node]) + assert findings == [] - def test_v5litepod_type_detected(self): - """V5LITE_POD type is correctly detected from acceleratorConfig.""" - node = _make_node( - node_id="tpu-v5e", - tpu_type="V5LITE_POD", - topology="2x4", - accel_type_legacy="v5litepod-8", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-v5e": 0.01}) - assert len(findings) == 1 - assert findings[0].details["tpu_type"] == "V5LITE_POD" - - def test_legacy_type_fallback(self): - """No acceleratorConfig.type → falls back to legacy acceleratorType.""" - node = _make_node(node_id="tpu-legacy", age_days=14) - # Remove acceleratorConfig.type - node["acceleratorConfig"] = {} - node["acceleratorType"] = "v4-8" - findings = _run([node], duty_cycles={"tpu-legacy": 0.0}) - assert len(findings) == 1 - assert findings[0].details["tpu_type"] == "V4" - - def test_topology_chip_count_used(self): - """Topology is 
preferred for chip count over acceleratorType suffix.""" - node = _make_node( - node_id="tpu-1", - tpu_type="V4", - topology="2x2x2", # 8 chips - accel_type_legacy="v4-8", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-1": 0.0}) - assert findings[0].details["chip_count"] == 8 - - def test_preemptible_in_signals(self): - """Preemptible flag is surfaced in evidence signals.""" - node = _make_node(node_id="tpu-1", preemptible=True, age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.0}) - assert len(findings) == 1 - signals_text = " ".join(findings[0].evidence.signals_used) - assert "preemptible" in signals_text.lower() - - def test_description_used_as_display_name(self): - """Node description is used in summary when present.""" - node = _make_node(node_id="tpu-1", description="my-llm-tpu", age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.0}) - assert "my-llm-tpu" in findings[0].summary - - def test_monthly_cost_set(self): - """estimated_monthly_cost_usd is always set (TPU is a standing resource).""" + def test_malformed_multislice_skipped(self): + """Non-bool/non-null multisliceNode -> skip.""" node = _make_node(node_id="tpu-1", age_days=14) - findings = _run([node], duty_cycles={"tpu-1": 0.0}) - assert findings[0].estimated_monthly_cost_usd is not None - assert findings[0].estimated_monthly_cost_usd > 0 + node["multisliceNode"] = "yes" + findings = _run([node]) + assert findings == [] - def test_details_fields(self): - """Key fields are present in finding details.""" - node = _make_node( - node_id="tpu-1", - zone="us-central1-f", - tpu_type="V4", - topology="2x2x1", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-1": 0.005}) - d = findings[0].details - assert d["zone"] == "us-central1-f" - assert d["chip_count"] == 4 - assert d["idle_days_threshold"] == _IDLE_DAYS - assert d["max_duty_cycle"] == pytest.approx(0.005) - assert d["pricing_scope"] == "us_central1_reference_not_region_adjusted" - assert 
d["pricing_confidence"] == "published" - - def test_custom_idle_days(self): - """Custom idle_days parameter is respected.""" - # Node 5 days old — below default 7d threshold but above custom 3d + # --- createTime pre-checks --- + + def test_missing_create_time_skipped(self): + node = _make_node(node_id="tpu-1", age_days=14) + del node["createTime"] + findings = _run([node]) + assert findings == [] + + def test_future_create_time_skipped(self): + node = _make_node(node_id="tpu-1", age_days=-1) + findings = _run([node]) + assert findings == [] + + def test_too_recent_skipped(self): + """Node created after window_start -> full window not coverable -> skip.""" + node = _make_node(node_id="tpu-1", age_days=3) + findings = _run([node]) + assert findings == [] + + # --- Monitoring failures --- + + def test_monitoring_client_failure_skips_all_with_warning(self): + """Monitoring client creation failure -> all nodes skip, warning issued.""" + node = _make_node(node_id="tpu-1", age_days=14) + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + findings = _run([node], monitoring_client_fails=True) + assert findings == [] + msgs = " ".join(str(warning.message) for warning in w) + assert "monitoring client creation failed" in msgs + + def test_monitoring_query_exception_skips_with_warning(self): + """Query RPC exception for a zone -> node in that zone skips, zone name in warning.""" + node = _make_node(node_id="tpu-1", zone="us-central1-f", age_days=14) + list_resp = MagicMock() + list_resp.status_code = 200 + list_resp.json.return_value = {"nodes": [node]} + mock_session = MagicMock() + mock_session.get.return_value = list_resp + + failing_client = MagicMock() + failing_client.list_time_series.side_effect = RuntimeError("rpc failed") + + with ( + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.AuthorizedSession", + return_value=mock_session, + ), + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", + 
return_value=failing_client, + ), + patch("cleancloud.providers.gcp.rules.ai.tpu_idle.datetime") as mock_dt, + ): + mock_dt.now.return_value = NOW + mock_dt.fromisoformat = datetime.fromisoformat + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + findings = find_idle_tpu_nodes(project_id=_PROJECT, credentials=MagicMock()) + assert findings == [] + msgs = " ".join(str(warning.message) for warning in w) + assert "monitoring query failed" in msgs + assert "us-central1-f" in msgs # zone-cached path warns on zone, not node ID + + def test_two_nodes_same_zone_single_monitoring_call(self): + """Two READY nodes in the same zone produce only one monitoring API call.""" + nodes = [ + _make_node(node_id="tpu-a", zone="us-central1-f", age_days=14), + _make_node(node_id="tpu-b", zone="us-central1-f", age_days=14), + ] + list_resp = MagicMock() + list_resp.status_code = 200 + list_resp.json.return_value = {"nodes": nodes} + mock_session = MagicMock() + mock_session.get.return_value = list_resp + mock_client = _make_mock_monitoring_client() + + with ( + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.AuthorizedSession", + return_value=mock_session, + ), + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", + return_value=mock_client, + ), + patch("cleancloud.providers.gcp.rules.ai.tpu_idle.datetime") as mock_dt, + ): + mock_dt.now.return_value = NOW + mock_dt.fromisoformat = datetime.fromisoformat + findings = find_idle_tpu_nodes(project_id=_PROJECT, credentials=MagicMock()) + assert findings == [] + assert mock_client.list_time_series.call_count == 1 + + def test_zone_error_cached_second_node_skipped_without_second_warning(self): + """Zone query error is cached; a second node in the same zone skips silently.""" + nodes = [ + _make_node(node_id="tpu-a", zone="us-central1-f", age_days=14), + _make_node(node_id="tpu-b", zone="us-central1-f", age_days=14), + ] + list_resp = MagicMock() + 
list_resp.status_code = 200 + list_resp.json.return_value = {"nodes": nodes} + mock_session = MagicMock() + mock_session.get.return_value = list_resp + failing_client = MagicMock() + failing_client.list_time_series.side_effect = RuntimeError("rpc failed") + + with ( + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.AuthorizedSession", + return_value=mock_session, + ), + patch( + "cleancloud.providers.gcp.rules.ai.tpu_idle.monitoring_v3.MetricServiceClient", + return_value=failing_client, + ), + patch("cleancloud.providers.gcp.rules.ai.tpu_idle.datetime") as mock_dt, + ): + mock_dt.now.return_value = NOW + mock_dt.fromisoformat = datetime.fromisoformat + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + findings = find_idle_tpu_nodes(project_id=_PROJECT, credentials=MagicMock()) + assert findings == [] + zone_warns = [x for x in w if "monitoring query failed" in str(x.message)] + assert len(zone_warns) == 1 # one warning for the zone, not one per node + assert "us-central1-f" in str(zone_warns[0].message) + assert failing_client.list_time_series.call_count == 1 # only one API call + + # --- Miscellaneous --- + + def test_multiple_ready_nodes_no_findings(self): + """Each READY node passes pre-checks but no findings (join unprovable for all).""" + nodes = [ + _make_node(node_id="tpu-a", zone="us-central1-f", age_days=14), + _make_node(node_id="tpu-b", zone="us-central1-b", age_days=20), + ] + findings = _run(nodes) + assert findings == [] + + def test_custom_idle_days_still_no_findings(self): + """Custom idle_days is respected for pre-checks but join still blocks emission.""" node = _make_node(node_id="tpu-1", age_days=5) - findings = _run([node], duty_cycles={}, idle_days=3) - assert len(findings) == 1 - assert findings[0].confidence == ConfidenceLevel.LOW - - def test_multiple_nodes_independent(self): - """Each node is evaluated independently.""" - idle_node = _make_node(node_id="tpu-idle", age_days=14) - active_node = 
_make_node(node_id="tpu-active", age_days=14) - findings = _run( - [idle_node, active_node], - duty_cycles={"tpu-idle": 0.0, "tpu-active": 0.9}, - ) - assert len(findings) == 1 - assert "tpu-idle" in findings[0].resource_id + findings = _run([node], idle_days=3) + assert findings == [] - def test_v5p_cost_and_risk(self): - """V5P node with 8 chips: $4.20 × 8 = $33.60/hr → CRITICAL.""" - node = _make_node( - node_id="tpu-v5p", - tpu_type="V5P", - topology="2x4", # 8 chips - accel_type_legacy="v5p-8", - age_days=14, - ) - findings = _run([node], duty_cycles={"tpu-v5p": 0.0}) - assert len(findings) == 1 - f = findings[0] - assert f.risk == RiskLevel.CRITICAL - expected_hourly = _CHIP_HOURLY_COST["V5P"] * 8 - assert f.details["hourly_cost_usd"] == pytest.approx(expected_hourly, rel=1e-3) + def test_rule_id_attribute(self): + assert find_idle_tpu_nodes.RULE_ID == "gcp.tpu.idle" diff --git a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_endpoint_idle.py b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_endpoint_idle.py index 23f74d6..e3d0ce7 100644 --- a/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_endpoint_idle.py +++ b/tests/cleancloud/providers/gcp/ai/test_gcp_vertex_endpoint_idle.py @@ -2,19 +2,21 @@ Tests for gcp.vertex.endpoint.idle rule. 
Coverage: -- Core detection: idle CPU endpoint (MEDIUM risk), idle GPU endpoint (HIGH risk) -- Near-idle tier: < LOW_TRAFFIC_THRESHOLD requests -> MEDIUM confidence -- Skipping logic: active endpoints, young endpoints, automaticResources (scales to zero) -- Age calculation and effective window capping -- Confidence levels: HIGH (age >= 14d), MEDIUM (age >= 10.5d or unknown age or near-idle) -- GPU detection: NVIDIA_TESLA_T4, NVIDIA_TESLA_A100, TPU_V2 -- Cost estimation: per-model accuracy, machine + GPU addon for n1 machines, bundled for a2 -- Multiple deployed models: total min_replica_count aggregated, per-model cost summed -- Monitoring: batched per location (one call per location), error -> assume active (conservative) -- Region filter: endpoints outside the filter are skipped -- Pagination: nextPageToken causes a second API call -- Permission errors: PermissionError raised on 403 from list call -- RULE_METADATA and RULE_ID attributes present +- _parse_location: valid name, missing segment, empty segment +- _parse_endpoint_id: valid name, empty string +- _parse_rfc3339_utc: Z suffix, +00:00 suffix, empty, invalid +- _classify_deployed_models: dedicated in-scope, automatic in-scope (spec 3.4), + shared-only, floor-0, malformed minReplicaCount, bad createTime, GPU detection, + mixed resource modes, multiple models +- _evaluate_endpoint_telemetry: no points, leading/trailing/interior gap > window/2 + -> unresolved, gap exactly at threshold -> complete, dense zero points, dense + nonzero points, float double_value not truncated to int +- find_idle_vertex_endpoints integration: idle CPU (MEDIUM risk), idle GPU (HIGH risk), + automatic resources in-scope (spec 3.4), scale-to-zero skipped, sharedResources skipped, + active endpoint skipped, telemetry unresolved skipped, monitoring client failure, + monitoring query failure, young endpoint skipped, region filter, pagination, + estimated_monthly_cost_usd is None, confidence always HIGH, details fields, + RULE_METADATA, 
RULE_ID attribute """ from datetime import datetime, timedelta, timezone @@ -23,1343 +25,902 @@ import pytest from cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle import ( - _DAYS_IDLE, - _DEFAULT_MACHINE_MONTHLY_COST, - _GPU_MONTHLY_COST_EACH, - _LOW_TRAFFIC_THRESHOLD, - _LOW_TRAFFIC_THRESHOLD_GPU, - _MACHINE_MONTHLY_COST, - _MIN_MONTHLY_COST_USD, + _DEFAULT_IDLE_DAYS, + _REQUEST_METRIC_RESOURCE_TYPE, + _REQUEST_METRIC_TYPE, RULE_METADATA, + _classify_deployed_models, + _evaluate_endpoint_telemetry, + _parse_endpoint_id, + _parse_location, + _parse_rfc3339_utc, find_idle_vertex_endpoints, ) # --------------------------------------------------------------------------- -# Helpers +# Shared constants # --------------------------------------------------------------------------- -NOW = datetime(2025, 6, 1, 12, 0, 0, tzinfo=timezone.utc) -_OLD = NOW - timedelta(days=30) -_YOUNG = NOW - timedelta(days=3) +NOW = datetime(2026, 1, 15, 12, 0, 0, tzinfo=timezone.utc) +_WINDOW_START = NOW - timedelta(days=_DEFAULT_IDLE_DAYS) _PROJECT = "my-project" _LOCATION = "us-central1" _ENDPOINT_ID = "1234567890" _ENDPOINT_NAME = f"projects/{_PROJECT}/locations/{_LOCATION}/endpoints/{_ENDPOINT_ID}" +# A timestamp well before the observation window (endpoint and models are "old") +_OLD = NOW - timedelta(days=30) +_OLD_STR = _OLD.strftime("%Y-%m-%dT%H:%M:%SZ") + +# A timestamp inside the observation window (endpoint is too young) +_YOUNG = NOW - timedelta(days=5) +_YOUNG_STR = _YOUNG.strftime("%Y-%m-%dT%H:%M:%SZ") + +# Telemetry: dense 3-point fixtures that satisfy all gap checks for a 14-day window. +# Gap threshold = window/2 = 7 days. 
Points: +# P1 at _WINDOW_START+1h (leading gap = 1h << 7d) +# P2 at NOW-7d (P1→P2 gap ≈ 6d23h < 7d) +# P3 at NOW-1h (P2→P3 gap ≈ 6d23h < 7d; trailing gap = 1h << 7d) +_ZERO_POINTS = [ + (0, _WINDOW_START + timedelta(hours=1)), + (0, NOW - timedelta(days=7)), + (0, NOW - timedelta(hours=1)), +] +_ACTIVE_POINTS = [ + (0, _WINDOW_START + timedelta(hours=1)), + (42, NOW - timedelta(days=7)), + (0, NOW - timedelta(hours=1)), +] + + +# --------------------------------------------------------------------------- +# Endpoint / deployed-model builders +# --------------------------------------------------------------------------- + def _endpoint( endpoint_id: str = _ENDPOINT_ID, location: str = _LOCATION, project: str = _PROJECT, - display_name: str = "my-model-endpoint", - create_time: datetime = _OLD, - min_replica_count: int = 1, - machine_type: str = "n1-standard-4", - accelerator_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", - accelerator_count: int = 0, - use_automatic_resources: bool = False, + display_name: str = "my-endpoint", + create_time_str: str = _OLD_STR, deployed_models: list = None, ) -> dict: """Build a minimal Vertex AI endpoint response dict.""" name = f"projects/{project}/locations/{location}/endpoints/{endpoint_id}" - create_str = create_time.strftime("%Y-%m-%dT%H:%M:%SZ") - - if deployed_models is not None: - return { - "name": name, - "displayName": display_name, - "createTime": create_str, - "deployedModels": deployed_models, - } + return { + "name": name, + "displayName": display_name, + "createTime": create_time_str, + "deployedModels": deployed_models if deployed_models is not None else [], + } + - model: dict = {"id": "model-abc"} - if use_automatic_resources: - model["automaticResources"] = {"minReplicaCount": 0, "maxReplicaCount": 4} - else: - model["dedicatedResources"] = { +def _dedicated( + min_replica: int = 1, + machine_type: str = "n1-standard-4", + accel_type: str = "ACCELERATOR_TYPE_UNSPECIFIED", + accel_count: int = 0, + create_time_str: 
str = _OLD_STR, +) -> dict: + return { + "id": "m1", + "createTime": create_time_str, + "dedicatedResources": { "machineSpec": { "machineType": machine_type, - "acceleratorType": accelerator_type, - "acceleratorCount": accelerator_count, + "acceleratorType": accel_type, + "acceleratorCount": accel_count, }, - "minReplicaCount": min_replica_count, - "maxReplicaCount": min_replica_count + 2, - } + "minReplicaCount": min_replica, + "maxReplicaCount": max(min_replica, 1) + 2, + }, + } + +def _automatic(min_replica: int = 1, create_time_str: str = _OLD_STR) -> dict: return { - "name": name, - "displayName": display_name, - "createTime": create_str, - "deployedModels": [model], + "id": "m2", + "createTime": create_time_str, + "automaticResources": { + "minReplicaCount": min_replica, + "maxReplicaCount": min_replica + 3, + }, } -def _make_monitoring_client(request_counts: dict = None, error: bool = False): - """ - Build a mock monitoring client for the batch query API. +def _shared(create_time_str: str = _OLD_STR) -> dict: + return { + "id": "m3", + "createTime": create_time_str, + "sharedResources": "projects/p/locations/l/deploymentResourcePools/pool1", + } - request_counts: {endpoint_id: total_request_count} - {} -> no activity (all idle) - {"1234567890": 5} -> endpoint has 5 requests (near-idle if < threshold) - {"1234567890": 42} -> endpoint is active (>= threshold) - error=True: list_time_series raises an exception. 
- """ - client = MagicMock() - if error: - client.list_time_series.side_effect = Exception("monitoring unavailable") - return client - - series_list = [] - for ep_id, count in (request_counts or {}).items(): - series = MagicMock() - point = MagicMock() - point.value.int64_value = count - point.value.double_value = 0.0 - series.points = [point] - series.resource = MagicMock() - series.resource.labels = {"endpoint_id": ep_id} - series_list.append(series) - - client.list_time_series.return_value = series_list - return client + +# --------------------------------------------------------------------------- +# Integration test runner +# --------------------------------------------------------------------------- def _run( endpoints: list, - has_activity: bool = False, - request_counts: dict = None, + telemetry: dict = None, + query_fails: bool = False, + client_fails: bool = False, region_filter: str = None, - monitoring_error: bool = False, -): - """Helper: patch _list_endpoints and monitoring, run the rule.""" - mock_session = MagicMock() - mock_credentials = MagicMock() - - # Determine effective request counts - if request_counts is not None: - effective_counts = request_counts - elif has_activity: - # 42 is well above _LOW_TRAFFIC_THRESHOLD (10) — unambiguously active - effective_counts = {_ENDPOINT_ID: 42} - else: - effective_counts = {} # all idle - - monitoring_client = _make_monitoring_client( - request_counts=effective_counts, - error=monitoring_error, + idle_days: int = _DEFAULT_IDLE_DAYS, +) -> list: + """ + Run find_idle_vertex_endpoints with mocked dependencies. 
+ + telemetry: {endpoint_id: [(value, timestamp), ...]} + None -> no series returned (telemetry_coverage_state = unresolved) + {} -> empty dict (same) + {id: _ZERO_POINTS} -> zero requests, coverage complete + {id: _ACTIVE_POINTS} -> nonzero requests, coverage complete -> not idle + query_fails: True -> _query_location_request_counts returns None (query failure) + client_fails: True -> MetricServiceClient() raises (all endpoints skip) + """ + + def _mock_query(client, project_id, location, ws, we, eids): + if query_fails: + return None + if telemetry is None: + return {} + return {eid: pts for eid, pts in telemetry.items() if eid in eids} + + client_factory = ( + MagicMock(side_effect=Exception("client init failed")) + if client_fails + else MagicMock(return_value=MagicMock()) ) + module = "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle" with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle._list_endpoints", - return_value=endpoints, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - ) as mock_dt, + patch(f"{module}._list_endpoints", return_value=endpoints), + patch(f"{module}.monitoring_v3.MetricServiceClient", client_factory), + patch(f"{module}.AuthorizedSession", return_value=MagicMock()), + patch(f"{module}._query_location_request_counts", side_effect=_mock_query), + patch(f"{module}.datetime") as mock_dt, ): mock_dt.now.return_value = NOW mock_dt.fromisoformat.side_effect = datetime.fromisoformat findings = find_idle_vertex_endpoints( project_id=_PROJECT, - credentials=mock_credentials, + credentials=MagicMock(), region_filter=region_filter, + idle_days=idle_days, ) return findings -# 
--------------------------------------------------------------------------- -# RULE_METADATA / RULE_ID -# --------------------------------------------------------------------------- - - -def test_rule_metadata(): - assert RULE_METADATA["id"] == "gcp.vertex.endpoint.idle" - assert RULE_METADATA["category"] == "ai" - assert RULE_METADATA["cost_impact"] == "high" - - -def test_rule_id_attribute(): - assert find_idle_vertex_endpoints.RULE_ID == "gcp.vertex.endpoint.idle" - - -# --------------------------------------------------------------------------- -# Core detection — idle CPU endpoint -# --------------------------------------------------------------------------- - - -def test_idle_cpu_endpoint_flagged(): - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - findings = _run([ep]) - - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "gcp.vertex.endpoint.idle" - assert f.provider == "gcp" - assert f.resource_type == "gcp.vertex.endpoint" - assert f.resource_id == _ENDPOINT_NAME - assert f.region == _LOCATION - assert f.risk.value == "medium" - assert f.confidence.value == "high" - assert f.estimated_monthly_cost_usd == _MACHINE_MONTHLY_COST["n1-standard-4"] * 1 - - -def test_idle_gpu_endpoint_flagged_high_risk(): - ep = _endpoint( - machine_type="n1-standard-4", - accelerator_type="NVIDIA_TESLA_T4", - accelerator_count=1, - min_replica_count=1, - ) - findings = _run([ep]) +# =========================================================================== +# Unit tests: _parse_location +# =========================================================================== - assert len(findings) == 1 - f = findings[0] - assert f.risk.value == "high" - assert f.confidence.value == "high" - # Cost = machine + GPU addon - expected_cost = ( - _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"] - ) * 1 - assert f.estimated_monthly_cost_usd == pytest.approx(expected_cost) +def test_parse_location_valid_name(): + name = 
"projects/p/locations/us-central1/endpoints/123" + assert _parse_location(name) == "us-central1" -def test_idle_a2_gpu_endpoint_no_double_count(): - """a2-highgpu machines already include GPU cost — no addon should be added.""" - ep = _endpoint( - machine_type="a2-highgpu-1g", - accelerator_type="NVIDIA_TESLA_A100", - accelerator_count=1, - min_replica_count=1, - ) - findings = _run([ep]) - assert len(findings) == 1 - f = findings[0] - assert f.risk.value == "high" - # Cost = machine only (GPU bundled) - assert f.estimated_monthly_cost_usd == pytest.approx(_MACHINE_MONTHLY_COST["a2-highgpu-1g"]) +def test_parse_location_missing_segment_returns_none(): + assert _parse_location("projects/p/endpoints/123") is None -# --------------------------------------------------------------------------- -# Near-idle detection tier -# --------------------------------------------------------------------------- +def test_parse_location_empty_string_returns_none(): + assert _parse_location("") is None -def test_near_idle_endpoint_flagged_medium_confidence(): - """Endpoint with low but non-zero traffic is flagged as near-idle at MEDIUM confidence.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - count = _LOW_TRAFFIC_THRESHOLD - 1 # just below threshold - findings = _run([ep], request_counts={_ENDPOINT_ID: count}) +def test_parse_location_segment_present_but_empty_returns_none(): + # "locations/" with no value after it -> parts has "" after "locations" + assert _parse_location("projects/p/locations/") is None - assert len(findings) == 1 - f = findings[0] - assert f.confidence.value == "medium" - assert "near-idle" in f.title.lower() or str(count) in f.title - assert f.details["request_count"] == count +# =========================================================================== +# Unit tests: _parse_endpoint_id +# =========================================================================== -def test_near_idle_with_single_request_flagged(): - """Even 1 request in 
14 days is near-idle.""" - ep = _endpoint() - findings = _run([ep], request_counts={_ENDPOINT_ID: 1}) - assert len(findings) == 1 - assert findings[0].details["request_count"] == 1 - assert findings[0].confidence.value == "medium" +def test_parse_endpoint_id_valid(): + name = "projects/p/locations/us-central1/endpoints/9876" + assert _parse_endpoint_id(name) == "9876" -def test_above_threshold_endpoint_active_skipped(): - """Endpoint with >= effective_threshold requests is considered active and skipped.""" - ep = _endpoint() - # Single replica, CPU: effective_threshold = 10 * 1 = 10 - findings = _run([ep], request_counts={_ENDPOINT_ID: _LOW_TRAFFIC_THRESHOLD}) - assert findings == [] +def test_parse_endpoint_id_empty_string(): + assert _parse_endpoint_id("") == "" -def test_near_idle_confidence_medium_even_if_old(): - """Near-idle endpoints are capped at MEDIUM even if age >= 14 days.""" - ep = _endpoint(create_time=NOW - timedelta(days=30)) - findings = _run([ep], request_counts={_ENDPOINT_ID: 3}) +# =========================================================================== +# Unit tests: _parse_rfc3339_utc +# =========================================================================== - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" # not HIGH — has some traffic +def test_parse_rfc3339_utc_z_suffix(): + dt = _parse_rfc3339_utc("2026-01-01T00:00:00Z") + assert dt is not None + assert dt.tzinfo == timezone.utc + assert dt.year == 2026 -def test_gpu_endpoint_near_idle_at_lower_threshold(): - """GPU endpoints use a lower threshold — flagged near-idle at count < GPU threshold.""" - ep = _endpoint( - machine_type="n1-standard-4", - accelerator_type="NVIDIA_TESLA_T4", - accelerator_count=1, - min_replica_count=1, - ) - count = _LOW_TRAFFIC_THRESHOLD_GPU - 1 # near-idle for GPU (below 5) - findings = _run([ep], request_counts={_ENDPOINT_ID: count}) - assert len(findings) == 1 - f = findings[0] - assert f.confidence.value == "medium" - assert 
f.details["request_count"] == count - assert "gpu-adjusted" in f.evidence.signals_used[0].lower() +def test_parse_rfc3339_utc_plus00(): + dt = _parse_rfc3339_utc("2026-01-01T00:00:00+00:00") + assert dt is not None + assert dt.tzinfo == timezone.utc -def test_gpu_endpoint_at_gpu_threshold_active(): - """A GPU endpoint at exactly the GPU threshold is active (not near-idle).""" - ep = _endpoint( - machine_type="n1-standard-4", - accelerator_type="NVIDIA_TESLA_T4", - accelerator_count=1, - min_replica_count=1, - ) - # Single replica GPU: effective_threshold = 5 * 1 = 5; count=5 -> active - findings = _run([ep], request_counts={_ENDPOINT_ID: _LOW_TRAFFIC_THRESHOLD_GPU}) - assert findings == [] +def test_parse_rfc3339_utc_empty_string_returns_none(): + assert _parse_rfc3339_utc("") is None -def test_cpu_endpoint_not_affected_by_gpu_threshold(): - """CPU endpoint at count 7 is still near-idle (CPU threshold is 10).""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) # no GPU - count = _LOW_TRAFFIC_THRESHOLD_GPU + 2 # 7: near-idle for CPU (below 10) - findings = _run([ep], request_counts={_ENDPOINT_ID: count}) +def test_parse_rfc3339_utc_invalid_string_returns_none(): + assert _parse_rfc3339_utc("not-a-timestamp") is None - assert len(findings) == 1 - assert findings[0].details["request_count"] == count - assert "gpu-adjusted" not in findings[0].evidence.signals_used[0].lower() +def test_parse_rfc3339_utc_none_returns_none(): + assert _parse_rfc3339_utc(None) is None -def test_replica_aware_threshold_scales_with_replicas(): - """3-replica CPU endpoint: threshold = int(10 * sqrt(3)) = 17; count=16 -> near-idle.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=3) - # sqrt(3) ≈ 1.732 -> threshold = int(10 * 1.732) = 17 - findings = _run([ep], request_counts={_ENDPOINT_ID: 16}) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["effective_threshold"] == 17 +# 
=========================================================================== +# Unit tests: _classify_deployed_models +# =========================================================================== -def test_replica_aware_threshold_active_when_above_scaled(): - """3-replica CPU endpoint: count >= sqrt-scaled threshold -> active.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=3) - findings = _run([ep], request_counts={_ENDPOINT_ID: 17}) # at threshold -> active - assert findings == [] - - -def test_sublinear_scaling_prevents_over_leniency(): - """20-replica endpoint: threshold = int(10 * sqrt(20)) ≈ 44, not 200.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=20) - expected_threshold = int(10 * max(1.0, 20**0.5)) # int(44.7) = 44 - - # At threshold -> active - assert _run([ep], request_counts={_ENDPOINT_ID: expected_threshold}) == [] - # Below threshold -> near-idle - findings = _run([ep], request_counts={_ENDPOINT_ID: expected_threshold - 1}) - assert len(findings) == 1 - assert findings[0].details["effective_threshold"] == expected_threshold - +def test_classify_empty_list(): + result = _classify_deployed_models([], now=NOW) + assert result["skip"] is False + assert result["provisioned_floor"] == 0 + assert result["in_scope_count"] == 0 + assert result["has_accelerator"] is False -def test_no_monitoring_data_unknown_age_skipped(): - """No monitoring data + unknown age = too many unknowns -> skip.""" - ep = _endpoint() - ep["createTime"] = "" # unknown age - # Empty counts -> no_monitoring_data=True - findings = _run([ep], request_counts={}) - assert findings == [] +def test_classify_dedicated_min_replica_one_in_scope(): + models = [_dedicated(min_replica=1)] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is False + assert result["provisioned_floor"] == 1 + assert result["in_scope_count"] == 1 + assert result["capacity_floor_start"] is not None -def 
test_no_monitoring_data_known_age_still_flagged(): - """No monitoring data + age ≥ 2×idle_threshold -> still flagged (age well-established).""" - # Require age >= 2*14=28 before trusting absence of metrics as evidence of idleness - ep = _endpoint(create_time=NOW - timedelta(days=30)) # age=30 ≥ 28 - findings = _run([ep], request_counts={}) # no monitoring data - assert len(findings) == 1 - assert findings[0].details["no_monitoring_data"] is True +def test_classify_dedicated_min_replica_zero_out_of_scope(): + models = [_dedicated(min_replica=0)] + result = _classify_deployed_models(models, now=NOW) + assert result["provisioned_floor"] == 0 + assert result["in_scope_count"] == 0 -def test_high_confidence_requires_full_observation_window(): - """HIGH confidence requires age >= 14 AND effective_window == 14 AND monitoring data present.""" - # Age exactly at threshold; provide explicit 0 so monitoring data is present (no_monitoring_data=False) - ep = _endpoint(create_time=NOW - timedelta(days=14)) - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].confidence.value == "high" +def test_classify_automatic_min_replica_one_in_scope(): + """automaticResources.minReplicaCount >= 1 is in scope (spec 3.4).""" + models = [_automatic(min_replica=1)] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is False + assert result["provisioned_floor"] == 1 + assert result["in_scope_count"] == 1 + assert result["capacity_floor_start"] is not None -def test_waste_score_zero_traffic_equals_full_cost(): - """When count=0, waste_score == monthly_cost (full waste).""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - findings = _run([ep]) - assert len(findings) == 1 - f = findings[0] - assert f.details["waste_score"] == pytest.approx(f.estimated_monthly_cost_usd) +def test_classify_automatic_min_replica_zero_out_of_scope(): + """automaticResources.minReplicaCount == 0 is scale-to-zero -- out 
of scope.""" + models = [_automatic(min_replica=0)] + result = _classify_deployed_models(models, now=NOW) + assert result["provisioned_floor"] == 0 + assert result["in_scope_count"] == 0 -def test_waste_score_partial_traffic_less_than_full_cost(): - """Near-idle endpoint has waste_score < monthly_cost (partial waste).""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - count = 5 # below threshold of 10 - findings = _run([ep], request_counts={_ENDPOINT_ID: count}) - assert len(findings) == 1 - f = findings[0] - assert f.details["waste_score"] < f.estimated_monthly_cost_usd - assert f.details["waste_score"] > 0 +def test_classify_shared_resources_only(): + models = [_shared()] + result = _classify_deployed_models(models, now=NOW) + assert result["provisioned_floor"] == 0 + assert result["shared_only"] is True + assert "sharedResources" in result["resource_modes"] -def test_experiment_pattern_multi_model(): - """Multi-model endpoints get pattern='abandoned_experiment' in details.""" - ep = _endpoint( - deployed_models=[ - { - "id": "m1", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, +def test_classify_unrecognized_resource_union_skips_endpoint(): + """Model with no recognized resource union (dedicated/automatic/shared) -> skip=True (spec 9).""" + models = [ + { + "id": "m1", + "createTime": _OLD_STR, + "unknownResources": {"someField": "someValue"}, + } + ] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is True + + +def test_classify_malformed_min_replica_count_skips_endpoint(): + models = [ + { + "id": "m1", + "createTime": _OLD_STR, + "dedicatedResources": { + "machineSpec": {"machineType": "n1-standard-4"}, + "minReplicaCount": "not-a-number", }, - { - "id": "m2", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, + } + ] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is 
True + + +def test_classify_bad_create_time_skips_endpoint(): + """In-scope model with unparsable createTime -> skip=True (spec 7, 9).""" + models = [_dedicated(min_replica=1, create_time_str="bad-timestamp")] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is True + + +def test_classify_future_create_time_skips_endpoint(): + """In-scope model with future createTime -> skip=True (spec 7, 9).""" + future = (NOW + timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + models = [_dedicated(min_replica=1, create_time_str=future)] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is True + + +def test_classify_dedicated_min_replica_count_missing_skips_endpoint(): + """Missing minReplicaCount on dedicatedResources is malformed -> skip=True (spec 9).""" + models = [ + { + "id": "m1", + "createTime": _OLD_STR, + "dedicatedResources": { + "machineSpec": {"machineType": "n1-standard-4"}, + # minReplicaCount absent }, - ] - ) - findings = _run([ep]) - assert len(findings) == 1 - assert findings[0].details["pattern"] == "abandoned_experiment" + } + ] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is True + + +def test_classify_automatic_min_replica_count_missing_skips_endpoint(): + """Missing minReplicaCount on automaticResources is malformed -> skip=True (spec 9).""" + models = [ + { + "id": "m1", + "createTime": _OLD_STR, + "automaticResources": { + # minReplicaCount absent + }, + } + ] + result = _classify_deployed_models(models, now=NOW) + assert result["skip"] is True -def test_experiment_pattern_single_model_none(): - """Single-model endpoints have pattern=None in details.""" - ep = _endpoint(min_replica_count=1) - findings = _run([ep]) - assert len(findings) == 1 - assert findings[0].details["pattern"] is None +def test_classify_shared_only_false_when_dedicated_zero_present(): + """dedicated(min=0) + shared: floor=0 but shared_only=False (there IS a dedicated model).""" + models = 
[_dedicated(min_replica=0), _shared()] + result = _classify_deployed_models(models, now=NOW) + assert result["provisioned_floor"] == 0 + assert result["shared_only"] is False # dedicated model present, even if out-of-scope -# --------------------------------------------------------------------------- -# Skipping logic -# --------------------------------------------------------------------------- +def test_classify_shared_only_true_when_only_shared(): + """Only sharedResources models -> shared_only=True.""" + models = [_shared(), _shared()] + result = _classify_deployed_models(models, now=NOW) + assert result["provisioned_floor"] == 0 + assert result["shared_only"] is True -def test_active_endpoint_skipped(): - ep = _endpoint() - findings = _run([ep], has_activity=True) - assert findings == [] +def test_classify_gpu_accelerator_recognized_type(): + models = [_dedicated(min_replica=1, accel_type="NVIDIA_TESLA_T4", accel_count=1)] + result = _classify_deployed_models(models, now=NOW) + assert result["has_accelerator"] is True -def test_young_endpoint_skipped(): - ep = _endpoint(create_time=_YOUNG) - findings = _run([ep]) - assert findings == [] +def test_classify_gpu_unrecognized_type_no_accelerator(): + models = [_dedicated(min_replica=1, accel_type="CUSTOM_CHIP_XYZ", accel_count=1)] + result = _classify_deployed_models(models, now=NOW) + assert result["has_accelerator"] is False -def test_automatic_resources_endpoint_skipped(): - """automaticResources scales to zero — no always-on billing.""" - ep = _endpoint(use_automatic_resources=True) - findings = _run([ep]) - assert findings == [] +def test_classify_gpu_zero_count_no_accelerator(): + models = [_dedicated(min_replica=1, accel_type="NVIDIA_TESLA_T4", accel_count=0)] + result = _classify_deployed_models(models, now=NOW) + assert result["has_accelerator"] is False -def test_endpoint_with_no_deployed_models_skipped(): - ep = { - "name": _ENDPOINT_NAME, - "displayName": "empty", - "createTime": 
_OLD.strftime("%Y-%m-%dT%H:%M:%SZ"), - "deployedModels": [], - } - findings = _run([ep]) - assert findings == [] +def test_classify_unspecified_accel_type_no_accelerator(): + models = [_dedicated(min_replica=1, accel_type="ACCELERATOR_TYPE_UNSPECIFIED", accel_count=2)] + result = _classify_deployed_models(models, now=NOW) + assert result["has_accelerator"] is False -def test_zero_min_replica_dedicated_resources_skipped(): - """dedicatedResources with minReplicaCount=0 — no always-on billing.""" - ep = _endpoint(min_replica_count=0) - findings = _run([ep]) - assert findings == [] +def test_classify_multiple_dedicated_models_summed(): + m1 = _dedicated(min_replica=2, create_time_str=_OLD_STR) + m2 = _dedicated(min_replica=3, create_time_str=_OLD_STR) + result = _classify_deployed_models([m1, m2], now=NOW) + assert result["provisioned_floor"] == 5 + assert result["in_scope_count"] == 2 -# --------------------------------------------------------------------------- -# Confidence levels -# --------------------------------------------------------------------------- +def test_classify_multiple_models_capacity_floor_start_is_max(): + older = (NOW - timedelta(days=20)).strftime("%Y-%m-%dT%H:%M:%SZ") + newer = (NOW - timedelta(days=10)).strftime("%Y-%m-%dT%H:%M:%SZ") + m1 = _dedicated(min_replica=1, create_time_str=older) + m2 = _dedicated(min_replica=1, create_time_str=newer) + result = _classify_deployed_models([m1, m2], now=NOW) + expected = NOW - timedelta(days=10) + assert abs((result["capacity_floor_start"] - expected).total_seconds()) < 2 + + +def test_classify_mixed_dedicated_and_automatic(): + m1 = _dedicated(min_replica=1) + m2 = _automatic(min_replica=1) + result = _classify_deployed_models([m1, m2], now=NOW) + assert result["provisioned_floor"] == 2 + assert result["in_scope_count"] == 2 + assert "dedicatedResources" in result["resource_modes"] + assert "automaticResources" in result["resource_modes"] -def test_confidence_high_when_age_gte_threshold(): - """HIGH 
confidence when age > idle threshold and monitoring data confirms zero traffic.""" - ep = _endpoint(create_time=NOW - timedelta(days=_DAYS_IDLE + 5)) - # Explicit 0-count series so monitoring data is present (no_monitoring_data=False) - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].confidence.value == "high" +def test_classify_resource_modes_none_when_no_models(): + result = _classify_deployed_models([], now=NOW) + assert result["resource_modes"] == "none" -def test_confidence_medium_when_age_in_75_percent_window(): - """age=80% of threshold -> MEDIUM, but only when monitoring data is present. +# =========================================================================== +# Unit tests: _evaluate_endpoint_telemetry +# =========================================================================== - When no monitoring data exists and age < idle threshold, the stricter guard skips - the endpoint to avoid false positives from metric lag. Provide explicit zero-count - series to confirm that MEDIUM confidence fires when monitoring confirms zero traffic. 
- """ - age = int(_DAYS_IDLE * 0.80) # 80% of threshold -> MEDIUM - ep = _endpoint(create_time=NOW - timedelta(days=age)) - # Explicit 0-count series -> no_monitoring_data=False (confirmed zero traffic) - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" +def test_evaluate_no_points_returns_unresolved(): + cs, ts, mr = _evaluate_endpoint_telemetry([], _WINDOW_START, NOW) + assert cs == "unresolved" + assert ts == "unresolved" + assert mr == 0 -def test_confidence_medium_when_age_unknown(): - """Missing createTime -> age unknown -> MEDIUM confidence (when monitoring data IS present).""" - ep = _endpoint() - ep["createTime"] = "" # No timestamp — age unknown - # Provide explicit 0-count series so no_monitoring_data=False; only age is unknown - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" +def test_evaluate_large_leading_gap_unresolved(): + """Single late-window point: leading gap >> window/2 -> unresolved (spec 8.3.6, 8.3.8).""" + # Only one point near NOW; gap from window_start is ~14d >> 7d threshold + points = [(0, NOW - timedelta(hours=1))] + cs, ts, mr = _evaluate_endpoint_telemetry(points, _WINDOW_START, NOW) + assert cs == "unresolved" + assert ts == "unresolved" -def test_borderline_age_below_75_percent_skipped(): - """age < 75% of threshold — too borderline -> skip.""" - age = int(_DAYS_IDLE * 0.60) # 60% — below 75% cutoff - ep = _endpoint(create_time=NOW - timedelta(days=age)) - findings = _run([ep]) - assert findings == [] + +def test_evaluate_large_trailing_gap_unresolved(): + """Single early-window point: trailing gap >> window/2 -> unresolved (spec 8.3.6, 8.3.8).""" + # Only one point near window_start; gap to window_end is ~14d >> 7d threshold + points = [(0, _WINDOW_START + timedelta(hours=1))] + cs, ts, mr = _evaluate_endpoint_telemetry(points, _WINDOW_START, NOW) + assert cs == 
"unresolved" + assert ts == "unresolved" -# --------------------------------------------------------------------------- -# Effective window capping -# --------------------------------------------------------------------------- +def test_evaluate_large_interior_gap_unresolved(): + """Two points with huge gap between them -> unresolved (spec 8.3.6, 8.3.8).""" + # P1 near start (leading gap small), P2 near end (trailing gap small) + # But the P1->P2 interior gap is ~14d >> 7d threshold + points = [ + (0, _WINDOW_START + timedelta(hours=1)), + (0, NOW - timedelta(hours=1)), + ] + cs, ts, mr = _evaluate_endpoint_telemetry(points, _WINDOW_START, NOW) + assert cs == "unresolved" + assert ts == "unresolved" -def test_effective_window_capped_to_age(): - """If endpoint is 10 days old, effective window = 10, not 14.""" - ep = _endpoint(create_time=NOW - timedelta(days=10)) - findings = _run([ep]) - if findings: - f = findings[0] - assert f.details["idle_window_days"] == 10 +def test_evaluate_dense_zero_points_complete(): + """Three points spanning window with all gaps < window/2 -> complete, no requests.""" + cs, ts, mr = _evaluate_endpoint_telemetry(_ZERO_POINTS, _WINDOW_START, NOW) + assert cs == "complete" + assert ts == "no_observed_prediction_requests" + assert mr == 0 + + +def test_evaluate_dense_nonzero_point_observed(): + """Dense points with nonzero value -> complete, observed requests.""" + cs, ts, mr = _evaluate_endpoint_telemetry(_ACTIVE_POINTS, _WINDOW_START, NOW) + assert cs == "complete" + assert ts == "observed_prediction_requests" + assert mr == 42 + + +def test_evaluate_gap_exactly_at_threshold_is_complete(): + """Gaps exactly equal to threshold (not strictly greater) -> coverage complete.""" + # Window: 6h, threshold = 3h + # P1 at window_start+0s (leading gap = 0), P2 at window_end (trailing gap = 0) + # Interior gap = 6h = 2 * threshold; strictly > threshold -> unresolved + # Use 3 evenly spaced points instead: gaps exactly = 3h each + short_start = NOW - 
timedelta(hours=6) + p1 = (0, short_start) # leading gap = 0 + p2 = (0, short_start + timedelta(hours=3)) # interior gap = 3h = threshold + p3 = (0, NOW) # interior gap = 3h, trailing = 0 + cs, ts, mr = _evaluate_endpoint_telemetry([p1, p2, p3], short_start, NOW) + assert cs == "complete" # gaps == threshold, not > threshold + + +def test_evaluate_float_value_above_zero_observed(): + """A double_value of 0.7 (not truncated to int 0) -> observed_prediction_requests.""" + points = [ + (0, _WINDOW_START + timedelta(hours=1)), + (0.7, NOW - timedelta(days=7)), + (0, NOW - timedelta(hours=1)), + ] + cs, ts, mr = _evaluate_endpoint_telemetry(points, _WINDOW_START, NOW) + assert cs == "complete" + assert ts == "observed_prediction_requests" + assert mr == pytest.approx(0.7) + + +# =========================================================================== +# Integration tests: RULE_METADATA / RULE_ID +# =========================================================================== + + +def test_rule_metadata_id(): + assert RULE_METADATA["id"] == "gcp.vertex.endpoint.idle" -def test_effective_window_too_small_skipped(): - """If effective window < 3 days, skip the endpoint.""" - ep = _endpoint(create_time=NOW - timedelta(days=2)) - findings = _run([ep]) - assert findings == [] +def test_rule_metadata_category(): + assert RULE_METADATA["category"] == "ai" -# --------------------------------------------------------------------------- -# GPU detection -# --------------------------------------------------------------------------- +def test_rule_id_attribute(): + assert find_idle_vertex_endpoints.RULE_ID == "gcp.vertex.endpoint.idle" -@pytest.mark.parametrize( - "accel_type", - [ - "NVIDIA_TESLA_T4", - "NVIDIA_TESLA_V100", - "NVIDIA_TESLA_A100", - "NVIDIA_L4", - "TPU_V2", - "NVIDIA_H100_80GB", - ], -) -def test_gpu_accelerator_types_detected(accel_type): - ep = _endpoint( - machine_type="n1-standard-4", - accelerator_type=accel_type, - accelerator_count=1, - ) - findings = 
_run([ep]) - assert len(findings) == 1 - assert findings[0].risk.value == "high" - assert findings[0].details["is_gpu"] is True +# =========================================================================== +# Integration tests: idle endpoint detection +# =========================================================================== -# --------------------------------------------------------------------------- -# Multiple deployed models -# --------------------------------------------------------------------------- - +def test_idle_cpu_endpoint_emits_finding(): + """Dedicated CPU endpoint with minReplica=1, zero requests -> MEDIUM risk finding.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) -def test_multiple_deployed_models_total_replicas_aggregated(): - """Total min_replica_count is summed across all deployed models.""" - ep = _endpoint( - deployed_models=[ - { - "id": "m1", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 2, - "maxReplicaCount": 5, - }, - }, - { - "id": "m2", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - "maxReplicaCount": 3, - }, - }, - ] - ) - findings = _run([ep]) assert len(findings) == 1 - assert findings[0].details["min_replica_count"] == 3 + f = findings[0] + assert f.rule_id == "gcp.vertex.endpoint.idle" + assert f.provider == "gcp" + assert f.resource_type == "gcp.vertex.endpoint" + assert f.resource_id == _ENDPOINT_NAME + assert f.region == _LOCATION + assert f.risk.value == "medium" + assert f.confidence.value == "high" -def test_mixed_dedicated_and_automatic_resources(): - """Only dedicated models contribute to min_replica_count.""" +def test_idle_gpu_endpoint_emits_high_risk(): + """Dedicated GPU endpoint -> HIGH risk.""" ep = _endpoint( - deployed_models=[ - { - "id": "m1", - "automaticResources": {"minReplicaCount": 0, "maxReplicaCount": 4}, - }, - 
{ - "id": "m2", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-8"}, - "minReplicaCount": 1, - "maxReplicaCount": 4, - }, - }, - ] + deployed_models=[_dedicated(min_replica=1, accel_type="NVIDIA_TESLA_T4", accel_count=1)] ) - findings = _run([ep]) - assert len(findings) == 1 - assert findings[0].details["min_replica_count"] == 1 - + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) -def test_multi_model_cost_accurate_per_model(): - """Cost is summed per deployed model, not first-machine-type × total replicas.""" - ep = _endpoint( - deployed_models=[ - { - "id": "m1", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, - }, - { - "id": "m2", - "dedicatedResources": { - "machineSpec": { - "machineType": "n1-standard-4", - "acceleratorType": "NVIDIA_TESLA_T4", - "acceleratorCount": 1, - }, - "minReplicaCount": 1, - }, - }, - ] - ) - findings = _run([ep]) assert len(findings) == 1 - # m1: n1-standard-4 × 1 = 138; m2: (138 + 311) × 1 = 449; total = 587 - expected = _MACHINE_MONTHLY_COST["n1-standard-4"] + ( - _MACHINE_MONTHLY_COST["n1-standard-4"] + _GPU_MONTHLY_COST_EACH["NVIDIA_TESLA_T4"] - ) - assert findings[0].estimated_monthly_cost_usd == pytest.approx(expected) - - -# --------------------------------------------------------------------------- -# Cost estimation -# --------------------------------------------------------------------------- + assert findings[0].risk.value == "high" + assert findings[0].confidence.value == "high" -def test_unknown_machine_type_uses_default_cost(): - ep = _endpoint(machine_type="custom-unknown-type") - findings = _run([ep]) +def test_estimated_monthly_cost_is_none(): + """spec 6.4: pricing varies; estimated_monthly_cost_usd must be None always.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) assert len(findings) == 1 - assert findings[0].estimated_monthly_cost_usd 
== _DEFAULT_MACHINE_MONTHLY_COST * 1 + assert findings[0].estimated_monthly_cost_usd is None -def test_cost_scaled_by_min_replica_count(): - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=3) - findings = _run([ep]) +def test_confidence_always_high(): + """spec 10.2: confidence is HIGH for all emitted findings; no tiered fallback.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) assert len(findings) == 1 - assert findings[0].estimated_monthly_cost_usd == pytest.approx( - _MACHINE_MONTHLY_COST["n1-standard-4"] * 3 - ) - - -# --------------------------------------------------------------------------- -# Monitoring — batch behavior -# --------------------------------------------------------------------------- - - -def test_monitoring_error_assumes_active(): - """If monitoring raises an exception, conservatively assume active — don't flag.""" - ep = _endpoint() - findings = _run([ep], monitoring_error=True) - assert findings == [] + assert findings[0].confidence.value == "high" -def test_empty_timeseries_means_idle(): - """Empty timeseries (no data points) = no predictions = idle.""" - ep = _endpoint() - findings = _run([ep], has_activity=False) +def test_automatic_resources_min_replica_one_emits_finding(): + """spec 3.4: automaticResources.minReplicaCount >= 1 is in scope -> finding emitted.""" + ep = _endpoint(deployed_models=[_automatic(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) assert len(findings) == 1 + f = findings[0] + assert f.rule_id == "gcp.vertex.endpoint.idle" + assert "automaticResources" in f.details["resource_modes"] -def test_batch_monitoring_single_call_per_location(): - """Two eligible endpoints in the same location produce exactly one monitoring call.""" - ep1 = _endpoint(endpoint_id="111", display_name="ep-1") - ep2 = _endpoint(endpoint_id="222", display_name="ep-2") - - mock_session = MagicMock() - mock_credentials = 
MagicMock() - monitoring_client = _make_monitoring_client(request_counts={}) - - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle._list_endpoints", - return_value=[ep1, ep2], - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - ) as mock_dt, - ): - mock_dt.now.return_value = NOW - mock_dt.fromisoformat.side_effect = datetime.fromisoformat - findings = find_idle_vertex_endpoints( - project_id=_PROJECT, - credentials=mock_credentials, - ) - - # Both endpoints flagged, but only one monitoring call (batched by location) - assert len(findings) == 2 - assert monitoring_client.list_time_series.call_count == 1 - - -def test_batch_monitoring_separate_call_per_location(): - """Endpoints in different locations each trigger their own monitoring call.""" - ep1 = _endpoint(endpoint_id="111", location="us-central1") - ep2 = _endpoint(endpoint_id="222", location="europe-west4") - - mock_session = MagicMock() - mock_credentials = MagicMock() - monitoring_client = _make_monitoring_client(request_counts={}) - - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle._list_endpoints", - return_value=[ep1, ep2], - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - ) as mock_dt, - ): - mock_dt.now.return_value = NOW - mock_dt.fromisoformat.side_effect = datetime.fromisoformat - findings = find_idle_vertex_endpoints( - project_id=_PROJECT, - 
credentials=mock_credentials, - ) - - assert len(findings) == 2 - assert monitoring_client.list_time_series.call_count == 2 - - -# --------------------------------------------------------------------------- -# Region filter -# --------------------------------------------------------------------------- - - -def test_region_filter_excludes_other_locations(): - ep = _endpoint(location="europe-west1") - findings = _run([ep], region_filter="us-central1") +def test_automatic_resources_scale_to_zero_skipped(): + """automaticResources.minReplicaCount == 0 -> no always-deployed floor -> skipped.""" + ep = _endpoint(deployed_models=[_automatic(min_replica=0)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) assert findings == [] -def test_region_filter_includes_matching_location(): - ep = _endpoint(location="us-central1") - findings = _run([ep], region_filter="us-central1") - assert len(findings) == 1 - - -# --------------------------------------------------------------------------- -# Pagination -# --------------------------------------------------------------------------- - - -def test_pagination_fetches_all_endpoints(): - """_list_endpoints follows nextPageToken — test that both pages are combined.""" - mock_session = MagicMock() - mock_credentials = MagicMock() - monitoring_client = _make_monitoring_client(request_counts={}) - - page1 = { - "endpoints": [ - _endpoint(endpoint_id="111", display_name="ep-1"), - ], - "nextPageToken": "token-page2", - } - page2 = { - "endpoints": [ - _endpoint(endpoint_id="222", display_name="ep-2"), - ], - } - - mock_response_1 = MagicMock() - mock_response_1.status_code = 200 - mock_response_1.json.return_value = page1 - - mock_response_2 = MagicMock() - mock_response_2.status_code = 200 - mock_response_2.json.return_value = page2 - - mock_session.get.side_effect = [mock_response_1, mock_response_2] - - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - 
return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - ) as mock_dt, - ): - mock_dt.now.return_value = NOW - mock_dt.fromisoformat.side_effect = datetime.fromisoformat - findings = find_idle_vertex_endpoints( - project_id=_PROJECT, - credentials=mock_credentials, - ) - - assert len(findings) == 2 - assert mock_session.get.call_count == 2 - - -# --------------------------------------------------------------------------- -# Permission error -# --------------------------------------------------------------------------- - - -def test_403_raises_permission_error(): - mock_session = MagicMock() - mock_credentials = MagicMock() - - mock_response = MagicMock() - mock_response.status_code = 403 +def test_dedicated_scale_to_zero_skipped(): + """dedicatedResources.minReplicaCount == 0 -> skipped.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=0)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] - mock_session.get.return_value = mock_response - - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - ): - with pytest.raises(PermissionError, match="aiplatform.endpoints.list"): - find_idle_vertex_endpoints(project_id=_PROJECT, credentials=mock_credentials) +def test_shared_resources_only_skipped(): + """sharedResources only -> provisioned_floor=0 -> skipped.""" + ep = _endpoint(deployed_models=[_shared()]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_404_returns_empty(): - """404 means Vertex AI API not enabled — return empty findings, don't raise.""" - mock_session = MagicMock() - mock_credentials = MagicMock() - 
mock_response = MagicMock() - mock_response.status_code = 404 - mock_response.json.return_value = {} +def test_no_deployed_models_skipped(): + ep = _endpoint(deployed_models=[]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] - mock_session.get.return_value = mock_response - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - ): - findings = find_idle_vertex_endpoints(project_id=_PROJECT, credentials=mock_credentials) +def test_active_endpoint_skipped(): + """Any max_rate > 0 -> observed prediction requests -> skipped.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ACTIVE_POINTS}) assert findings == [] -# --------------------------------------------------------------------------- -# Finding fields -# --------------------------------------------------------------------------- +def test_telemetry_unresolved_no_points_skipped(): + """No telemetry series at all -> telemetry_coverage_state=unresolved -> skipped.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={}) # no series for endpoint + assert findings == [] -def test_finding_fields_are_complete(): - ep = _endpoint(machine_type="n1-standard-8", min_replica_count=2) - findings = _run([ep]) - assert len(findings) == 1 - f = findings[0] +def test_telemetry_unresolved_large_gap_skipped(): + """Single late-window point: leading gap >> window/2 -> unresolved -> skipped (spec 8.3.6).""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + # Only one point near window_end; leading gap ≈ idle_days - 1h >> window/2 threshold + sparse_points = [(0, NOW - timedelta(hours=1))] + findings = _run([ep], telemetry={_ENDPOINT_ID: sparse_points}) + assert findings == [] - assert 
f.provider == "gcp" - assert f.rule_id == "gcp.vertex.endpoint.idle" - assert f.resource_type == "gcp.vertex.endpoint" - assert f.resource_id == _ENDPOINT_NAME - assert f.region == _LOCATION - assert f.estimated_monthly_cost_usd > 0 - assert f.title.startswith("Idle Vertex AI Endpoint") - assert "zero predictions" in f.summary.lower() or "zero prediction" in f.summary.lower() - assert f.evidence is not None - assert len(f.evidence.signals_used) >= 2 - assert f.evidence.time_window - - d = f.details - assert d["endpoint_id"] == _ENDPOINT_ID - assert d["location"] == _LOCATION - assert d["machine_type"] == "n1-standard-8" - assert d["min_replica_count"] == 2 - assert d["idle_days_threshold"] == _DAYS_IDLE - assert d["request_count"] == 0 - assert d["cost_basis"] == "us-central1 baseline estimate" - assert "us-central1" in d["cost_variance"] # universal pricing disclaimer - assert isinstance(d["recommendations"], list) - assert len(d["recommendations"]) >= 1 - assert d["waste_score"] > 0 - - -def test_no_monitoring_data_adds_transparency_signal(): - """When no time series exist for an endpoint, a transparency signal is added.""" - ep = _endpoint() - # Empty counts dict — endpoint_id absent -> no_monitoring_data = True - findings = _run([ep], request_counts={}) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert any("no prediction request data" in s.lower() for s in signals) - assert findings[0].details["no_monitoring_data"] is True +def test_no_near_idle_findings_emitted(): + """Non-zero requests always produces no finding (no near-idle tier; spec 8.5).""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + # Low but non-zero traffic should NOT emit a finding + low_traffic_points = [(3, NOW - timedelta(hours=1))] + findings = _run([ep], telemetry={_ENDPOINT_ID: low_traffic_points}) + assert findings == [] -def test_with_monitoring_data_no_transparency_signal(): - """When count == 0 due to explicit data (series present but all 
zeros), the - transparency signal is not added — the series just happens to be empty.""" - # This case can't be distinguished from truly absent data with the current mock, - # but we verify no_monitoring_data == True when endpoint_id absent from counts. - ep = _endpoint() - findings = _run([ep], request_counts={}) - # no_monitoring_data == True because _ENDPOINT_ID not in counts - assert findings[0].details["no_monitoring_data"] is True +def test_no_missing_telemetry_fallback(): + """Missing telemetry (None dict) must not emit findings (no fallback; spec 8.5).""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry=None) + assert findings == [] -def test_eligible_endpoint_ids_guard_filters_stale_series(): - """Series for endpoint IDs not in the eligible set are ignored.""" - ep = _endpoint(endpoint_id=_ENDPOINT_ID) - stale_id = "stale-endpoint-99999" +def test_monitoring_client_failure_returns_empty(): + """MetricServiceClient() raises -> no fallback -> empty findings list.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], client_fails=True) + assert findings == [] - mock_session = MagicMock() - mock_credentials = MagicMock() - # Monitoring returns a high-count series for stale_id AND our endpoint - monitoring_client = _make_monitoring_client(request_counts={stale_id: 500, _ENDPOINT_ID: 0}) - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle._list_endpoints", - return_value=[ep], - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - ) as mock_dt, - ): - mock_dt.now.return_value = NOW - mock_dt.fromisoformat.side_effect = datetime.fromisoformat - findings = 
find_idle_vertex_endpoints( - project_id=_PROJECT, - credentials=mock_credentials, - ) - - # stale_id series should be ignored; our endpoint has 0 count -> flagged as idle - assert len(findings) == 1 - assert findings[0].details["request_count"] == 0 +def test_monitoring_query_failure_skips_location(): + """Query returns None -> skip all endpoints in that location.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], query_fails=True) + assert findings == [] -def test_multi_model_signal_added(): - """Endpoints with multiple dedicated models get an A/B-test signal.""" +def test_young_endpoint_capacity_floor_start_too_late_skipped(): + """capacity_floor_start > window_start -> full window not coverable -> skipped.""" ep = _endpoint( - deployed_models=[ - { - "id": "m1", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, - }, - { - "id": "m2", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, - }, - ] + create_time_str=_YOUNG_STR, + deployed_models=[_dedicated(min_replica=1, create_time_str=_YOUNG_STR)], ) - findings = _run([ep]) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert any("2 deployed models" in s for s in signals) - - -def test_single_model_no_multi_model_signal(): - """Single-model endpoints do not get the A/B-test signal.""" - ep = _endpoint(min_replica_count=1) - findings = _run([ep]) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert not any("deployed models" in s for s in signals) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_multiple_replicas_signal_added(): - """Endpoints with more than 1 replica get the stronger-waste-signal note.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=3) - findings = _run([ep]) - assert len(findings) == 1 - signals = 
findings[0].evidence.signals_used - assert any("3 replicas" in s for s in signals) +def test_young_deployed_model_skips_even_if_endpoint_is_old(): + """capacity_floor_start = max(endpoint, model createTimes). Young model -> skip.""" + young_model_str = _YOUNG.strftime("%Y-%m-%dT%H:%M:%SZ") + ep = _endpoint( + create_time_str=_OLD_STR, + deployed_models=[_dedicated(min_replica=1, create_time_str=young_model_str)], + ) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_single_replica_no_replicas_signal(): - """Single-replica endpoints do not get the replicas signal.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - findings = _run([ep]) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert not any("replicas configured" in s for s in signals) +def test_endpoint_missing_name_skipped(): + ep = { + "name": "", + "createTime": _OLD_STR, + "deployedModels": [_dedicated(min_replica=1)], + } + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_all_endpoints_have_cost_variance_in_details(): - """All findings carry a cost_variance disclaimer — pricing varies by region for all machine types.""" - gpu_ep = _endpoint( - machine_type="n1-standard-4", - accelerator_type="NVIDIA_TESLA_T4", - accelerator_count=1, +def test_endpoint_bad_create_time_skipped(): + ep = _endpoint( + create_time_str="not-a-timestamp", + deployed_models=[_dedicated(min_replica=1)], ) - gpu_findings = _run([gpu_ep]) - assert len(gpu_findings) == 1 - assert "us-central1" in gpu_findings[0].details["cost_variance"] - - cpu_ep = _endpoint(machine_type="n1-standard-4") - cpu_findings = _run([cpu_ep]) - assert len(cpu_findings) == 1 - assert "us-central1" in cpu_findings[0].details["cost_variance"] + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_recommendations_always_present(): - """Every finding has a non-empty 
recommendations list.""" - ep = _endpoint() - findings = _run([ep]) - assert len(findings) == 1 - recs = findings[0].details["recommendations"] - assert isinstance(recs, list) - assert len(recs) >= 2 # at minimum: switch to automaticResources + delete - assert any("automaticResources" in r for r in recs) - assert any("gcloud ai endpoints delete" in r for r in recs) +def test_endpoint_future_create_time_skipped(): + future_str = (NOW + timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%SZ") + ep = _endpoint( + create_time_str=future_str, + deployed_models=[_dedicated(min_replica=1)], + ) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_multi_replica_recommendation_included(): - """Endpoints with multiple replicas get a specific reduce-replicas recommendation.""" - ep = _endpoint(min_replica_count=3) - findings = _run([ep]) - assert len(findings) == 1 - recs = findings[0].details["recommendations"] - assert any("minReplicaCount" in r for r in recs) +def test_deployed_model_bad_create_time_skips_endpoint(): + """In-scope model with unparsable createTime -> capacity_floor_start=None -> skip.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1, create_time_str="bad")]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_experiment_pattern_recommendation_included(): - """Multi-model endpoints get a consolidate recommendation.""" +def test_malformed_min_replica_count_skips_endpoint(): + """Malformed minReplicaCount -> skip=True -> endpoint skipped.""" ep = _endpoint( deployed_models=[ { "id": "m1", + "createTime": _OLD_STR, "dedicatedResources": { "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, + "minReplicaCount": "bad", }, - }, - { - "id": "m2", - "dedicatedResources": { - "machineSpec": {"machineType": "n1-standard-4"}, - "minReplicaCount": 1, - }, - }, + } ] ) - findings = _run([ep]) - assert len(findings) == 1 - recs = 
findings[0].details["recommendations"] - assert any("Consolidate" in r or "consolidate" in r for r in recs) - - -def test_min_cost_constant_is_reasonable(): - """_MIN_MONTHLY_COST_USD is set and below the cheapest known machine type.""" - assert _MIN_MONTHLY_COST_USD > 0 - # All known machine types cost more than the filter threshold - from cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle import ( - _MACHINE_MONTHLY_COST, - ) - - assert all(cost >= _MIN_MONTHLY_COST_USD for cost in _MACHINE_MONTHLY_COST.values()) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings == [] -def test_near_idle_finding_fields(): - """Near-idle findings have correct title, request_count in details, MEDIUM confidence.""" - ep = _endpoint(machine_type="n1-standard-4", min_replica_count=1) - findings = _run([ep], request_counts={_ENDPOINT_ID: 3}) +def test_region_filter_matches_location(): + ep = _endpoint(location=_LOCATION, deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}, region_filter=_LOCATION) assert len(findings) == 1 - f = findings[0] - assert "near-idle" in f.title.lower() or "3 prediction" in f.title.lower() - assert f.confidence.value == "medium" - assert f.details["request_count"] == 3 - assert f.estimated_monthly_cost_usd > 0 - -def test_no_monitoring_data_known_age_below_threshold_skipped(): - """No monitoring data + known age < idle threshold -> skipped (stricter guard). - - Monitoring can be absent due to metric delay or permission gaps — not safe - to flag unless we have the full observation window confirmed by age. 
- """ - # age=12 is >= 7 (past young-endpoint filter) but < 14 (_DAYS_IDLE) - ep = _endpoint(create_time=NOW - timedelta(days=12)) - findings = _run([ep], request_counts={}) # absent from counts -> no_monitoring_data=True +def test_region_filter_non_matching_skipped(): + ep = _endpoint(location=_LOCATION, deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}, region_filter="europe-west1") assert findings == [] -def test_no_monitoring_data_below_double_threshold_skipped(): - """No monitoring data + age < 2×idle_threshold -> skipped (bias toward false negatives).""" - # age=20 < 28 (2*14) -> missing metrics insufficient evidence of idleness - ep = _endpoint(create_time=NOW - timedelta(days=20)) - findings = _run([ep], request_counts={}) +def test_empty_endpoints_list_returns_empty(): + findings = _run([]) assert findings == [] -def test_no_monitoring_data_at_double_threshold_flagged(): - """No monitoring data + age >= 2×idle_threshold -> flagged with transparency signal.""" - ep = _endpoint(create_time=NOW - timedelta(days=28)) # exactly 2×threshold - findings = _run([ep], request_counts={}) - assert len(findings) == 1 - assert findings[0].details["no_monitoring_data"] is True - - -def test_threshold_strategy_in_details(): - """All findings expose threshold_strategy='sqrt_replica_scaling' in details.""" - ep = _endpoint() - findings = _run([ep]) - assert len(findings) == 1 - assert findings[0].details["threshold_strategy"] == "sqrt_replica_scaling" - - -def test_sqrt_threshold_signal_in_evidence_for_multi_replica(): - """Multi-replica endpoints include a signal explaining sqrt threshold scaling.""" - ep = _endpoint(min_replica_count=3) - findings = _run([ep]) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert any("sqrt" in s.lower() or "sublinearly" in s.lower() for s in signals) +def test_two_endpoints_same_location_both_idle(): + ep1 = _endpoint(endpoint_id="111", 
deployed_models=[_dedicated(min_replica=1)]) + ep2 = _endpoint(endpoint_id="222", deployed_models=[_dedicated(min_replica=1)]) + ep1["name"] = f"projects/{_PROJECT}/locations/{_LOCATION}/endpoints/111" + ep2["name"] = f"projects/{_PROJECT}/locations/{_LOCATION}/endpoints/222" + telemetry = { + "111": _ZERO_POINTS, + "222": _ZERO_POINTS, + } + findings = _run([ep1, ep2], telemetry=telemetry) + assert len(findings) == 2 -# --------------------------------------------------------------------------- -# Metric alignment edge cases (empty / long series, recency guard) -# --------------------------------------------------------------------------- - +def test_two_endpoints_one_active_one_idle(): + ep1 = _endpoint(endpoint_id="111", deployed_models=[_dedicated(min_replica=1)]) + ep2 = _endpoint(endpoint_id="222", deployed_models=[_dedicated(min_replica=1)]) + ep1["name"] = f"projects/{_PROJECT}/locations/{_LOCATION}/endpoints/111" + ep2["name"] = f"projects/{_PROJECT}/locations/{_LOCATION}/endpoints/222" + telemetry = { + "111": _ACTIVE_POINTS, + "222": _ZERO_POINTS, + } + findings = _run([ep1, ep2], telemetry=telemetry) + assert len(findings) == 1 + assert "222" in findings[0].details["endpoint_id"] + + +def test_details_fields_present(): + """All required details keys must be present in emitted finding.""" + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert len(findings) == 1 + d = findings[0].details + + required = { + "endpoint_id", + "location", + "provisioned_serving_floor", + "in_scope_model_count", + "resource_modes", + "has_accelerator", + "capacity_floor_start", + "idle_days_threshold", + "max_observed_request_rate_per_replica", + "telemetry_coverage_state", + "telemetry_state", + } + for key in required: + assert key in d, f"Missing details key: {key}" -def _make_monitoring_client_with_empty_points(endpoint_id: str = _ENDPOINT_ID): - """Return a monitoring client whose series has 
zero points.""" - client = MagicMock() - series = MagicMock() - series.points = [] # no data points - series.resource.labels = {"endpoint_id": endpoint_id} - client.list_time_series.return_value = [series] - return client - - -def _make_monitoring_client_with_many_points(endpoint_id: str = _ENDPOINT_ID, n: int = 10): - """Return a monitoring client whose series has more than 5 points (anomalous).""" - client = MagicMock() - series = MagicMock() - points = [] - for i in range(n): - p = MagicMock() - p.value.int64_value = 1 - p.value.double_value = 0.0 - points.append(p) - series.points = points - series.resource.labels = {"endpoint_id": endpoint_id} - client.list_time_series.return_value = [series] - return client - - -def _make_monitoring_client_with_recent_traffic(endpoint_id: str = _ENDPOINT_ID): - """Return a monitoring client where the last point timestamp is within 24 hours.""" - from datetime import timedelta - - client = MagicMock() - series = MagicMock() - point = MagicMock() - point.value.int64_value = 0 - point.value.double_value = 0.0 - - # Configure interval.end_time.ToDatetime to return a real recent datetime - recent_time = NOW - timedelta(hours=6) # 6 hours ago — within 24h window - - point.interval.end_time.ToDatetime.return_value = recent_time - series.points = [point] - series.resource.labels = {"endpoint_id": endpoint_id} - client.list_time_series.return_value = [series] - return client - - -def _run_with_monitoring_client(endpoints, monitoring_client, region_filter=None): - """Run rule with a pre-built monitoring client mock.""" - mock_session = MagicMock() - mock_credentials = MagicMock() - with ( - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle._list_endpoints", - return_value=endpoints, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.monitoring_v3.MetricServiceClient", - return_value=monitoring_client, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.AuthorizedSession", - 
return_value=mock_session, - ), - patch( - "cleancloud.providers.gcp.rules.ai.vertex_endpoint_idle.datetime", - **{"now.return_value": NOW, "fromisoformat": datetime.fromisoformat}, - ), - ): - return find_idle_vertex_endpoints( - project_id=_PROJECT, - credentials=mock_credentials, - region_filter=region_filter, - ) +def test_details_telemetry_coverage_state_complete(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["telemetry_coverage_state"] == "complete" -def test_empty_series_points_treated_as_no_data(): - """Series with zero points are skipped — endpoint treated as no monitoring data. - An old (30-day) endpoint with no_monitoring_data=True and age >= _DAYS_IDLE - is still flagged (age provides sufficient evidence). The key assertion is that - no crash occurs and no_monitoring_data is correctly set to True. - """ - ep = _endpoint(create_time=NOW - timedelta(days=30)) - client = _make_monitoring_client_with_empty_points() - findings = _run_with_monitoring_client([ep], client) - # Endpoint is flagged (age=30 provides evidence) but no_monitoring_data=True - assert len(findings) == 1 - assert findings[0].details["no_monitoring_data"] is True +def test_details_telemetry_state_no_requests(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["telemetry_state"] == "no_observed_prediction_requests" -def test_long_series_points_skipped(): - """Series with >5 points are skipped as anomalous — same result as empty series.""" - ep = _endpoint(create_time=NOW - timedelta(days=30)) - client = _make_monitoring_client_with_many_points(n=10) - findings = _run_with_monitoring_client([ep], client) - # Long series skipped -> endpoint_id not in counts -> no_monitoring_data=True - assert len(findings) == 1 - assert findings[0].details["no_monitoring_data"] is True +def 
test_details_max_observed_request_rate_per_replica_zero(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["max_observed_request_rate_per_replica"] == 0 -def test_recent_traffic_spike_skipped(): - """Endpoint with traffic in the last 24h is NOT flagged — recency dominates. +def test_details_provisioned_serving_floor_correct(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=3)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["provisioned_serving_floor"] == 3 - An endpoint that received any traffic yesterday is considered active regardless - of how low the 14-day total looks. Prevents false positives on bursty workloads. - """ - ep = _endpoint(create_time=NOW - timedelta(days=30)) - client = _make_monitoring_client_with_recent_traffic() - findings = _run_with_monitoring_client([ep], client) - assert findings == [] +def test_details_has_accelerator_false_for_cpu(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["has_accelerator"] is False -def test_cron_pattern_very_low_count_is_medium(): - """count <= 2 (cron/batch heuristic) -> MEDIUM confidence regardless of age. - Very few requests over 14 days could be a weekly inference job, not abandonment. - Behavior-based (count <= 2), not age-based. 
- """ - ep = _endpoint(create_time=NOW - timedelta(days=30)) - # count=2: at the cron-protection boundary (≤ 2 = cron pattern) - findings = _run([ep], request_counts={_ENDPOINT_ID: 2}) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" +def test_details_has_accelerator_true_for_gpu(): + ep = _endpoint( + deployed_models=[_dedicated(min_replica=1, accel_type="NVIDIA_TESLA_T4", accel_count=1)] + ) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) + assert findings[0].details["has_accelerator"] is True -def test_cron_pattern_count_above_threshold_still_medium(): - """count=3 (> 2, so NOT cron pattern) -> MEDIUM because is_near_idle, not cron.""" - ep = _endpoint(create_time=NOW - timedelta(days=30)) - findings = _run([ep], request_counts={_ENDPOINT_ID: 3}) +def test_details_idle_days_threshold_matches_param(): + ep = _endpoint(deployed_models=[_dedicated(min_replica=1)]) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}, idle_days=21) + # idle_days=21 -> window_start = NOW-21d, threshold=10.5d. + # _ZERO_POINTS: P1 at NOW-14d+1h (leading gap ≈7d < 10.5d), gaps ~7d < 10.5d. Complete. + # _OLD (30d ago) < window_start (21d ago) -> endpoint is eligible. 
assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - + assert findings[0].details["idle_days_threshold"] == 21 -def test_effective_threshold_minimum_is_one(): - """effective_threshold is always >= 1, even with unusual base_threshold values.""" - # With 1 replica, threshold = max(1, int(base * 1.0)) = base (always >= 1) - ep = _endpoint(min_replica_count=1) - findings = _run([ep]) +@pytest.mark.parametrize( + "accel_type", + [ + "NVIDIA_TESLA_T4", + "NVIDIA_TESLA_V100", + "NVIDIA_TESLA_A100", + "NVIDIA_L4", + "NVIDIA_H100_80GB", + "TPU_V2", + "TPU_V3", + ], +) +def test_known_accelerator_types_produce_high_risk(accel_type): + ep = _endpoint( + deployed_models=[_dedicated(min_replica=1, accel_type=accel_type, accel_count=1)] + ) + findings = _run([ep], telemetry={_ENDPOINT_ID: _ZERO_POINTS}) assert len(findings) == 1 - assert findings[0].details["effective_threshold"] >= 1 + assert findings[0].risk.value == "high" -def test_requests_per_replica_in_details(): - """requests_per_replica is present in details and correctly computed.""" - ep = _endpoint(min_replica_count=2) - # count=0, replicas=2 -> requests_per_replica = 0 / 2 = 0.0 - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].details["requests_per_replica"] == pytest.approx(0.0) +def test_request_metric_type_constant(): + assert _REQUEST_METRIC_TYPE == "aiplatform.googleapis.com/prediction/online/request_count" -def test_cost_confidence_estimate_in_details(): - """cost_confidence='estimate' is present in all findings.""" - ep = _endpoint() - findings = _run([ep], request_counts={_ENDPOINT_ID: 0}) - assert len(findings) == 1 - assert findings[0].details["cost_confidence"] == "estimate" +def test_request_metric_resource_type_constant(): + assert _REQUEST_METRIC_RESOURCE_TYPE == "aiplatform.googleapis.com/Endpoint" diff --git a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py index 1f3d8b8..3d78489 
100644 --- a/tests/e2e/gcp/test_gcp_ai_rules_smoke.py +++ b/tests/e2e/gcp/test_gcp_ai_rules_smoke.py @@ -62,6 +62,54 @@ def test_gcp_ai_rules_run_without_error(): assert f.detected_at and isinstance(f.detected_at, datetime) +@pytest.mark.e2e +@pytest.mark.gcp +def test_vertex_endpoint_idle_returns_list_of_findings(): + """Smoke test: gcp.vertex.endpoint.idle runs without error and returns typed findings.""" + session = create_gcp_session() + projects = session.list_projects() + assert projects, "No accessible GCP projects found -- check ADC credentials" + + project_id = projects[0]["id"] + credentials = session.credentials + + try: + findings = find_idle_vertex_endpoints(project_id=project_id, credentials=credentials) + except PermissionError as e: + pytest.fail(f"Missing IAM permissions: {e}") + + assert isinstance(findings, list) + for f in findings: + assert isinstance(f, Finding) + assert f.rule_id == "gcp.vertex.endpoint.idle" + assert f.resource_type == "gcp.vertex.endpoint" + assert f.provider == "gcp" + assert f.resource_id + assert f.region + assert f.detected_at and isinstance(f.detected_at, datetime) + # spec 6.4: pricing varies; no flat estimate + assert f.estimated_monthly_cost_usd is None + # spec 10.2: confidence is always HIGH; no tiered fallback + assert f.confidence.value == "high" + assert f.risk.value in ("high", "medium") + # required details fields + assert "endpoint_id" in f.details + assert "location" in f.details + assert "provisioned_serving_floor" in f.details + assert f.details["provisioned_serving_floor"] >= 1 + assert "in_scope_model_count" in f.details + assert "resource_modes" in f.details + assert "has_accelerator" in f.details + assert "capacity_floor_start" in f.details + assert "idle_days_threshold" in f.details + assert "max_observed_request_rate_per_replica" in f.details + assert f.details["max_observed_request_rate_per_replica"] == 0 + assert "telemetry_coverage_state" in f.details + assert 
f.details["telemetry_coverage_state"] == "complete" + assert "telemetry_state" in f.details + assert f.details["telemetry_state"] == "no_observed_prediction_requests" + + @pytest.mark.e2e @pytest.mark.gcp def test_vertex_training_job_long_running_returns_list_of_findings(): @@ -136,6 +184,8 @@ def test_tpu_idle_returns_list_of_findings(): except PermissionError as e: pytest.fail(f"Missing IAM permissions: {e}") + # gcp.tpu.idle currently emits no findings (join barrier, spec 8.3): the loop + # below is vacuously empty today but must stay correct for when emission is unblocked. assert isinstance(findings, list) for f in findings: assert isinstance(f, Finding) @@ -145,15 +195,17 @@ def test_tpu_idle_returns_list_of_findings(): assert f.resource_id assert f.region assert f.detected_at and isinstance(f.detected_at, datetime) - assert f.estimated_monthly_cost_usd is not None - assert f.estimated_monthly_cost_usd > 0 + assert f.estimated_monthly_cost_usd is None # pricing varies; no flat estimate assert f.confidence.value in ("high", "medium", "low") assert f.risk.value in ("critical", "high", "medium") assert "node_id" in f.details - assert "chip_count" in f.details - assert "hourly_cost_usd" in f.details + assert "zone" in f.details + assert "tpu_type" in f.details assert "idle_days_threshold" in f.details - assert "pricing_scope" in f.details + assert "duty_cycle_threshold_pct" in f.details + assert "telemetry_join_state" in f.details + assert "telemetry_coverage_state" in f.details + assert "telemetry_state" in f.details @pytest.mark.e2e