diff --git a/cleancloud.yaml b/cleancloud.yaml index bd6331e..c7b71c6 100644 --- a/cleancloud.yaml +++ b/cleancloud.yaml @@ -61,9 +61,8 @@ rules: aws.rds.instance.idle: enabled: true - min_cost: 100 # suppress RDS findings below $100/month estimated cost params: - idle_days: 21 # require 21 days idle before flagging (default: 14) + idle_days_threshold: 21 # require 21 days idle before flagging (default: 14) gcp.sql.instance.idle: enabled: true diff --git a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py index 8a1ecf2..818a152 100644 --- a/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py +++ b/cleancloud/providers/aws/rules/ai/ec2_gpu_idle.py @@ -397,7 +397,7 @@ def _list_gpu_metrics(cloudwatch, instance_id: str) -> list: Dimensions=[{"Name": "InstanceId", "Value": instance_id}], ) return resp.get("Metrics", []) - except ClientError: + except Exception: return [] @@ -434,7 +434,7 @@ def _get_max_gpu_utilisation( gpu_max = max(dp["Maximum"] for dp in datapoints) if max_util is None or gpu_max > max_util: max_util = gpu_max - except ClientError: + except Exception: continue return max_util @@ -467,5 +467,5 @@ def _get_avg_cpu_utilisation( if not datapoints: return None return max(dp["Maximum"] for dp in datapoints) - except ClientError: + except Exception: return None diff --git a/cleancloud/providers/aws/rules/ai/sagemaker_endpoint_idle.py b/cleancloud/providers/aws/rules/ai/sagemaker_endpoint_idle.py index 91d5b56..d69fc75 100644 --- a/cleancloud/providers/aws/rules/ai/sagemaker_endpoint_idle.py +++ b/cleancloud/providers/aws/rules/ai/sagemaker_endpoint_idle.py @@ -434,7 +434,7 @@ def _check_invocations( queried_with_variants=False, fetch_failed=False, ) - except ClientError: + except Exception: return InvocationCheckResult( has_traffic=True, active_variants=[], @@ -471,7 +471,7 @@ def _check_invocations( else: idle_variants.append(variant_name) - except ClientError: + except Exception: # CloudWatch API failure 
— treat this variant as active and surface the failure. return InvocationCheckResult( has_traffic=True, @@ -561,7 +561,7 @@ def _describe_endpoint( slcfg = cv.get("ServerlessConfig") if slcfg: serverless_cfg_by_variant[cv["VariantName"]] = slcfg - except ClientError: + except Exception: pass # config inaccessible — costs/GPU will use defaults accumulated_cost = 0.0 @@ -631,7 +631,7 @@ def _describe_endpoint( total_provisioned_concurrency, ) - except ClientError: + except Exception: # Unknown state — return zero instances so the endpoint is skipped rather # than flagged with assumed cost and instance count. return None, False, 0, 0, None, [], 0 diff --git a/cleancloud/providers/aws/rules/ami_old.py b/cleancloud/providers/aws/rules/ami_old.py index 06d065f..b9a34d5 100644 --- a/cleancloud/providers/aws/rules/ami_old.py +++ b/cleancloud/providers/aws/rules/ami_old.py @@ -507,7 +507,7 @@ def _get_last_launched_time(ec2, ami_id: str) -> Tuple[Optional[datetime], bool] if not isinstance(value, str) or not value: return None, False return datetime.fromisoformat(value.replace("Z", "+00:00")), False - except ClientError: + except Exception: return None, True @@ -527,7 +527,7 @@ def _check_active_instances(ec2, ami_id: str) -> Tuple[bool, bool]: ) found = any(r.get("Instances") for r in resp.get("Reservations", [])) return found, False - except ClientError: + except Exception: return False, True @@ -572,11 +572,11 @@ def _build_lt_index(ec2) -> Tuple[Dict[str, List[str]], bool]: v_lt_id = v.get("LaunchTemplateId") if image_id and v_lt_id: index.setdefault(image_id, set()).add(v_lt_id) - except ClientError: + except Exception: continue # best-effort per LT return {k: sorted(v) for k, v in index.items()}, lt_truncated - except ClientError: + except Exception: return {}, True @@ -610,5 +610,5 @@ def _build_lc_index(autoscaling) -> Tuple[Dict[str, List[str]], bool]: break kwargs["NextToken"] = nxt return {k: sorted(v) for k, v in index.items()}, lc_truncated - except 
ClientError: + except Exception: return {}, True diff --git a/cleancloud/providers/aws/rules/ebs_snapshot_old.py b/cleancloud/providers/aws/rules/ebs_snapshot_old.py index bca6187..4a2f41a 100644 --- a/cleancloud/providers/aws/rules/ebs_snapshot_old.py +++ b/cleancloud/providers/aws/rules/ebs_snapshot_old.py @@ -37,7 +37,6 @@ from typing import List, Optional, Set, Tuple import boto3 -from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence @@ -68,7 +67,7 @@ def _build_ami_snapshot_index(ec2) -> Tuple[Set[str], bool]: snap_id = bdm.get("Ebs", {}).get("SnapshotId") if snap_id: referenced.add(snap_id) - except (ClientError, BotoCoreError): + except Exception: return referenced, True return referenced, False @@ -91,7 +90,7 @@ def _check_external_sharing(ec2, snap_id: str) -> Tuple[bool, bool]: if perm.get("UserId"): # explicit cross-account return True, False return False, False - except (ClientError, BotoCoreError): + except Exception: return False, True diff --git a/cleancloud/providers/aws/rules/ec2_sg_unused.py b/cleancloud/providers/aws/rules/ec2_sg_unused.py index 8a45e87..1452cd4 100644 --- a/cleancloud/providers/aws/rules/ec2_sg_unused.py +++ b/cleancloud/providers/aws/rules/ec2_sg_unused.py @@ -262,7 +262,7 @@ def find_unused_security_groups( ) if name: vpc_names[vpc["VpcId"]] = name - except (ClientError, BotoCoreError): + except Exception: pass # VPC names are display-only; don't fail the rule # --- Step 7: Apply exclusion rules and emit findings --- diff --git a/cleancloud/providers/aws/rules/ec2_stopped.py b/cleancloud/providers/aws/rules/ec2_stopped.py index 0b4bbb3..d8f5e39 100644 --- a/cleancloud/providers/aws/rules/ec2_stopped.py +++ b/cleancloud/providers/aws/rules/ec2_stopped.py @@ -331,7 +331,7 @@ def _get_volume_sizes(ec2, volume_ids: List[str]) -> Dict[str, int]: size = vol.get("Size") if vid and size is not None: sizes[vid] = size - except 
(ClientError, BotoCoreError): + except Exception: pass return sizes diff --git a/cleancloud/providers/aws/rules/elastic_ip_unattached.py b/cleancloud/providers/aws/rules/elastic_ip_unattached.py index 465e22b..66fde96 100644 --- a/cleancloud/providers/aws/rules/elastic_ip_unattached.py +++ b/cleancloud/providers/aws/rules/elastic_ip_unattached.py @@ -1,8 +1,44 @@ +""" +Rule: aws.ec2.elastic_ip.unattached + + (spec — docs/specs/aws/elastic_ip_unattached.md) + +Intent: + Detect Elastic IP address records that are currently allocated to the account + in the scanned Region and are not currently associated with an instance or + network interface. + +Exclusions: + - resource_id absent (malformed identity) + - any canonical association field present (currently associated) + +Detection: + - resource_id present + - association_id, instance_id, network_interface_id, private_ip_address all absent + +Key rules: + - This is a review-candidate rule, not a delete-safe rule. + - No temporal threshold — current unattached state is the sole eligibility signal. + - Do not use AllocationTime (undocumented field). + - All four canonical association fields must be checked, not only AssociationId. + - Missing/non-iterable Addresses response fails the rule. + - Do not hardcode a fixed monthly cost estimate. 
+ +Blind spots: + - future planned attachment or operational reserve intent not known + - DNS / allowlist / manual failover dependencies + - application-level use of the reserved public IP + - service-managed lifecycle expectations outside current association state + +APIs: + - ec2:DescribeAddresses +""" + from datetime import datetime, timezone -from typing import List +from typing import List, Optional import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence @@ -10,136 +46,191 @@ from cleancloud.core.risk import RiskLevel -def find_unattached_elastic_ips( - session: boto3.Session, - region: str, - days_unattached: int = 30, -) -> List[Finding]: - """ - Find Elastic IPs allocated 30+ days ago and currently unattached. - - Unattached Elastic IPs incur small hourly charges when not associated. +def _str(value) -> Optional[str]: + """Return value if it is a non-empty string, else None.""" + return value if isinstance(value, str) and value else None - IMPORTANT: AWS does not expose "unattached since" timestamp, so we use - allocation age as a proxy. An EIP allocated 30+ days ago and currently - unattached is worth reviewing. - SAFE RULE (review-only): - - EIP does not have AssociationId (not attached) - - EIP allocation age >= days_unattached threshold (NOT unattached duration) - - Classic EIPs without AllocationTime are flagged immediately (conservative) +def _normalize_address(address: dict) -> Optional[dict]: + """Normalize a raw SDK address dict to the canonical field shape. - IAM permissions: - - ec2:DescribeAddresses + Returns None when the item must be skipped (non-dict, absent stable identity). + All rule logic must operate only on the returned dict. 
""" - ec2 = session.client("ec2", region_name=region) + if not isinstance(address, dict): + return None + + # Identity — resource_id: AllocationId → PublicIp → CarrierIp → absent (skip) + allocation_id = _str(address.get("AllocationId")) + public_ip = _str(address.get("PublicIp")) + carrier_ip = _str(address.get("CarrierIp")) + + resource_id = allocation_id or public_ip or carrier_ip + if not resource_id: + return None + + # Association fields — any present means currently associated + association_id = _str(address.get("AssociationId")) + instance_id = _str(address.get("InstanceId")) + network_interface_id = _str(address.get("NetworkInterfaceId")) + private_ip_address = _str(address.get("PrivateIpAddress")) + + # Context fields — absent → null; never block evaluation + domain = _str(address.get("Domain")) + network_interface_owner_id = _str(address.get("NetworkInterfaceOwnerId")) + network_border_group = _str(address.get("NetworkBorderGroup")) + public_ipv4_pool = _str(address.get("PublicIpv4Pool")) + customer_owned_ip = _str(address.get("CustomerOwnedIp")) + customer_owned_ipv4_pool = _str(address.get("CustomerOwnedIpv4Pool")) + subnet_id = _str(address.get("SubnetId")) + + # ServiceManaged — string enum ("alb", "nlb", "rnat", "rds", …); normalize as string + service_managed: Optional[str] = _str(address.get("ServiceManaged")) + + # Tags — prefer list; degrade to empty if absent or wrong type + tags_raw = address.get("Tags") + tags: list = tags_raw if isinstance(tags_raw, list) else [] + + return { + "resource_id": resource_id, + "allocation_id": allocation_id, + "public_ip": public_ip, + "carrier_ip": carrier_ip, + "association_id": association_id, + "instance_id": instance_id, + "network_interface_id": network_interface_id, + "private_ip_address": private_ip_address, + "domain": domain, + "network_interface_owner_id": network_interface_owner_id, + "network_border_group": network_border_group, + "public_ipv4_pool": public_ipv4_pool, + "customer_owned_ip": 
customer_owned_ip, + "customer_owned_ipv4_pool": customer_owned_ipv4_pool, + "subnet_id": subnet_id, + "service_managed": service_managed, + "tags": tags, + } + +def find_unattached_elastic_ips( + session: boto3.Session, + region: str, +) -> List[Finding]: + ec2 = session.client("ec2", region_name=region) now = datetime.now(timezone.utc) findings: List[Finding] = [] + # --- Step 1: Retrieve all Elastic IP records --- try: - # DescribeAddresses is non-paginated by AWS (no paginator exists). - # Returns all Elastic IPs in a single call. response = ec2.describe_addresses() - for eip in response.get("Addresses", []): - # Skip if attached to an instance or network interface - if "AssociationId" in eip: - continue - - # Calculate age since allocation - allocation_time = eip.get("AllocationTime") - domain = eip.get("Domain", "vpc") - is_classic = domain == "standard" - - if not allocation_time: - if is_classic: - # Genuine EC2-Classic EIP without AllocationTime — flag conservatively - age_days = None - else: - # VPC EIP without AllocationTime — cannot determine age, skip - continue - else: - age_days = (now - allocation_time).days - - # Apply age threshold (skip if too young) - if age_days is not None and age_days < days_unattached: - continue - - # Build evidence - signals_used = ["Elastic IP is not associated with any instance or network interface"] - if age_days is not None: - signals_used.append( - f"Elastic IP was allocated {age_days} days ago and is currently unattached" - ) - if is_classic: - signals_used.append( - "Classic EIP without AllocationTime (age unknown, flagged conservatively)" - ) - signals_used.append( - "EC2-Classic is deprecated; unattached Classic EIPs are almost always legacy leftovers" - ) - - evidence = Evidence( - signals_used=signals_used, - signals_not_checked=[ - "Unattached duration (AWS does not expose detach timestamp)", - "Previous attachment history", - "Application-level usage", - "Manual operational workflows", - "Future planned 
attachments", - "Disaster recovery intent", - ], - time_window=( - f"{days_unattached} days since allocation" - if age_days is not None - else "Unknown (Classic EIP, no AllocationTime)" - ), - ) + except ClientError as exc: + code = exc.response["Error"]["Code"] + if code in ("UnauthorizedOperation", "AccessDenied"): + raise PermissionError("Missing required IAM permission: ec2:DescribeAddresses") from exc + raise + except BotoCoreError: + raise - # Build details - details = { - "public_ip": eip.get("PublicIp"), - "domain": eip.get("Domain", "vpc"), - "is_classic": is_classic, + # --- Step 2: Validate top-level response integrity --- + raw_addresses = response.get("Addresses") + if raw_addresses is None or not isinstance(raw_addresses, list): + raise RuntimeError( + "DescribeAddresses response is missing a usable top-level Addresses field — " + "cannot reliably determine EIP association state" + ) + + # --- Steps 3–5: Normalize, apply exclusions, emit --- + for raw_address in raw_addresses: + a = _normalize_address(raw_address) + if a is None: + continue # SKIP: absent stable identity + + # EXCLUSION: currently associated + if ( + a["association_id"] is not None + or a["instance_id"] is not None + or a["network_interface_id"] is not None + or a["private_ip_address"] is not None + ): + continue + + # --- Detection path: unattached-eip-review-candidate --- + + evidence = Evidence( + signals_used=[ + f"Address {a['resource_id']} is currently not associated per DescribeAddresses", + "Address remains allocated to the account until explicitly released", + "AWS recommends release only when the address is no longer needed " + "and is not currently associated", + ], + signals_not_checked=[ + "Future planned attachment or operational reserve intent not known", + "DNS / allowlist / manual failover dependencies", + "Application-level use of the reserved public IP", + "Exact monthly pricing from the current pricing page", + "Service-managed lifecycle expectations outside 
current association state", + ], + time_window=None, + ) + + details: dict = { + "evaluation_path": "unattached-eip-review-candidate", + "resource_id": a["resource_id"], + "allocation_id": a["allocation_id"], + "public_ip": a["public_ip"], + "carrier_ip": a["carrier_ip"], + "domain": a["domain"], + "currently_associated": False, + "association_id": None, + "instance_id": None, + "network_interface_id": None, + "private_ip_address": None, + } + if a["network_interface_owner_id"] is not None: + details["network_interface_owner_id"] = a["network_interface_owner_id"] + if a["network_border_group"] is not None: + details["network_border_group"] = a["network_border_group"] + if a["public_ipv4_pool"] is not None: + details["public_ipv4_pool"] = a["public_ipv4_pool"] + if a["customer_owned_ip"] is not None: + details["customer_owned_ip"] = a["customer_owned_ip"] + if a["customer_owned_ipv4_pool"] is not None: + details["customer_owned_ipv4_pool"] = a["customer_owned_ipv4_pool"] + if a["subnet_id"] is not None: + details["subnet_id"] = a["subnet_id"] + if a["service_managed"] is not None: + details["service_managed"] = a["service_managed"] + if a["tags"]: + details["tags"] = { + t.get("Key"): t.get("Value") for t in a["tags"] if isinstance(t, dict) } - if age_days is not None: - details["age_days"] = age_days - details["allocation_time"] = allocation_time.isoformat() - - if "Tags" in eip: - details["tags"] = eip["Tags"] - - findings.append( - Finding( - provider="aws", - rule_id="aws.ec2.elastic_ip.unattached", - resource_type="aws.ec2.elastic_ip", - resource_id=eip.get("AllocationId") or eip.get("PublicIp"), - region=region, - estimated_monthly_cost_usd=3.75, - title="Unattached Elastic IP (Review Recommended)", - summary=( - f"Elastic IP allocated {age_days} days ago and currently unattached (incurs hourly charges)" - if age_days is not None - else "Classic Elastic IP currently unattached (incurs hourly charges, allocation age unknown)" - ), - reason=( - f"Elastic IP is 
{age_days} days old and currently unattached, incurring charges" - if age_days is not None - else "Classic Elastic IP currently unattached, incurring charges (allocation age unknown)" - ), - risk=RiskLevel.LOW, - confidence=ConfidenceLevel.HIGH, # Deterministic state: no AssociationId - detected_at=now, - evidence=evidence, - details=details, - ) + findings.append( + Finding( + provider="aws", + rule_id="aws.ec2.elastic_ip.unattached", + resource_type="aws.ec2.elastic_ip", + resource_id=a["resource_id"], + region=region, + title="Unattached Elastic IP review candidate", + summary=( + f"Elastic IP {a['resource_id']}" + + ( + f" ({a['public_ip']})" + if a["public_ip"] and a["public_ip"] != a["resource_id"] + else "" + ) + + " is currently not associated with any instance or network interface; " + "review for possible release" + ), + reason="Address has no current association per DescribeAddresses", + risk=RiskLevel.LOW, + confidence=ConfidenceLevel.HIGH, + detected_at=now, + evidence=evidence, + details=details, + estimated_monthly_cost_usd=None, ) - - except ClientError as e: - if e.response["Error"]["Code"] == "UnauthorizedOperation": - raise PermissionError("Missing required IAM permission: ec2:DescribeAddresses") from e - raise + ) return findings diff --git a/cleancloud/providers/aws/rules/elb_idle.py b/cleancloud/providers/aws/rules/elb_idle.py index 1d216f3..4d290be 100644 --- a/cleancloud/providers/aws/rules/elb_idle.py +++ b/cleancloud/providers/aws/rules/elb_idle.py @@ -1,525 +1,726 @@ +""" +Rule: aws.elbv2.alb.idle +Rule: aws.elbv2.nlb.idle +Rule: aws.elb.clb.idle + + (spec — docs/specs/aws/elb_idle.md) + +Intent: + Detect ALB, NLB, and CLB load balancers that are at least + idle_days_threshold days old and show no trusted CloudWatch evidence of + client traffic during the full lookback window, so they can be reviewed + as potential cleanup candidates. 
+ +Exclusions: + - resource_id absent (malformed identity) + - lb_family == "unsupported" (gateway LB or unknown type) + - created_time absent or not safely comparable + - age_days < idle_days_threshold (too new to evaluate) + - ELBv2 state_code not "active" or "active_impaired" + - trusted traffic present (any CloudWatch signal > 0) + - ELBv2 ARN dimension unparsable + +Detection: + - resource_id present, lb_family in {"alb","nlb","clb"} + - age_days >= idle_days_threshold + - ELBv2: state_code "active" or "active_impaired" + - all traffic signals absent during full lookback window + +Key rules: + - ALB: RequestCount Sum>0, ProcessedBytes Sum>0, or ActiveConnectionCount Sum>0 + - NLB: NewFlowCount Sum>0, ProcessedBytes Sum>0, or ActiveFlowCount Maximum>0 + - NLB: missing datapoints over full window = FAIL RULE (not zero) + - CLB: RequestCount Sum>0 or EstimatedProcessedBytes Sum>0 + - Any metric read failure = FAIL RULE; no LOW-confidence path + - ELBv2 dimension strictly from ARN suffix after loadbalancer/; unparsable = SKIP ITEM + - Backend registration is contextual only + - estimated_monthly_cost_usd = None + +Blind spots: + - planned future usage or blue/green staging + - seasonal traffic patterns outside the current lookback window + - DNS / allowlist / manual failover dependencies + - NLB traffic rejected by security groups (not in CloudWatch) + +APIs: + - elbv2:DescribeLoadBalancers + - elb:DescribeLoadBalancers + - cloudwatch:GetMetricStatistics + - elbv2:DescribeTargetGroups (contextual) + - elbv2:DescribeTargetHealth (contextual) +""" + from datetime import datetime, timedelta, timezone -from typing import List +from typing import List, Optional import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel 
+_DEFAULT_IDLE_DAYS_THRESHOLD = 14 -def find_idle_load_balancers( - session: boto3.Session, - region: str, - idle_days: int = 14, -) -> List[Finding]: + +# --------------------------------------------------------------------------- +# Normalization helpers +# --------------------------------------------------------------------------- + + +def _str(value) -> Optional[str]: + """Return value if it is a non-empty string, else None.""" + return value if isinstance(value, str) and value else None + + +def _normalize_elbv2(lb: dict, idle_days_threshold: int, now_utc: datetime) -> Optional[dict]: + """Normalize a raw ELBv2 SDK dict to canonical fields. + + Returns None when the item must be skipped (non-dict, absent identity). + All rule logic must operate only on the returned dict. """ - Find idle Elastic Load Balancers (ALB, NLB, CLB) with no traffic. + if not isinstance(lb, dict): + return None - ELBs have a base hourly charge regardless of usage (~$16-22/month). - Idle load balancers with no traffic are a clear cost optimization signal. + arn = _str(lb.get("LoadBalancerArn")) + if not arn: + return None # SKIP: no stable identity - Detection logic: - - LB is older than `idle_days` days - - Zero traffic over the `idle_days` period (CloudWatch metrics) - - No registered targets (ALB/NLB) or no registered instances (CLB) + lb_type = _str(lb.get("Type")) + if lb_type == "application": + lb_family = "alb" + elif lb_type == "network": + lb_family = "nlb" + else: + lb_family = "unsupported" # gateway or unknown + + name = _str(lb.get("LoadBalancerName")) + + # created_time — must be timezone-aware for age calculation. + # Naive datetimes are not safely comparable and must not be coerced; leave absent. 
+ created_time_raw = lb.get("CreatedTime") + created_time: Optional[datetime] = None + if isinstance(created_time_raw, datetime) and created_time_raw.tzinfo is not None: + created_time = created_time_raw.astimezone(timezone.utc) + + age_days: Optional[int] = None + if created_time is not None: + age_days = int((now_utc - created_time).total_seconds() // 86400) + + # state_code from nested State dict + state_raw = lb.get("State") + state_code: Optional[str] = None + if isinstance(state_raw, dict): + state_code = _str(state_raw.get("Code")) + + scheme = _str(lb.get("Scheme")) + dns_name = _str(lb.get("DNSName")) + vpc_id = _str(lb.get("VpcId")) + + return { + "resource_id": arn, + "lb_family": lb_family, + "load_balancer_name": name, + "load_balancer_arn": arn, + "created_time": created_time, + "age_days": age_days, + "scheme": scheme, + "dns_name": dns_name, + "vpc_id": vpc_id, + "state_code": state_code, + "idle_days_threshold": idle_days_threshold, + } + + +def _normalize_clb(lb: dict, idle_days_threshold: int, now_utc: datetime) -> Optional[dict]: + """Normalize a raw CLB SDK dict to canonical fields. + + Returns None when the item must be skipped (non-dict, absent identity). + """ + if not isinstance(lb, dict): + return None + + name = _str(lb.get("LoadBalancerName")) + if not name: + return None # SKIP: no stable identity + + # Naive datetimes are not safely comparable and must not be coerced; leave absent. 
+ created_time_raw = lb.get("CreatedTime") + created_time: Optional[datetime] = None + if isinstance(created_time_raw, datetime) and created_time_raw.tzinfo is not None: + created_time = created_time_raw.astimezone(timezone.utc) + + age_days: Optional[int] = None + if created_time is not None: + age_days = int((now_utc - created_time).total_seconds() // 86400) + + scheme = _str(lb.get("Scheme")) + dns_name = _str(lb.get("DNSName")) + # CLB uses VPCId (capital VPC), not VpcId + vpc_id = _str(lb.get("VPCId")) + + instances_raw = lb.get("Instances") + instances: list = instances_raw if isinstance(instances_raw, list) else [] + + return { + "resource_id": name, + "lb_family": "clb", + "load_balancer_name": name, + "load_balancer_arn": None, + "created_time": created_time, + "age_days": age_days, + "scheme": scheme, + "dns_name": dns_name, + "vpc_id": vpc_id, + "state_code": None, + "idle_days_threshold": idle_days_threshold, + "instances": instances, + } + + +# --------------------------------------------------------------------------- +# CloudWatch dimension extraction +# --------------------------------------------------------------------------- + + +def _extract_elbv2_dimension(lb_arn: str) -> Optional[str]: + """Extract the CloudWatch LoadBalancer dimension value from an ELBv2 ARN. + + Strictly uses the suffix after 'loadbalancer/'. Returns None if + the suffix cannot be reliably extracted — caller must SKIP the item. 
- Confidence: - - HIGH: Zero traffic AND no targets/instances - - MEDIUM: Zero traffic only + ARN format: arn:aws:elasticloadbalancing:region:account:loadbalancer/app/name/id + Dimension: app/name/id + """ + parts = lb_arn.split("loadbalancer/", 1) + if len(parts) == 2 and parts[1]: + return parts[1] + return None - IAM permissions: - - elasticloadbalancing:DescribeLoadBalancers - - elasticloadbalancing:DescribeTargetGroups - - elasticloadbalancing:DescribeTargetHealth - - cloudwatch:GetMetricStatistics + +# --------------------------------------------------------------------------- +# CloudWatch metric fetching +# --------------------------------------------------------------------------- + + +def _get_metric_datapoints( + cloudwatch, + namespace: str, + metric_name: str, + statistic: str, + dimension_name: str, + dimension_value: str, + start_time: datetime, + end_time: datetime, +) -> List[dict]: + """Fetch CloudWatch metric datapoints. + + Returns the raw list of datapoints (may be empty for ALB/CLB; see NLB caller). + Raises PermissionError on permission errors, re-raises ClientError/BotoCoreError + for all other failures — caller treats these as FAIL RULE. 
""" - cloudwatch = session.client("cloudwatch", region_name=region) - now = datetime.now(timezone.utc) - findings: List[Finding] = [] + try: + response = cloudwatch.get_metric_statistics( + Namespace=namespace, + MetricName=metric_name, + Dimensions=[{"Name": dimension_name, "Value": dimension_value}], + StartTime=start_time, + EndTime=end_time, + Period=86400, + Statistics=[statistic], + ) + return response.get("Datapoints", []) + except ClientError as exc: + code = exc.response["Error"]["Code"] + if code in ("AccessDenied", "UnauthorizedOperation"): + raise PermissionError( + "Missing required IAM permission: cloudwatch:GetMetricStatistics" + ) from exc + raise + except BotoCoreError: + raise - # Scan ALB/NLB via elbv2 - findings.extend(_scan_elbv2(session, region, cloudwatch, now, idle_days)) - # Scan CLB via elb - findings.extend(_scan_clb(session, region, cloudwatch, now, idle_days)) +def _check_alb_traffic( + cloudwatch, + dimension_value: str, + start_time: datetime, + end_time: datetime, +) -> bool: + """Return True if ALB has trusted traffic over the window, False if confirmed zero. - return findings + Checks: RequestCount Sum, ProcessedBytes Sum, ActiveConnectionCount Sum. + Missing datapoints treated as zero (ALB only reports when traffic is present). + Raises on metric read failure → FAIL RULE. + """ + namespace = "AWS/ApplicationELB" + dim = "LoadBalancer" + + for metric_name in ("RequestCount", "ProcessedBytes", "ActiveConnectionCount"): + dps = _get_metric_datapoints( + cloudwatch, namespace, metric_name, "Sum", dim, dimension_value, start_time, end_time + ) + if any(dp.get("Sum", 0) > 0 for dp in dps): + return True + + return False + + +def _check_nlb_traffic( + cloudwatch, + dimension_value: str, + start_time: datetime, + end_time: datetime, + expected_days: int, +) -> bool: + """Return True if NLB has trusted traffic over the window, False if confirmed zero. + + Checks: NewFlowCount Sum, ProcessedBytes Sum, ActiveFlowCount Maximum. 
+ NLB metrics are documented as always reported; incomplete coverage (fewer + datapoints than the full window warrants) means the zero-traffic claim is + not trustworthy → raise RuntimeError (FAIL RULE). + Raises on metric read failure → FAIL RULE. + """ + namespace = "AWS/NetworkELB" + dim = "LoadBalancer" + # Spec requires full-window coverage with no gaps; no tolerance applied. + min_datapoints = expected_days + + for metric_name in ("NewFlowCount", "ProcessedBytes"): + dps = _get_metric_datapoints( + cloudwatch, namespace, metric_name, "Sum", dim, dimension_value, start_time, end_time + ) + if len(dps) < min_datapoints: + raise RuntimeError( + f"NLB {metric_name} metric returned {len(dps)} datapoint(s) for a " + f"{expected_days}-day window — coverage is incomplete, " + "cannot confirm zero traffic" + ) + if any(dp.get("Sum", 0) > 0 for dp in dps): + return True + + dps = _get_metric_datapoints( + cloudwatch, + namespace, + "ActiveFlowCount", + "Maximum", + dim, + dimension_value, + start_time, + end_time, + ) + if len(dps) < min_datapoints: + raise RuntimeError( + f"NLB ActiveFlowCount metric returned {len(dps)} datapoint(s) for a " + f"{expected_days}-day window — coverage is incomplete, " + "cannot confirm zero traffic" + ) + if any(dp.get("Maximum", 0) > 0 for dp in dps): + return True + + return False + + +def _check_clb_traffic( + cloudwatch, + lb_name: str, + start_time: datetime, + end_time: datetime, +) -> bool: + """Return True if CLB has trusted traffic over the window, False if confirmed zero. + + Checks: RequestCount Sum, EstimatedProcessedBytes Sum. + Missing datapoints treated as zero (CLB only reports when traffic is present). + Raises on metric read failure → FAIL RULE. 
+ """ + namespace = "AWS/ELB" + dim = "LoadBalancerName" + + for metric_name in ("RequestCount", "EstimatedProcessedBytes"): + dps = _get_metric_datapoints( + cloudwatch, namespace, metric_name, "Sum", dim, lb_name, start_time, end_time + ) + if any(dp.get("Sum", 0) > 0 for dp in dps): + return True + + return False + + +# --------------------------------------------------------------------------- +# Backend registration context (best-effort; failure degrades context not rule) +# --------------------------------------------------------------------------- + + +def _get_elbv2_backend_context(elbv2, lb_arn: str) -> tuple: + """Return (registered_target_count, target_group_count, enrichment_succeeded). + + On any error returns (0, 0, False) — caller sets has_registered_targets = None. + Pagination of target groups is exhausted; target health is retrieved per group. + """ + try: + paginator = elbv2.get_paginator("describe_target_groups") + target_groups = [] + for page in paginator.paginate(LoadBalancerArn=lb_arn): + target_groups.extend(page.get("TargetGroups", [])) + + tg_count = len(target_groups) + total_targets = 0 + for tg in target_groups: + tg_arn = _str(tg.get("TargetGroupArn")) + if not tg_arn: + continue + health_resp = elbv2.describe_target_health(TargetGroupArn=tg_arn) + total_targets += len(health_resp.get("TargetHealthDescriptions", [])) + return total_targets, tg_count, True + except (ClientError, BotoCoreError, Exception): + return 0, 0, False + + +# --------------------------------------------------------------------------- +# ELBv2 (ALB + NLB) scanner +# --------------------------------------------------------------------------- def _scan_elbv2( session: boto3.Session, region: str, cloudwatch, - now: datetime, - idle_days: int, + now_utc: datetime, + idle_days_threshold: int, ) -> List[Finding]: - """Scan ALB and NLB load balancers for idle resources.""" elbv2 = session.client("elbv2", region_name=region) findings: List[Finding] = [] + start_time = 
now_utc - timedelta(days=max(idle_days_threshold, 1)) try: paginator = elbv2.get_paginator("describe_load_balancers") + pages = list(paginator.paginate()) + except ClientError as exc: + code = exc.response["Error"]["Code"] + if code in ("AccessDenied", "UnauthorizedOperation"): + raise PermissionError( + "Missing required IAM permission: elbv2:DescribeLoadBalancers" + ) from exc + raise + except BotoCoreError: + raise - for page in paginator.paginate(): - for lb in page.get("LoadBalancers", []): - lb_arn = lb["LoadBalancerArn"] - lb_name = lb.get("LoadBalancerName", lb_arn) - lb_type = lb.get("Type", "application") # application or network - - # Calculate age - create_time = lb.get("CreatedTime") - age_days = 0 - if create_time: - try: - age_days = (now - create_time).days - except TypeError: - pass - - # Skip if younger than threshold - if age_days < idle_days: - continue - - # Check traffic via CloudWatch - has_traffic, traffic_fetch_failed = _check_elbv2_traffic( - cloudwatch, lb_arn, lb_type, idle_days - ) - # has_traffic=True with fetch_failed=False → confirmed traffic, skip. - # has_traffic=True with fetch_failed=True → metric unreadable; create LOW-confidence - # finding so the operator knows to verify manually rather than silently suppress. 
- if has_traffic and not traffic_fetch_failed: - continue - - # Check registered targets - has_targets = _check_elbv2_targets(elbv2, lb_arn) - - # Determine confidence - if traffic_fetch_failed: - # Metric read failed — traffic status unknown; operator must verify - confidence = ConfidenceLevel.LOW - elif not has_targets: - confidence = ConfidenceLevel.HIGH - else: - confidence = ConfidenceLevel.MEDIUM - - type_label = "ALB" if lb_type == "application" else "NLB" - rule_id = "aws.elbv2.alb.idle" if lb_type == "application" else "aws.elbv2.nlb.idle" - primary_metric = "RequestCount" if lb_type == "application" else "NewFlowCount" - scheme = lb.get("Scheme", "unknown") - - signals = [ - f"Load balancer type: {type_label}", - f"Scheme: {scheme}", - f"State: {lb.get('State', {}).get('Code', 'unknown')}", + for page in pages: + for raw_lb in page.get("LoadBalancers", []): + lb = _normalize_elbv2(raw_lb, idle_days_threshold, now_utc) + if lb is None: + continue # SKIP: non-dict or absent identity + + # EXCLUSION: unsupported family (gateway or unknown) + if lb["lb_family"] == "unsupported": + continue + + # EXCLUSION: unusable created_time + if lb["created_time"] is None or lb["age_days"] is None: + continue + + # EXCLUSION: too new + if lb["age_days"] < idle_days_threshold: + continue + + # EXCLUSION: unsupported ELBv2 state + if lb["state_code"] not in ("active", "active_impaired"): + continue + + # Derive CloudWatch dimension — SKIP ITEM if unparsable + dimension_value = _extract_elbv2_dimension(lb["load_balancer_arn"]) + if dimension_value is None: + continue # SKIP: ARN dimension unparsable + + # --- Traffic check (raises → FAIL RULE) --- + if lb["lb_family"] == "alb": + has_traffic = _check_alb_traffic(cloudwatch, dimension_value, start_time, now_utc) + traffic_signals_checked = [ + "RequestCount:Sum", + "ProcessedBytes:Sum", + "ActiveConnectionCount:Sum", ] - if not traffic_fetch_failed: - signals.insert( - 0, - f"Zero {primary_metric} and ProcessedBytes for 
{idle_days} days (CloudWatch)", - ) - - if not has_targets: - signals.append("No registered targets") - if age_days > 0: - signals.append(f"Load balancer is {age_days} days old") - - signals_not_checked = [ - "Planned future usage", - "Blue/green deployment scenarios", - "Seasonal traffic patterns", - "Internal health-check-only usage", + rule_id = "aws.elbv2.alb.idle" + label = "ALB" + resource_type = "aws.elbv2.load_balancer" + else: + has_traffic = _check_nlb_traffic( + cloudwatch, dimension_value, start_time, now_utc, idle_days_threshold + ) + traffic_signals_checked = [ + "NewFlowCount:Sum", + "ProcessedBytes:Sum", + "ActiveFlowCount:Maximum", ] - if traffic_fetch_failed: - signals_not_checked.insert( - 0, - f"Traffic metrics ({primary_metric}, ProcessedBytes) — CloudWatch fetch " - "failed (transient/throttle error); traffic status unverified", - ) - - evidence = Evidence( - signals_used=signals, - signals_not_checked=signals_not_checked, - time_window=f"{idle_days} days", + rule_id = "aws.elbv2.nlb.idle" + label = "NLB" + resource_type = "aws.elbv2.load_balancer" + + if has_traffic: + continue # SKIP: trusted traffic present + + # --- Backend context (best-effort) --- + target_count, tg_count, enrichment_ok = _get_elbv2_backend_context( + elbv2, lb["load_balancer_arn"] + ) + if enrichment_ok: + has_registered_targets: Optional[bool] = target_count > 0 + details_target_count: Optional[int] = target_count + details_tg_count: Optional[int] = tg_count + else: + # Enrichment failed — context unknown; do not fabricate zero counts + has_registered_targets = None + details_target_count = None + details_tg_count = None + + # --- Confidence --- + if has_registered_targets is False: + confidence = ConfidenceLevel.HIGH + else: + # has targets OR unknown → MEDIUM + confidence = ConfidenceLevel.MEDIUM + + created_time_str = lb["created_time"].isoformat() if lb["created_time"] else None + + evidence = Evidence( + signals_used=[ + f"Load balancer has been running for 
{lb['age_days']} days, " + f"exceeding the {idle_days_threshold}-day idle evaluation threshold", + f"No trusted CloudWatch traffic signal observed over the " + f"{idle_days_threshold}-day lookback window", + *( + ["No registered targets found"] + if has_registered_targets is False + else ( + [f"{target_count} registered target(s) still present"] + if has_registered_targets + else [] + ) + ), + ], + signals_not_checked=[ + "Planned future usage or blue/green staging", + "Seasonal traffic patterns outside the current lookback window", + "DNS / allowlist / manual failover dependencies still pointing at the load balancer", + "NLB traffic rejected by security groups, which is not captured in CloudWatch", + ], + time_window=f"{idle_days_threshold} days", + ) + + details = { + "evaluation_path": "idle-load-balancer-review-candidate", + "lb_family": lb["lb_family"], + "resource_id": lb["resource_id"], + "load_balancer_name": lb["load_balancer_name"], + "load_balancer_arn": lb["load_balancer_arn"], + "scheme": lb["scheme"], + "dns_name": lb["dns_name"], + "vpc_id": lb["vpc_id"], + "created_time": created_time_str, + "age_days": lb["age_days"], + "idle_days_threshold": idle_days_threshold, + "traffic_window_days": idle_days_threshold, + "traffic_signals_checked": traffic_signals_checked, + "traffic_detected": False, + "state_code": lb["state_code"], + "has_registered_targets": has_registered_targets, + "registered_target_count": details_target_count, + "target_group_count": details_tg_count, + } + + findings.append( + Finding( + provider="aws", + rule_id=rule_id, + resource_type=resource_type, + resource_id=lb["resource_id"], + region=region, + title=f"Idle {label} review candidate", + summary=( + f"{label} '{lb['load_balancer_name']}' has had no trusted CloudWatch " + f"traffic signal over the last {idle_days_threshold} days; " + "review for possible cleanup" + ), + reason=( + f"{label} has no trusted CloudWatch traffic signal in the last " + f"{idle_days_threshold} days" + 
), + risk=RiskLevel.MEDIUM, + confidence=confidence, + detected_at=now_utc, + evidence=evidence, + details=details, + estimated_monthly_cost_usd=None, ) + ) - if traffic_fetch_failed: - title = f"{type_label} Requires Traffic Verification" - summary = ( - f"{type_label} '{lb_name}' could not be verified as idle — " - f"CloudWatch traffic metrics were unreadable (transient/throttle error)." - ) - reason = f"{type_label} traffic metrics could not be fetched; idle status is unconfirmed" - else: - title = f"Idle {type_label} (No Traffic for {idle_days}+ Days)" - summary = ( - f"{type_label} '{lb_name}' has had zero traffic for " - f"{idle_days}+ days and is incurring base charges." - ) - reason = f"{type_label} has zero traffic for {idle_days}+ days" - - findings.append( - Finding( - provider="aws", - rule_id=rule_id, - resource_type="aws.elbv2.load_balancer", - resource_id=lb_arn, - region=region, - estimated_monthly_cost_usd=18.0, - title=title, - summary=summary, - reason=reason, - risk=RiskLevel.MEDIUM, - confidence=confidence, - detected_at=now, - evidence=evidence, - details={ - "name": lb_name, - "type": lb_type, - "scheme": scheme, - "state": lb.get("State", {}).get("Code", "unknown"), - "dns_name": lb.get("DNSName"), - "vpc_id": lb.get("VpcId"), - "age_days": age_days, - "has_targets": has_targets, - "idle_days_threshold": idle_days, - "estimated_monthly_cost": ( - "~$16-22/month base cost (us-east-1 on-demand; " - "region-dependent; excludes LCU/NLCU usage charges)" - ), - }, - ) - ) + return findings - except ClientError as e: - code = e.response["Error"]["Code"] - if code in ("UnauthorizedOperation", "AccessDenied"): - raise PermissionError( - "Missing required IAM permissions: " - "elasticloadbalancing:DescribeLoadBalancers, " - "elasticloadbalancing:DescribeTargetGroups, " - "elasticloadbalancing:DescribeTargetHealth, " - "cloudwatch:GetMetricStatistics" - ) from e - raise - return findings +# 
--------------------------------------------------------------------------- +# CLB scanner +# --------------------------------------------------------------------------- def _scan_clb( session: boto3.Session, region: str, cloudwatch, - now: datetime, - idle_days: int, + now_utc: datetime, + idle_days_threshold: int, ) -> List[Finding]: - """Scan Classic Load Balancers for idle resources.""" elb = session.client("elb", region_name=region) findings: List[Finding] = [] + start_time = now_utc - timedelta(days=max(idle_days_threshold, 1)) try: paginator = elb.get_paginator("describe_load_balancers") - - for page in paginator.paginate(): - for lb in page.get("LoadBalancerDescriptions", []): - lb_name = lb["LoadBalancerName"] - - # Calculate age - create_time = lb.get("CreatedTime") - age_days = 0 - if create_time: - try: - age_days = (now - create_time).days - except TypeError: - pass - - # Skip if younger than threshold - if age_days < idle_days: - continue - - # Check traffic via CloudWatch - has_traffic, traffic_fetch_failed = _check_clb_traffic( - cloudwatch, lb_name, idle_days - ) - # has_traffic=True with fetch_failed=False → confirmed traffic, skip. - # has_traffic=True with fetch_failed=True → metric unreadable; create LOW-confidence - # finding so the operator knows to verify manually rather than silently suppress. 
- if has_traffic and not traffic_fetch_failed: - continue - - # Check registered instances - instances = lb.get("Instances", []) - has_instances = len(instances) > 0 - scheme = lb.get("Scheme", "unknown") - - # Determine confidence - if traffic_fetch_failed: - confidence = ConfidenceLevel.LOW - elif not has_instances: - confidence = ConfidenceLevel.HIGH - else: - confidence = ConfidenceLevel.MEDIUM - - signals = [ - "Load balancer type: CLB", - f"Scheme: {scheme}", - ] - if not traffic_fetch_failed: - signals.insert( - 0, - f"Zero RequestCount and EstimatedProcessedBytes for {idle_days} days (CloudWatch)", - ) - - if not has_instances: - signals.append("No registered instances") - else: - signals.append(f"{len(instances)} registered instance(s)") - if age_days > 0: - signals.append(f"Load balancer is {age_days} days old") - - signals_not_checked = [ - "Planned future usage", - "Blue/green deployment scenarios", - "Seasonal traffic patterns", - "Internal health-check-only usage", - ] - if traffic_fetch_failed: - signals_not_checked.insert( - 0, - "Traffic metrics (RequestCount, EstimatedProcessedBytes) — CloudWatch fetch " - "failed (transient/throttle error); traffic status unverified", - ) - - evidence = Evidence( - signals_used=signals, - signals_not_checked=signals_not_checked, - time_window=f"{idle_days} days", - ) - - if traffic_fetch_failed: - title = "CLB Requires Traffic Verification" - summary = ( - f"CLB '{lb_name}' could not be verified as idle — " - "CloudWatch traffic metrics were unreadable (transient/throttle error)." - ) - reason = "CLB traffic metrics could not be fetched; idle status is unconfirmed" - else: - title = f"Idle CLB (No Traffic for {idle_days}+ Days)" - summary = ( - f"CLB '{lb_name}' has had zero traffic for " - f"{idle_days}+ days and is incurring base charges." 
- ) - reason = f"CLB has zero traffic for {idle_days}+ days" - - findings.append( - Finding( - provider="aws", - rule_id="aws.elb.clb.idle", - resource_type="aws.elb.load_balancer", - resource_id=lb_name, - region=region, - estimated_monthly_cost_usd=18.0, - title=title, - summary=summary, - reason=reason, - risk=RiskLevel.MEDIUM, - confidence=confidence, - detected_at=now, - evidence=evidence, - details={ - "name": lb_name, - "type": "classic", - "scheme": scheme, - "dns_name": lb.get("DNSName"), - "vpc_id": lb.get("VPCId"), - "age_days": age_days, - "has_instances": has_instances, - "instance_count": len(instances), - "idle_days_threshold": idle_days, - "estimated_monthly_cost": ( - "~$16-22/month base cost (us-east-1 on-demand; " - "region-dependent; excludes LCU usage charges)" - ), - }, - ) - ) - - except ClientError as e: - code = e.response["Error"]["Code"] - if code in ("UnauthorizedOperation", "AccessDenied"): + pages = list(paginator.paginate()) + except ClientError as exc: + code = exc.response["Error"]["Code"] + if code in ("AccessDenied", "UnauthorizedOperation"): raise PermissionError( - "Missing required IAM permissions: " - "elasticloadbalancing:DescribeLoadBalancers, " - "cloudwatch:GetMetricStatistics" - ) from e + "Missing required IAM permission: elb:DescribeLoadBalancers" + ) from exc + raise + except BotoCoreError: raise - return findings - - -def _check_elbv2_traffic(cloudwatch, lb_arn: str, lb_type: str, days: int) -> tuple: - """Check if an ALB/NLB has had any traffic in the past `days` days. - - ALB: checks both RequestCount and ProcessedBytes. - - RequestCount only increments when a target is chosen — fixed-response, redirect, - and pre-routing-rejection actions leave it at zero even with real traffic. - - ProcessedBytes captures all bytes processed by the ALB regardless of routing outcome. - - NLB: checks both NewFlowCount and ProcessedBytes. 
- - NewFlowCount only counts flows successfully established to targets — traffic that - hits the NLB listener but doesn't reach a target (e.g. health check gaps) is missed. - - ProcessedBytes always reflects total bytes received/sent by the NLB. - - Either metric > 0 is treated as traffic (OR logic, conservative for false-positive avoidance). - - Returns (has_traffic: bool, fetch_failed: bool). - fetch_failed is True when a transient/throttle error prevented a clean metric read. - """ - now = datetime.now(timezone.utc) - start_time = now - timedelta(days=max(days, 1)) - dimension_value = _extract_elbv2_dimension(lb_arn) - - if lb_type == "application": - namespace = "AWS/ApplicationELB" - primary_metric = "RequestCount" - else: - namespace = "AWS/NetworkELB" - primary_metric = "NewFlowCount" - - def _fetch(metric_name: str) -> tuple: - return _get_metric_sum( - cloudwatch, namespace, metric_name, "LoadBalancer", dimension_value, start_time, now - ) - - primary_val, primary_err = _fetch(primary_metric) - if primary_val > 0: - return True, primary_err + for page in pages: + for raw_lb in page.get("LoadBalancerDescriptions", []): + lb = _normalize_clb(raw_lb, idle_days_threshold, now_utc) + if lb is None: + continue # SKIP: non-dict or absent identity + + # EXCLUSION: unusable created_time + if lb["created_time"] is None or lb["age_days"] is None: + continue + + # EXCLUSION: too new + if lb["age_days"] < idle_days_threshold: + continue + + # --- Traffic check (raises → FAIL RULE) --- + has_traffic = _check_clb_traffic( + cloudwatch, lb["load_balancer_name"], start_time, now_utc + ) + if has_traffic: + continue # SKIP: trusted traffic present + + # --- Backend context from normalized item --- + instances = lb["instances"] + registered_instance_count = len(instances) + has_registered_instances = registered_instance_count > 0 + + # --- Confidence --- + confidence = ( + ConfidenceLevel.HIGH if not has_registered_instances else ConfidenceLevel.MEDIUM + ) + + 
created_time_str = lb["created_time"].isoformat() if lb["created_time"] else None + + evidence = Evidence( + signals_used=[ + f"Load balancer has been running for {lb['age_days']} days, " + f"exceeding the {idle_days_threshold}-day idle evaluation threshold", + f"No trusted CloudWatch traffic signal observed over the " + f"{idle_days_threshold}-day lookback window", + *( + ["No registered instances found"] + if not has_registered_instances + else [f"{registered_instance_count} registered instance(s) still present"] + ), + ], + signals_not_checked=[ + "Planned future usage or blue/green staging", + "Seasonal traffic patterns outside the current lookback window", + "DNS / allowlist / manual failover dependencies still pointing at the load balancer", + ], + time_window=f"{idle_days_threshold} days", + ) + + details = { + "evaluation_path": "idle-load-balancer-review-candidate", + "lb_family": "clb", + "resource_id": lb["resource_id"], + "load_balancer_name": lb["load_balancer_name"], + "load_balancer_arn": None, + "scheme": lb["scheme"], + "dns_name": lb["dns_name"], + "vpc_id": lb["vpc_id"], + "created_time": created_time_str, + "age_days": lb["age_days"], + "idle_days_threshold": idle_days_threshold, + "traffic_window_days": idle_days_threshold, + "traffic_signals_checked": ["RequestCount:Sum", "EstimatedProcessedBytes:Sum"], + "traffic_detected": False, + "has_registered_instances": has_registered_instances, + "registered_instance_count": registered_instance_count, + } + + findings.append( + Finding( + provider="aws", + rule_id="aws.elb.clb.idle", + resource_type="aws.elb.load_balancer", + resource_id=lb["resource_id"], + region=region, + title="Idle CLB review candidate", + summary=( + f"CLB '{lb['load_balancer_name']}' has had no trusted CloudWatch " + f"traffic signal over the last {idle_days_threshold} days; " + "review for possible cleanup" + ), + reason=( + f"CLB has no trusted CloudWatch traffic signal in the last " + f"{idle_days_threshold} days" + ), + 
risk=RiskLevel.MEDIUM, + confidence=confidence, + detected_at=now_utc, + evidence=evidence, + details=details, + estimated_monthly_cost_usd=None, + ) + ) - processed_val, processed_err = _fetch("ProcessedBytes") - if processed_val > 0: - return True, processed_err + return findings - return False, (primary_err or processed_err) +# --------------------------------------------------------------------------- +# Public entry point +# --------------------------------------------------------------------------- -def _check_clb_traffic(cloudwatch, lb_name: str, days: int) -> tuple: - """Check if a CLB has had any traffic in the past `days` days. - Checks both RequestCount (HTTP/HTTPS listeners) and EstimatedProcessedBytes - (all protocols including TCP/SSL). A CLB with only TCP/SSL listeners will - always report zero RequestCount, so checking only that metric would produce - false positives for any active TCP CLB. +def find_idle_load_balancers( + session: boto3.Session, + region: str, + idle_days_threshold: int = _DEFAULT_IDLE_DAYS_THRESHOLD, +) -> List[Finding]: + """Find idle ALB, NLB, and CLB load balancers with no trusted CloudWatch traffic. - Returns (has_traffic: bool, fetch_failed: bool). - fetch_failed is True when a transient/throttle error prevented a clean metric read. + Each branch (ELBv2 and CLB) is evaluated independently. A failure in one + branch does not prevent the other from running. If either branch fails the + exception is re-raised after both have been attempted. 
""" - now = datetime.now(timezone.utc) - start_time = now - timedelta(days=max(days, 1)) - - # HTTP/HTTPS traffic - request_count, req_err = _get_metric_sum( - cloudwatch, - "AWS/ELB", - "RequestCount", - "LoadBalancerName", - lb_name, - start_time, - now, - ) - if request_count > 0: - return True, req_err - - # TCP/SSL traffic (covers all protocols including HTTP/HTTPS) - processed_bytes, proc_err = _get_metric_sum( - cloudwatch, - "AWS/ELB", - "EstimatedProcessedBytes", - "LoadBalancerName", - lb_name, - start_time, - now, - ) - return processed_bytes > 0, (req_err or proc_err) - - -def _check_elbv2_targets(elbv2, lb_arn: str) -> bool: - """Check if an ALB/NLB has any registered targets. + cloudwatch = session.client("cloudwatch", region_name=region) + now_utc = datetime.now(timezone.utc) + findings: List[Finding] = [] + first_exc: Optional[BaseException] = None - describe_target_health only returns targets that ARE registered in the target - group — unregistered targets are simply absent from the response. Therefore - any non-empty TargetHealthDescriptions list means there are registered targets, - regardless of their health state (healthy/unhealthy/draining/unused all count). - """ try: - tg_resp = elbv2.describe_target_groups(LoadBalancerArn=lb_arn) - for tg in tg_resp.get("TargetGroups", []): - tg_arn = tg["TargetGroupArn"] - health_resp = elbv2.describe_target_health(TargetGroupArn=tg_arn) - if health_resp.get("TargetHealthDescriptions"): - return True - except ClientError: - # If we can't check targets, assume they exist to avoid false positives - return True - return False - - -def _extract_elbv2_dimension(lb_arn: str) -> str: - """ - Extract the CloudWatch dimension value from an ELBv2 ARN. 
- - ARN format: arn:aws:elasticloadbalancing:region:account:loadbalancer/app/name/id - Dimension value: app/name/id (or net/name/id for NLB) - """ - parts = lb_arn.split("loadbalancer/", 1) - if len(parts) == 2: - return parts[1] - return lb_arn - + findings.extend(_scan_elbv2(session, region, cloudwatch, now_utc, idle_days_threshold)) + except Exception as exc: + first_exc = exc -def _get_metric_sum( - cloudwatch, - namespace: str, - metric_name: str, - dimension_name: str, - dimension_value: str, - start_time: datetime, - end_time: datetime, -) -> tuple: - """Get sum of a CloudWatch metric over the time period. - - Returns (has_traffic: int, fetch_error: bool). - - has_traffic: 1 if any datapoint had Sum > 0, else 0. - - fetch_error: True if a non-permission error occurred (throttle, transient, etc.). - When fetch_error is True, has_traffic is 1 (conservative — avoids false positives), - but the caller should surface this to the operator via signals_not_checked. - """ try: - response = cloudwatch.get_metric_statistics( - Namespace=namespace, - MetricName=metric_name, - Dimensions=[ - { - "Name": dimension_name, - "Value": dimension_value, - } - ], - StartTime=start_time, - EndTime=end_time, - Period=86400, # 1 day in seconds - Statistics=["Sum"], - ) + findings.extend(_scan_clb(session, region, cloudwatch, now_utc, idle_days_threshold)) + except Exception as exc: + if first_exc is None: + first_exc = exc - datapoints = response.get("Datapoints", []) - if any(dp.get("Sum", 0) > 0 for dp in datapoints): - return 1, False - return 0, False + if first_exc is not None: + raise first_exc - except ClientError as e: - if e.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): - raise PermissionError( - "Missing required IAM permissions: cloudwatch:GetMetricStatistics" - ) from e - # Other errors (throttle, transient): assume traffic to avoid false positives, - # but flag the error so the caller can surface it. 
- return 1, True + return findings diff --git a/cleancloud/providers/aws/rules/eni_detached.py b/cleancloud/providers/aws/rules/eni_detached.py index a93409d..8615c3e 100644 --- a/cleancloud/providers/aws/rules/eni_detached.py +++ b/cleancloud/providers/aws/rules/eni_detached.py @@ -1,162 +1,265 @@ +""" +Rule: aws.ec2.eni.detached + + (spec — docs/specs/aws/eni_detached.md) + +Intent: + Detect network interfaces that are currently not attached according to the + EC2 DescribeNetworkInterfaces contract, so they can be reviewed as possible + cleanup candidates if no longer needed. + +Exclusions: + - network_interface_id absent (malformed identity) + - normalized_status absent (missing current-state signal) + - normalized_status != "available" (attached or other non-eligible state) + - attachment_status is not null/absent or "detached" (any other value including + unknown/malformed strings is treated as inconsistent — SKIP ITEM) + +Detection: + - network_interface_id present + - normalized_status == "available" + - attachment_status absent, null, or "detached" + +Key rules: + - Top-level Status is the sole state authority; attachment_status is validation only. + - No temporal threshold — current not-attached state is the sole eligibility signal. + - No exclusion for interface_type, requester_managed, or operator_managed. + - Do not use CreateTime or any age/duration field for eligibility. + - estimated_monthly_cost_usd = None. + - Confidence: HIGH. + - Risk: LOW. 
+ +Blind spots: + - how long the ENI has been in a not-currently-attached state + - previous attachment history + - whether an AWS service expects to recycle or clean up this ENI + - application, failover, or operational intent + - exact pricing impact + +APIs: + - ec2:DescribeNetworkInterfaces +""" + from datetime import datetime, timezone -from typing import List +from typing import List, Optional import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +# --- Module-level constants --- -def find_detached_enis( - session: boto3.Session, - region: str, - max_age_days: int = 60, -) -> List[Finding]: - """ - Find Elastic Network Interfaces (ENIs) currently detached and 60+ days old. +# Sole eligible top-level ENI status per EC2 documented contract. +_ELIGIBLE_STATUS = "available" - Detached ENIs incur small hourly charges and are often forgotten - after failed deployments or when infrastructure is torn down incompletely. +# Attachment states that are consistent with an available (not-currently-attached) ENI. +# Any attachment_status outside this set is treated as inconsistent → SKIP ITEM. +_ALLOWED_ATTACHMENT_STATUSES: frozenset = frozenset({None, "detached"}) - IMPORTANT: AWS does not expose "detached since" timestamp, so we use ENI - creation age as a proxy. This is conservative - an ENI created 60 days ago - and currently detached is worth reviewing even if it was recently detached. 
+_FINDING_TITLE = "ENI not currently attached review candidate" - SAFE RULE (review-only): - - ENI Status == 'available' (not attached) - - ENI creation age >= max_age_days threshold (NOT detached duration) - - Excludes AWS infrastructure ENIs (NAT Gateway, Load Balancers, VPC Endpoints) - - INCLUDES requester-managed ENIs (Lambda, ECS, RDS) - these are user resources! +_SIGNAL_NOT_CURRENTLY_ATTACHED = ( + "ENI top-level Status is 'available' (not currently attached per EC2 documented contract)" +) +_SIGNAL_REQUESTER_MANAGED = "ENI is requester-managed (created by an AWS service on your behalf)" - IAM permissions: - - ec2:DescribeNetworkInterfaces +_SIGNALS_NOT_CHECKED = ( + "How long the ENI has been in a not-currently-attached state", + "Previous attachment history", + "Whether an AWS service expects to recycle or clean up this ENI", + "Application, failover, or operational intent", + "Exact pricing impact", +) + + +def _str(value: object) -> Optional[str]: + """Return value as str only when it is a non-empty string; else None.""" + return value if isinstance(value, str) and value else None + + +def _bool_only(value: object) -> Optional[bool]: + """Return value only when it is an actual bool; else None.""" + return value if isinstance(value, bool) else None + + +def _normalize_eni(eni: object) -> Optional[dict]: + """Normalize a raw DescribeNetworkInterfaces item to the canonical field shape. + + Returns None when the item is not a dict or required identity/state fields + are absent — the caller must skip the item. All rule logic must operate + only on the returned normalized dict. 
""" - ec2 = session.client("ec2", region_name=region) + if not isinstance(eni, dict): + return None - now = datetime.now(timezone.utc) - findings: List[Finding] = [] + # --- Identity fields (required; absent → skip item) --- + network_interface_id = _str(eni.get("NetworkInterfaceId")) + if network_interface_id is None: + return None - try: - # Note: describe_network_interfaces supports pagination - paginator = ec2.get_paginator("describe_network_interfaces") + # --- State fields (required; absent → skip item) --- + normalized_status = _str(eni.get("Status")) + if normalized_status is None: + return None - for page in paginator.paginate(): - for eni in page.get("NetworkInterfaces", []): - # Only consider detached ENIs - if eni.get("Status") != "available": - continue - - # Exclude AWS infrastructure ENIs using InterfaceType - # These are ENIs for AWS infrastructure that users don't manage - interface_type = eni.get("InterfaceType", "interface") - if interface_type in [ - "nat_gateway", # NAT Gateway ENI (AWS infrastructure) - "load_balancer", # ELB/ALB/NLB ENI (AWS infrastructure) - "gateway_load_balancer", # Gateway Load Balancer - "gateway_load_balancer_endpoint", # GWLB endpoint - "vpc_endpoint", # VPC endpoint interface (AWS infrastructure) - ]: - continue - - # Note: We DO want to flag RequesterManaged ENIs with InterfaceType="interface" - # These are user resources created by Lambda, ECS, RDS, etc. - common waste! 
- - # Calculate age since creation - create_time = eni.get("CreateTime") - if create_time is None: - age_days = 0 - else: - try: - age_days = (now - create_time).days - except TypeError: - age_days = 0 - - # Apply age threshold (skip if too young) - if age_days < max_age_days: - continue - - # Build evidence (be honest about what we're measuring) - signals_used = [ - "ENI status is 'available' (currently detached)", - f"ENI was created {age_days} days ago and is currently detached", - ] - - # Note: We cannot measure "detached duration" because AWS doesn't expose DetachTime - # We use creation age as a conservative proxy - - if eni.get("RequesterManaged"): - signals_used.append( - "ENI is requester-managed (created by AWS service such as Lambda/ECS)" - ) - - # Check if ENI has any tags - tags = eni.get("TagSet", []) - if not tags: - signals_used.append("ENI has no tags (ownership unclear)") - - evidence = Evidence( - signals_used=signals_used, - signals_not_checked=[ - "Detached duration (AWS does not expose DetachTime)", - "Previous attachment history", - "AWS Hyperplane ENI reuse behavior (undocumented retention)", - "Future planned attachments", - "Application-level usage", - "Manual operational workflows", - ], - time_window=f"{max_age_days} days since creation", - ) + # --- Attachment fields (all optional → null) --- + raw_attachment = eni.get("Attachment") + if isinstance(raw_attachment, dict): + attachment_status = _str(raw_attachment.get("Status")) + attachment_id = _str(raw_attachment.get("AttachmentId")) + attachment_instance_id = _str(raw_attachment.get("InstanceId")) + attachment_instance_owner_id = _str(raw_attachment.get("InstanceOwnerId")) + else: + attachment_status = None + attachment_id = None + attachment_instance_id = None + attachment_instance_owner_id = None + + # --- Ownership / service-context fields (optional → null) --- + interface_type = _str(eni.get("InterfaceType")) + requester_managed = _bool_only(eni.get("RequesterManaged")) + + 
raw_operator = eni.get("Operator") + if isinstance(raw_operator, dict): + operator_managed = _bool_only(raw_operator.get("Managed")) + operator_principal = _str(raw_operator.get("Principal")) + else: + operator_managed = None + operator_principal = None + + # --- Network / resource-metadata fields (optional → null / []) --- + description = _str(eni.get("Description")) + availability_zone = _str(eni.get("AvailabilityZone")) + subnet_id = _str(eni.get("SubnetId")) + vpc_id = _str(eni.get("VpcId")) + private_ip_address = _str(eni.get("PrivateIpAddress")) + + raw_association = eni.get("Association") + public_ip = _str(raw_association.get("PublicIp")) if isinstance(raw_association, dict) else None + + raw_tag_set = eni.get("TagSet") + tag_set: list = raw_tag_set if isinstance(raw_tag_set, list) else [] + + return { + "resource_id": network_interface_id, + "network_interface_id": network_interface_id, + "normalized_status": normalized_status, + "attachment_status": attachment_status, + "attachment_id": attachment_id, + "attachment_instance_id": attachment_instance_id, + "attachment_instance_owner_id": attachment_instance_owner_id, + "interface_type": interface_type, + "requester_managed": requester_managed, + "operator_managed": operator_managed, + "operator_principal": operator_principal, + "description": description, + "availability_zone": availability_zone, + "subnet_id": subnet_id, + "vpc_id": vpc_id, + "private_ip_address": private_ip_address, + "public_ip": public_ip, + "tag_set": tag_set, + } - # Build details - details = { - "status": eni.get("Status"), - "age_days": age_days, - "create_time": create_time.isoformat() if create_time else None, - "interface_type": interface_type, - "requester_managed": eni.get("RequesterManaged", False), - "vpc_id": eni.get("VpcId"), - "subnet_id": eni.get("SubnetId"), - "availability_zone": eni.get("AvailabilityZone"), - } - - description = eni.get("Description", "") - if description: - details["description"] = description - - if 
tags: - details["tags"] = tags - - # Include private IP if present - private_ips = eni.get("PrivateIpAddresses", []) - if private_ips: - details["private_ip"] = private_ips[0].get("PrivateIpAddress") - - findings.append( - Finding( - provider="aws", - rule_id="aws.ec2.eni.detached", - resource_type="aws.ec2.network_interface", - resource_id=eni["NetworkInterfaceId"], - region=region, - title="Detached Network Interface (Review Recommended)", - summary=f"ENI created {age_days} days ago and currently detached (incurs small hourly charges)", - reason=f"ENI is {age_days} days old and currently in detached state, incurring charges", - risk=RiskLevel.LOW, - confidence=ConfidenceLevel.MEDIUM, # Medium because we can't measure detached duration - detected_at=now, - evidence=evidence, - details=details, - ) - ) - except ClientError as e: - if e.response["Error"]["Code"] == "UnauthorizedOperation": +def find_detached_enis( + session: boto3.Session, + region: str, +) -> List[Finding]: + ec2 = session.client("ec2", region_name=region) + + try: + paginator = ec2.get_paginator("describe_network_interfaces") + pages = list(paginator.paginate()) + except ClientError as exc: + if exc.response["Error"]["Code"] == "UnauthorizedOperation": raise PermissionError( "Missing required IAM permission: ec2:DescribeNetworkInterfaces" - ) from e + ) from exc + raise + except BotoCoreError: raise + now = datetime.now(timezone.utc) + findings: List[Finding] = [] + + for page in pages: + for raw_eni in page.get("NetworkInterfaces", []): + # --- Step 1: Normalize --- + n = _normalize_eni(raw_eni) + if n is None: + continue + + # --- Step 2: EXCLUSION RULES --- + + # EXCLUSION: top-level status must be the sole eligible state + if n["normalized_status"] != _ELIGIBLE_STATUS: + continue + + # EXCLUSION: attachment_status must be in the allowed set (None or "detached"). 
+ # Any other value — known conflict statuses or unknown/malformed strings — + # is inconsistent with the available state → SKIP ITEM. + if n["attachment_status"] not in _ALLOWED_ATTACHMENT_STATUSES: + continue + + # --- Detection path: detached-eni-review-candidate --- + + signals_used = [_SIGNAL_NOT_CURRENTLY_ATTACHED] + if n["requester_managed"] is True: + signals_used.append(_SIGNAL_REQUESTER_MANAGED) + if n["operator_managed"] is True: + principal = n["operator_principal"] or "unknown" + signals_used.append(f"ENI is operator-managed (operator principal: {principal})") + + findings.append( + Finding( + provider="aws", + rule_id="aws.ec2.eni.detached", + resource_type="aws.ec2.network_interface", + resource_id=n["network_interface_id"], + region=region, + estimated_monthly_cost_usd=None, + title=_FINDING_TITLE, + summary=( + f"ENI {n['network_interface_id']} Status is 'available' — " + "not currently attached per DescribeNetworkInterfaces" + ), + reason=( + "ENI Status is 'available' — not currently attached " + "per DescribeNetworkInterfaces" + ), + risk=RiskLevel.LOW, + confidence=ConfidenceLevel.HIGH, + detected_at=now, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=list(_SIGNALS_NOT_CHECKED), + ), + details={ + "evaluation_path": "detached-eni-review-candidate", + "network_interface_id": n["network_interface_id"], + "normalized_status": n["normalized_status"], + "attachment_status": n["attachment_status"], + "attachment_id": n["attachment_id"], + "attachment_instance_id": n["attachment_instance_id"], + "attachment_instance_owner_id": n["attachment_instance_owner_id"], + "interface_type": n["interface_type"], + "requester_managed": n["requester_managed"], + "operator_managed": n["operator_managed"], + "operator_principal": n["operator_principal"], + "availability_zone": n["availability_zone"], + "subnet_id": n["subnet_id"], + "vpc_id": n["vpc_id"], + "private_ip_address": n["private_ip_address"], + "public_ip": n["public_ip"], + 
"description": n["description"], + "tag_set": n["tag_set"], + }, + ) + ) + return findings diff --git a/cleancloud/providers/aws/rules/nat_gateway_idle.py b/cleancloud/providers/aws/rules/nat_gateway_idle.py index ba9b270..101f981 100644 --- a/cleancloud/providers/aws/rules/nat_gateway_idle.py +++ b/cleancloud/providers/aws/rules/nat_gateway_idle.py @@ -1,385 +1,378 @@ +""" +Rule: aws.ec2.nat_gateway.idle + + (spec — docs/specs/aws/nat_gateway_idle.md) + +Intent: + Detect NAT Gateways that are currently available, old enough to evaluate, + and show no trusted CloudWatch traffic/activity evidence during the + configured observation window, so they can be reviewed as possible cleanup + candidates. + +Exclusions: + - nat_gateway_id absent (malformed identity) + - normalized_state absent (missing current-state signal) + - normalized_state != "available" + - create_time_utc absent, naive, or in the future + - age_days < idle_days_threshold (too new to evaluate) + - any required CloudWatch metric has no datapoints (insufficient evidence) + - any required metric shows activity > 0 + +Detection: + - nat_gateway_id present, normalized_state == "available" + - age_days >= idle_days_threshold + - all 5 required CloudWatch metrics return datapoints and are all zero + +Key rules: + - Missing CloudWatch datapoints → SKIP ITEM (not zero). + - CloudWatch API failure → FAIL RULE (not LOW-confidence finding). + - 5 required metrics: BytesOutToDestination, BytesInFromSource, + BytesInFromDestination, BytesOutToSource (Sum), ActiveConnectionCount (Maximum). + - Route-table context is contextual only; absence does not substitute + for CloudWatch evidence. + - Naive CreateTime → SKIP ITEM. + - estimated_monthly_cost_usd = None. + - Confidence: HIGH (no route ref confirmed) or MEDIUM (route ref or unavailable). + - Risk: MEDIUM. 
+ +Blind spots: + - planned future usage or DR/failover intent + - seasonal or cyclical usage outside the observation window + - organizational ownership or business intent + - exact region-specific pricing impact + +APIs: + - ec2:DescribeNatGateways + - cloudwatch:GetMetricStatistics + - ec2:DescribeRouteTables (contextual) +""" + from datetime import datetime, timedelta, timezone -from typing import List +from typing import List, Optional, Tuple import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +# --- Module-level constants --- -def find_idle_nat_gateways( - session: boto3.Session, - region: str, - idle_days: int = 14, -) -> List[Finding]: - """ - Find NAT Gateways with no traffic for `idle_days` days. - - NAT Gateways incur a fixed hourly charge (~$0.045/hr ≈ $32.85/month) regardless of - connectivity type (public or private), plus per-GB data processing fees - ($0.045/GB for public, $0.01/GB for private). The hourly cost alone makes idle - gateways a meaningful waste. - - Detection logic: - - NAT Gateway state is 'available' - - Older than `idle_days` (noise-reduction heuristic — new gateways may not have had - time for operators to configure routing; this is NOT an AWS-defined grace period) - - All four CloudWatch byte metrics are zero over `idle_days` - - Not referenced by any VPC route table (corroborating idle signal) - - Notes on accuracy: - - CloudWatch NAT Gateway metrics are eventually consistent and can lag by minutes - to hours. More importantly, datapoints can be absent entirely for periods of low - activity — CloudWatch omits zero-value datapoints rather than publishing them. 
- Missing datapoints are treated as zero by this rule, which means low-but-nonzero - traffic could be missed if it falls within a gap in metric publication. - - Daily (86400s) granularity is used; within-day bursts contribute to that day's Sum, - but a burst that happens to fall in a metric-publication gap may not appear. - - Zero traffic may be intentional: DR/failover, pre-warmed infrastructure, or - seasonal traffic patterns. Always review before acting. - - Elastic IPs associated with a public NAT Gateway may incur idle charges even - after the gateway is deleted; check and release them separately. - - Route table references are a corroborating signal only — a referenced route table - does not prove the gateway is actively used; it only means it is reachable. - - IAM permissions: - - ec2:DescribeNatGateways - - ec2:DescribeRouteTables - - cloudwatch:GetMetricStatistics - """ - ec2 = session.client("ec2", region_name=region) - cloudwatch = session.client("cloudwatch", region_name=region) +_DEFAULT_IDLE_DAYS_THRESHOLD = 14 +_ELIGIBLE_STATE = "available" +_CW_NAMESPACE = "AWS/NATGateway" +_CW_DIM = "NatGatewayId" - now = datetime.now(timezone.utc) - findings: List[Finding] = [] +# Required metrics in evaluation order: (metric_name, statistic, detail_key) +_REQUIRED_METRICS: Tuple = ( + ("BytesOutToDestination", "Sum", "bytes_out_to_destination"), + ("BytesInFromSource", "Sum", "bytes_in_from_source"), + ("BytesInFromDestination", "Sum", "bytes_in_from_destination"), + ("BytesOutToSource", "Sum", "bytes_out_to_source"), + ("ActiveConnectionCount", "Maximum", "active_connection_count_max"), +) - try: - paginator = ec2.get_paginator("describe_nat_gateways") +_FINDING_TITLE = "Idle NAT Gateway review candidate" - for page in paginator.paginate(): - for nat_gw in page.get("NatGateways", []): - # Only check available gateways - state = nat_gw.get("State") - if state != "available": - continue - - nat_gw_id = nat_gw["NatGatewayId"] - connectivity_type = 
nat_gw.get("ConnectivityType", "public") - - # Calculate age - create_time = nat_gw.get("CreateTime") - age_days = 0 - if create_time: - try: - age_days = (now - create_time).days - except TypeError: - pass - - # Noise-reduction heuristic: skip recently created gateways. - # New gateways may not have had time for route tables to be configured. - # This is NOT an AWS-defined grace period — adjust idle_days as needed. - if age_days < idle_days: - continue - - # Check CloudWatch metrics for traffic — all 4 direction metrics - ( - has_traffic, - fetch_failed, - bytes_out_dest, - bytes_in_src, - bytes_in_dest, - bytes_out_src, - ) = _check_nat_gateway_traffic(cloudwatch, nat_gw_id, idle_days) - - # has_traffic=True with fetch_failed=False → confirmed traffic, skip. - # has_traffic=True with fetch_failed=True → metric unreadable; create a - # LOW-confidence finding so the operator knows to verify manually. - if has_traffic and not fetch_failed: - continue - - # Check route table associations — a NAT GW not referenced by any route - # table is not reachable from any subnet (strong corroborating idle signal). 
- in_route_tables, route_table_check_failed = _check_route_table_references( - ec2, nat_gw_id - ) +_SIGNALS_NOT_CHECKED = ( + "Planned future usage or infrastructure pre-warming intent", + "Disaster recovery or failover intent — zero traffic may be intentional", + "Seasonal or cyclical usage patterns outside the observation window", + "Organizational ownership or business intent", + "Exact region-specific pricing impact", +) - # Get VPC and subnet info - vpc_id = nat_gw.get("VpcId") - subnet_id = nat_gw.get("SubnetId") - - # Get Elastic IP info - addresses = nat_gw.get("NatGatewayAddresses", []) - eip_info = [] - for addr in addresses: - eip_info.append( - { - "allocation_id": addr.get("AllocationId"), - "public_ip": addr.get("PublicIp"), - "private_ip": addr.get("PrivateIp"), - } - ) - - if fetch_failed: - confidence = ConfidenceLevel.LOW - risk = RiskLevel.MEDIUM - title = "NAT Gateway Requires Traffic Verification" - summary = ( - f"NAT Gateway '{nat_gw_id}' could not be verified as idle — " - "CloudWatch traffic metrics were unreadable (transient/throttle error)." - ) - reason = "NAT Gateway traffic metrics could not be fetched; idle status is unconfirmed" - elif not in_route_tables and not route_table_check_failed: - # Zero traffic confirmed AND no route table references the gateway — - # two independent signals agree; HIGH confidence and risk warranted. - confidence = ConfidenceLevel.HIGH - risk = RiskLevel.HIGH - title = f"Idle NAT Gateway (No Traffic for {idle_days}+ Days, Not Routed)" - summary = ( - f"NAT Gateway '{nat_gw_id}' has had no traffic for {idle_days}+ days " - "and is not referenced by any route table — it is unreachable and billing." 
- ) - reason = ( - f"NAT Gateway has zero traffic for {idle_days}+ days " - "and is not referenced by any VPC route table" - ) - else: - confidence = ConfidenceLevel.MEDIUM - risk = RiskLevel.MEDIUM - title = f"Idle NAT Gateway (No Traffic for {idle_days}+ Days)" - summary = ( - f"NAT Gateway '{nat_gw_id}' has had no traffic for " - f"{idle_days}+ days and is incurring ~$32.85/month in base charges." - ) - reason = f"NAT Gateway has zero traffic for {idle_days}+ days" - - signals = [] - if fetch_failed: - signals.append( - "CloudWatch traffic metrics unreadable (transient/throttle error) — " - "traffic status unverified" - ) - else: - signals.append( - f"No traffic detected for {idle_days} days (all 4 CloudWatch direction metrics; " - "note: metrics are eventually consistent and may lag by minutes to hours)" - ) - signals.append(f"BytesOutToDestination: {bytes_out_dest} bytes") - signals.append(f"BytesInFromSource: {bytes_in_src} bytes") - signals.append(f"BytesInFromDestination: {bytes_in_dest} bytes") - signals.append(f"BytesOutToSource: {bytes_out_src} bytes") - - signals.append(f"NAT Gateway state is '{state}'") - signals.append(f"Connectivity type: {connectivity_type}") - - if not in_route_tables and not route_table_check_failed: - signals.append( - "Not referenced by any VPC route table — gateway is unreachable from all subnets" - ) - elif in_route_tables: - signals.append("Referenced by at least one VPC route table") - - if age_days > 0: - signals.append(f"NAT Gateway is {age_days} days old") - - signals_not_checked = [ - "Planned future usage", - "Disaster recovery or failover intent — zero traffic may be intentional for DR standby", - "Blue/green deployment scenarios", - "Seasonal traffic patterns", - "Development/staging environment cycles", - ] - if fetch_failed: - signals_not_checked.insert( - 0, - "Traffic metrics (BytesOutToDestination, BytesInFromSource, " - "BytesInFromDestination, BytesOutToSource) — CloudWatch fetch failed; " - "traffic status 
unverified", - ) - if route_table_check_failed: - signals_not_checked.append( - "Route table associations — DescribeRouteTables failed; " - "could not confirm whether gateway is referenced" - ) - if connectivity_type == "public" and eip_info: - signals_not_checked.append( - "Elastic IP idle charges — associated EIPs may incur additional cost " - "even after the NAT Gateway is deleted; release them separately" - ) - - evidence = Evidence( - signals_used=signals, - signals_not_checked=signals_not_checked, - time_window=f"{idle_days} days", - ) - data_processing_note = ( - "$0.045/GB for public, $0.01/GB for private" - if connectivity_type == "public" - else "$0.01/GB (private NAT Gateway)" - ) - tags = nat_gw.get("Tags", []) - name_tag = next((t["Value"] for t in tags if t.get("Key") == "Name"), None) - - findings.append( - Finding( - provider="aws", - rule_id="aws.ec2.nat_gateway.idle", - resource_type="aws.ec2.nat_gateway", - resource_id=nat_gw_id, - region=region, - estimated_monthly_cost_usd=32.85, - title=title, - summary=summary, - reason=reason, - risk=risk, - confidence=confidence, - detected_at=now, - evidence=evidence, - details={ - "name": name_tag, - "connectivity_type": connectivity_type, - "state": state, - "age_days": age_days, - "create_time": (create_time.isoformat() if create_time else None), - "vpc_id": vpc_id, - "subnet_id": subnet_id, - "elastic_ips": eip_info, - "in_route_tables": in_route_tables, - "bytes_out_to_destination": bytes_out_dest, - "bytes_in_from_source": bytes_in_src, - "bytes_in_from_destination": bytes_in_dest, - "bytes_out_to_source": bytes_out_src, - "idle_days_threshold": idle_days, - "estimated_monthly_cost": ( - f"~$32.85/month base hourly cost (us-east-1 on-demand; " - f"region-dependent; excludes data processing charges: " - f"{data_processing_note})" - ), - "tags": tags, - }, - ) - ) +def _str(value: object) -> Optional[str]: + """Return value as str only when it is a non-empty string; else None.""" + return value if 
isinstance(value, str) and value else None - except ClientError as e: - code = e.response["Error"]["Code"] - if code in ("UnauthorizedOperation", "AccessDenied"): - raise PermissionError( - "Missing required IAM permissions: " - "ec2:DescribeNatGateways, ec2:DescribeRouteTables, cloudwatch:GetMetricStatistics" - ) from e - raise - return findings +def _choose_period(idle_days: int) -> int: + """Return a deterministic Period compliant with CloudWatch retention rules. + idle_days * 86400 is a multiple of 60, 300, and 3600, satisfying all three + CloudWatch retention constraints for the chosen lookback window. + """ + return idle_days * 86400 -def _check_route_table_references(ec2, nat_gw_id: str) -> tuple: - """Check whether any VPC route table has a route pointing to this NAT Gateway. - A NAT Gateway not referenced by any route table is unreachable from all subnets - and is therefore a strong corroborating idle signal. +def _normalize_nat_gw(item: object, now_utc: datetime) -> Optional[dict]: + """Normalize a raw DescribeNatGateways item to the canonical field shape. - Returns (in_route_tables: bool, check_failed: bool). - check_failed is True if DescribeRouteTables raised a non-permission error. + Returns None when the item is not a dict or required identity/state/age fields + are absent or invalid — the caller must skip the item. + All rule logic must operate only on the returned normalized dict. """ - try: - response = ec2.describe_route_tables( - Filters=[{"Name": "route.nat-gateway-id", "Values": [nat_gw_id]}] - ) - return len(response.get("RouteTables", [])) > 0, False - except ClientError as e: - code = e.response["Error"]["Code"] - if code in ("AccessDenied", "UnauthorizedOperation"): - # Surface as a check failure rather than raising — missing this permission - # degrades the signal but should not abort the scan. 
- return False, True - return False, True - - -def _check_nat_gateway_traffic( + if not isinstance(item, dict): + return None + + # --- Identity fields (required; absent → skip) --- + nat_gateway_id = _str(item.get("NatGatewayId")) + if nat_gateway_id is None: + return None + + # --- State fields (required; absent → skip) --- + normalized_state = _str(item.get("State")) + if normalized_state is None: + return None + + # --- CreateTime (required; absent, naive, or future → skip) --- + raw_ct = item.get("CreateTime") + if not isinstance(raw_ct, datetime): + return None + if raw_ct.tzinfo is None: + # Naive datetime — cannot safely compare to UTC; treat as absent → skip. + return None + create_time_utc = raw_ct.astimezone(timezone.utc) + if create_time_utc > now_utc: + # Future CreateTime is invalid → skip. + return None + age_days = int((now_utc - create_time_utc).total_seconds() // 86400) + + # --- Core context fields (optional → null / []) --- + connectivity_type = _str(item.get("ConnectivityType")) + availability_mode = _str(item.get("AvailabilityMode")) + vpc_id = _str(item.get("VpcId")) + subnet_id = _str(item.get("SubnetId")) + + raw_addresses = item.get("NatGatewayAddresses") + nat_gateway_addresses = raw_addresses if isinstance(raw_addresses, list) else [] + + raw_appliances = item.get("AttachedAppliances") + attached_appliances = raw_appliances if isinstance(raw_appliances, list) else [] + + raw_tags = item.get("Tags") + tag_set: list = raw_tags if isinstance(raw_tags, list) else [] + + return { + "resource_id": nat_gateway_id, + "nat_gateway_id": nat_gateway_id, + "normalized_state": normalized_state, + "create_time_utc": create_time_utc, + "age_days": age_days, + "connectivity_type": connectivity_type, + "availability_mode": availability_mode, + "vpc_id": vpc_id, + "subnet_id": subnet_id, + "nat_gateway_addresses": nat_gateway_addresses, + "attached_appliances": attached_appliances, + "auto_scaling_ips": _str(item.get("AutoScalingIps")), + 
"auto_provision_zones": _str(item.get("AutoProvisionZones")), + "tag_set": tag_set, + } + + +def _get_metric_value( cloudwatch, nat_gw_id: str, - days: int, -) -> tuple: - """ - Check if NAT Gateway has had any traffic in the past `days` days. - - AWS publishes four directional metrics for NAT Gateways: - - BytesOutToDestination: private subnet → internet (outbound requests) - - BytesInFromSource: private subnet → NAT GW (client-side inbound) - - BytesInFromDestination: internet → NAT GW (return traffic) - - BytesOutToSource: NAT GW → private subnet (return traffic to client) - - All four are checked to avoid missing asymmetric or long-lived connections - where only return-path traffic falls within the observation window. + metric_name: str, + statistic: str, + start_time: datetime, + end_time: datetime, + period: int, +) -> Optional[float]: + """Fetch a single metric over the observation window. - Returns (has_traffic, fetch_failed, bytes_out_dest, bytes_in_src, bytes_in_dest, bytes_out_src). - fetch_failed is True if any metric fetch encountered a transient/throttle error. - When fetch_failed is True, has_traffic is True (conservative) — but the caller - should surface this uncertainty rather than silently treating the gateway as active. + Returns None if no datapoints (insufficient evidence → caller must SKIP ITEM). + Returns the aggregated value (>= 0.0) if datapoints are present. + Raises ClientError / BotoCoreError / PermissionError on API failure (caller → FAIL RULE). 
""" - now = datetime.now(timezone.utc) - start_time = now - timedelta(days=days) - - def _fetch(metric_name: str) -> tuple: - return _get_metric_sum( - cloudwatch, "AWS/NATGateway", metric_name, "NatGatewayId", nat_gw_id, start_time, now + try: + resp = cloudwatch.get_metric_statistics( + Namespace=_CW_NAMESPACE, + MetricName=metric_name, + Dimensions=[{"Name": _CW_DIM, "Value": nat_gw_id}], + StartTime=start_time, + EndTime=end_time, + Period=period, + Statistics=[statistic], ) + except ClientError as exc: + if exc.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): + raise PermissionError( + "Missing required IAM permission: cloudwatch:GetMetricStatistics" + ) from exc + raise + except BotoCoreError: + raise - out_dest, err1 = _fetch("BytesOutToDestination") - in_src, err2 = _fetch("BytesInFromSource") - in_dest, err3 = _fetch("BytesInFromDestination") - out_src, err4 = _fetch("BytesOutToSource") + datapoints = resp.get("Datapoints", []) + if not datapoints: + return None # No datapoints → insufficient evidence → SKIP ITEM - fetch_failed = err1 or err2 or err3 or err4 - has_traffic = (out_dest > 0) or (in_src > 0) or (in_dest > 0) or (out_src > 0) - return has_traffic, fetch_failed, out_dest, in_src, in_dest, out_src + if statistic == "Sum": + return sum(dp.get("Sum", 0.0) for dp in datapoints) + if statistic == "Maximum": + return max(dp.get("Maximum", 0.0) for dp in datapoints) + # Fallback for any other statistic (not expected in this rule) + return sum(dp.get(statistic, 0.0) for dp in datapoints) -def _get_metric_sum( - cloudwatch, - namespace: str, - metric_name: str, - dimension_name: str, - dimension_value: str, - start_time: datetime, - end_time: datetime, -) -> tuple: - """Get sum of a CloudWatch metric over the time period. - - Returns (value: int, fetch_error: bool). - - value: total bytes summed across all datapoints (0 if no data). - - fetch_error: True if a non-permission error occurred (throttle, transient, etc.). 
- When fetch_error is True, value is 1 (conservative — avoids false positives), - but the caller should surface this to the operator via evidence. +def _check_route_table_references(ec2, nat_gw_id: str) -> Tuple[Optional[bool], bool]: + """Check whether any VPC route table references this NAT Gateway. + + Returns (route_table_referenced, check_succeeded): + - (False, True) — no route table references found + - (True, True) — at least one route table references the NAT Gateway + - (None, False) — DescribeRouteTables failed; context unavailable """ try: - response = cloudwatch.get_metric_statistics( - Namespace=namespace, - MetricName=metric_name, - Dimensions=[ - { - "Name": dimension_name, - "Value": dimension_value, - } - ], - StartTime=start_time, - EndTime=end_time, - Period=86400, # 1 day in seconds - Statistics=["Sum"], + response = ec2.describe_route_tables( + Filters=[{"Name": "route.nat-gateway-id", "Values": [nat_gw_id]}] ) + return len(response.get("RouteTables", [])) > 0, True + except Exception: + return None, False - datapoints = response.get("Datapoints", []) - total = sum(dp.get("Sum", 0) for dp in datapoints) - return int(total), False - except ClientError as e: - if e.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): +def find_idle_nat_gateways( + session: boto3.Session, + region: str, + idle_days_threshold: int = _DEFAULT_IDLE_DAYS_THRESHOLD, +) -> List[Finding]: + ec2 = session.client("ec2", region_name=region) + cloudwatch = session.client("cloudwatch", region_name=region) + + try: + paginator = ec2.get_paginator("describe_nat_gateways") + pages = list(paginator.paginate()) + except ClientError as exc: + if exc.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): raise PermissionError( - "Missing required IAM permissions: cloudwatch:GetMetricStatistics" - ) from e - # Other errors (throttle, transient): assume traffic to avoid false positives, - # but flag the error so the caller can surface it. 
- return 1, True + "Missing required IAM permission: ec2:DescribeNatGateways" + ) from exc + raise + except BotoCoreError: + raise + + now = datetime.now(timezone.utc) + window_start = now - timedelta(seconds=idle_days_threshold * 86400) + period = _choose_period(idle_days_threshold) + findings: List[Finding] = [] + + for page in pages: + for raw_item in page.get("NatGateways", []): + # --- Step 1: Normalize --- + n = _normalize_nat_gw(raw_item, now) + if n is None: + continue + + # --- Step 2: EXCLUSION RULES --- + + # EXCLUSION: state must be available + if n["normalized_state"] != _ELIGIBLE_STATE: + continue + + # EXCLUSION: too young to evaluate + if n["age_days"] < idle_days_threshold: + continue + + # --- Step 3: CloudWatch metrics (FAIL RULE on error; SKIP ITEM if no data) --- + metric_values: dict = {} + skip_insufficient = False + + for metric_name, statistic, detail_key in _REQUIRED_METRICS: + value = _get_metric_value( + cloudwatch, + n["nat_gateway_id"], + metric_name, + statistic, + window_start, + now, + period, + ) + if value is None: + # No datapoints for this metric → insufficient evidence → SKIP ITEM + skip_insufficient = True + break + metric_values[detail_key] = value + + if skip_insufficient: + continue + + # EXCLUSION: any metric shows activity + if any(v > 0 for v in metric_values.values()): + continue + + # --- Step 4: Route-table context (optional; failure degrades context only) --- + route_table_referenced, rt_check_ok = _check_route_table_references( + ec2, n["nat_gateway_id"] + ) + + # --- Step 5: Confidence --- + if rt_check_ok and route_table_referenced is False: + confidence = ConfidenceLevel.HIGH + rt_signal = "No VPC route table references this NAT Gateway" + elif rt_check_ok and route_table_referenced is True: + confidence = ConfidenceLevel.MEDIUM + rt_signal = "At least one VPC route table still references this NAT Gateway" + else: + confidence = ConfidenceLevel.MEDIUM + rt_signal = "Route-table context unavailable 
(DescribeRouteTables failed)" + + reason = ( + f"NAT Gateway has no trusted CloudWatch traffic signal " + f"in the last {idle_days_threshold} days" + ) + + signals_used = [ + f"NAT Gateway State is '{_ELIGIBLE_STATE}' (able to process traffic)", + f"Age is {n['age_days']} days, meeting the {idle_days_threshold}-day threshold", + "All 5 required CloudWatch activity metrics returned datapoints and are zero " + f"for the {idle_days_threshold}-day observation window " + "(CleanCloud-derived idle heuristic)", + rt_signal, + ] + + findings.append( + Finding( + provider="aws", + rule_id="aws.ec2.nat_gateway.idle", + resource_type="aws.ec2.nat_gateway", + resource_id=n["nat_gateway_id"], + region=region, + estimated_monthly_cost_usd=None, + title=_FINDING_TITLE, + summary=( + f"NAT Gateway {n['nat_gateway_id']} has no trusted CloudWatch " + f"traffic signal in the last {idle_days_threshold} days" + ), + reason=reason, + risk=RiskLevel.MEDIUM, + confidence=confidence, + detected_at=now, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=list(_SIGNALS_NOT_CHECKED), + time_window=f"{idle_days_threshold} days", + ), + details={ + "evaluation_path": "idle-nat-gateway-review-candidate", + "nat_gateway_id": n["nat_gateway_id"], + "normalized_state": n["normalized_state"], + "create_time": n["create_time_utc"].isoformat(), + "age_days": n["age_days"], + "idle_days_threshold": idle_days_threshold, + "connectivity_type": n["connectivity_type"], + "availability_mode": n["availability_mode"], + "vpc_id": n["vpc_id"], + "subnet_id": n["subnet_id"], + "bytes_out_to_destination": metric_values["bytes_out_to_destination"], + "bytes_in_from_source": metric_values["bytes_in_from_source"], + "bytes_in_from_destination": metric_values["bytes_in_from_destination"], + "bytes_out_to_source": metric_values["bytes_out_to_source"], + "active_connection_count_max": metric_values["active_connection_count_max"], + "route_table_referenced": route_table_referenced, + 
"nat_gateway_addresses": n["nat_gateway_addresses"], + "attached_appliances": n["attached_appliances"], + "auto_scaling_ips": n["auto_scaling_ips"], + "auto_provision_zones": n["auto_provision_zones"], + "tag_set": n["tag_set"], + }, + ) + ) + + return findings diff --git a/cleancloud/providers/aws/rules/rds_idle.py b/cleancloud/providers/aws/rules/rds_idle.py index d0e6287..dc4183d 100644 --- a/cleancloud/providers/aws/rules/rds_idle.py +++ b/cleancloud/providers/aws/rules/rds_idle.py @@ -1,542 +1,350 @@ +""" +Rule: aws.rds.instance.idle + + (spec — docs/specs/aws/rds_idle.md) + +Intent: + Detect provisioned standalone DB instances that are currently available, old + enough to evaluate, and show no trusted CloudWatch client-connection activity + for the configured observation window, so they can be reviewed as possible + cleanup candidates. + +Exclusions: + - db_instance_id absent (malformed identity) + - normalized_status absent (missing current-state signal) + - normalized_status != "available" + - db_cluster_identifier present (cluster member) + - read_replica_source_db_instance_identifier present (read replica) + - read_replica_source_db_cluster_identifier present (cross-cluster read replica) + - instance_create_time_utc absent, naive, or in the future + - age_days < idle_days_threshold (too new to evaluate) + - DatabaseConnections returns no datapoints (insufficient evidence) + - any DatabaseConnections Maximum > 0 + +Detection: + - db_instance_id present, normalized_status == "available" + - standalone (not a cluster member or read replica) + - age_days >= idle_days_threshold + - DatabaseConnections Maximum returns datapoints and all are zero + +Key rules: + - DatabaseConnections Maximum is the sole required activity metric. + - Missing CloudWatch datapoints → SKIP ITEM (not zero). + - CloudWatch API failure → FAIL RULE (not LOW-confidence finding). + - No CPU or I/O thresholds — not required for baseline eligibility. + - estimated_monthly_cost_usd = None. 
+ - Confidence: MEDIUM always. + - Risk: MEDIUM always. + +Blind spots: + - Sessions without network connections that the database hasn't cleaned up + - Sessions created by the database engine for its own purposes + - Sessions created by parallel execution capabilities or job schedulers + - Amazon RDS connections + - RDS Proxy, PgBouncer, and application connection pools that can hide real + usage while keeping observed client connection counts low or zero + - Planned future usage or disaster recovery intent + - Exact region-specific pricing impact + +APIs: + - rds:DescribeDBInstances + - cloudwatch:GetMetricStatistics +""" + from datetime import datetime, timedelta, timezone from typing import List, Optional import boto3 -from botocore.exceptions import ClientError +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel from cleancloud.core.evidence import Evidence from cleancloud.core.finding import Finding from cleancloud.core.risk import RiskLevel +# --- Module-level constants --- -def find_idle_rds_instances( - session: boto3.Session, - region: str, - idle_days: int = 14, -) -> List[Finding]: - """ - Find RDS instances with zero database connections for `idle_days` days. - - RDS instances incur significant hourly charges depending on instance class - and engine. Cost estimates in this rule are based on MySQL/PostgreSQL us-east-1 - on-demand pricing — Oracle, SQL Server, and other engines have different rates. 
- - Detection logic: - - Instance status is 'available' - - Instance is older than `idle_days` days - - CloudWatch DatabaseConnections metric sum is 0 over `idle_days` period - - Not a read replica (ReadReplicaSourceDBInstanceIdentifier is empty) - - Not an Aurora cluster member (DBClusterIdentifier is empty) - - Confidence tiers: - - MEDIUM: Zero connections + low peak CPU + low storage I/O (three-signal agreement) - - LOW: Zero connections only, or CPU/IO data unavailable - - Risk tiers: - - HIGH: MEDIUM confidence (multiple corroborating signals) - - MEDIUM: LOW confidence (connections only, or metrics partially unavailable) - - Notes on accuracy: - - DatabaseConnections == 0 does not guarantee no activity. Connection poolers - (RDS Proxy, PgBouncer, application-level pools) may route queries without - maintaining persistent connections visible to CloudWatch. Always verify - application-level usage before acting on this finding. - - CloudWatch publishes DatabaseConnections as a daily Sum. Zero datapoints - (not zero values) means metric visibility is absent — this rule surfaces - those as LOW-confidence "metrics unavailable" findings rather than skipping, - so operators know the instance was not verified. - - Storage cost estimate uses gp2/gp3 at ~$0.115/GB-month (us-east-1). io1/io2 - volumes are more expensive (~$0.125/GB + IOPS charge). Multi-AZ doubling is - approximate: actual billing includes standby compute + storage nuances. - - Automated backups and snapshots may justify retaining an otherwise idle instance. 
- - IAM permissions: - - rds:DescribeDBInstances - - cloudwatch:GetMetricStatistics - """ - rds = session.client("rds", region_name=region) - cloudwatch = session.client("cloudwatch", region_name=region) - - now = datetime.now(timezone.utc) - findings: List[Finding] = [] +_DEFAULT_IDLE_DAYS_THRESHOLD = 14 +_ELIGIBLE_STATUS = "available" +_CW_NAMESPACE = "AWS/RDS" +_CW_DIM = "DBInstanceIdentifier" - try: - paginator = rds.get_paginator("describe_db_instances") +_FINDING_TITLE = "Idle RDS instance review candidate" - for page in paginator.paginate(): - for instance in page.get("DBInstances", []): - # Only check available instances - status = instance.get("DBInstanceStatus") - if status != "available": - continue - - db_instance_id = instance["DBInstanceIdentifier"] - - # Skip read replicas - if instance.get("ReadReplicaSourceDBInstanceIdentifier"): - continue - - # Skip Aurora cluster members — Aurora instances are managed at - # the cluster level and may show zero connections on individual - # reader/writer nodes even when the cluster is active. 
- if instance.get("DBClusterIdentifier"): - continue - - tags = instance.get("TagList", []) - - # Calculate age - create_time = instance.get("InstanceCreateTime") - age_days = 0 - if create_time: - try: - age_days = (now - create_time).days - except TypeError: - pass - - # Skip if instance is younger than the idle threshold - if age_days < idle_days: - continue - - start_time = now - timedelta(days=idle_days) - - # Check CloudWatch metrics for connections - total_connections, conn_datapoints = _get_metric_sum( - cloudwatch, - "AWS/RDS", - "DatabaseConnections", - "DBInstanceIdentifier", - db_instance_id, - start_time, - now, - ) +_SIGNALS_NOT_CHECKED = ( + "Sessions without network connections that the database hasn't cleaned up", + "Sessions created by the database engine for its own purposes", + "Sessions created by parallel execution capabilities", + "Sessions created by the database engine job scheduler", + "Amazon RDS connections", + "RDS Proxy, PgBouncer, and application connection pools that can hide real " + "usage while keeping observed client connection counts low or zero", + "Planned future usage or disaster recovery intent", + "Exact region-specific pricing impact", +) - if total_connections > 0: - continue - - # Gather instance details (needed for both finding paths below) - engine = instance.get("Engine", "unknown") - engine_version = instance.get("EngineVersion", "unknown") - instance_class = instance.get("DBInstanceClass", "unknown") - multi_az = instance.get("MultiAZ", False) - storage_gb = instance.get("AllocatedStorage", 0) - compute_cost = _estimate_monthly_cost(instance_class, multi_az) - compute_cost_usd = _estimate_monthly_cost_usd(instance_class, multi_az) - storage_cost_usd = round(storage_gb * 0.115, 2) if storage_gb else 0.0 - storage_cost_usd = storage_cost_usd * 2 if multi_az else storage_cost_usd - total_cost_usd = ( - (compute_cost_usd + storage_cost_usd) if compute_cost_usd is not None else None - ) - if conn_datapoints == 0: - # 
Zero datapoints means CloudWatch has no visibility — we cannot - # distinguish "truly idle" from "metrics not published". Surface a - # LOW-confidence finding so the operator knows to verify manually. - findings.append( - Finding( - provider="aws", - rule_id="aws.rds.instance.idle", - resource_type="aws.rds.instance", - resource_id=db_instance_id, - region=region, - estimated_monthly_cost_usd=total_cost_usd, - title="RDS Instance Requires Connection Verification", - summary=( - f"RDS instance '{db_instance_id}' ({engine}, {instance_class}) " - f"has no CloudWatch connection data — idle status is unconfirmed." - ), - reason=( - "DatabaseConnections metric returned zero datapoints; " - "idle status cannot be confirmed" - ), - risk=RiskLevel.MEDIUM, - confidence=ConfidenceLevel.LOW, - detected_at=now, - evidence=Evidence( - signals_used=[ - f"Instance status is '{status}'", - f"Engine: {engine} {engine_version}", - f"Instance class: {instance_class}", - f"Instance is {age_days} days old", - ], - signals_not_checked=[ - "DatabaseConnections — CloudWatch returned zero datapoints; " - "metric may not be published for this instance", - "CPU utilisation", - "Storage I/O (ReadIOPS / WriteIOPS)", - "Planned future usage", - "Disaster recovery intent", - "Automated backups or snapshots that may justify retention", - ], - time_window=f"{idle_days} days", - ), - details={ - "engine": f"{engine} {engine_version}", - "instance_class": instance_class, - f"connections_{idle_days}d": None, - "connections_datapoints": 0, - "metrics_note": ( - "DatabaseConnections returned zero datapoints — " - "metric visibility absent; idle status unconfirmed" - ), - "estimated_compute_cost": compute_cost, - "estimated_storage_cost": ( - f"~${storage_cost_usd:.2f}/month " - "(gp2/gp3 approx ~$0.115/GB; io1/io2 higher)" - ), - "multi_az": multi_az, - "allocated_storage_gb": storage_gb, - "age_days": age_days, - "idle_days_threshold": idle_days, - **({"tags": {t["Key"]: t["Value"] for t in tags}} if 
tags else {}), - }, - ) - ) - continue - - # Corroborating signal 1: peak CPU utilisation. - # Use Maximum (not Average) to catch bursty workloads — a single - # high-CPU day within the window means the instance was active. - peak_cpu, cpu_datapoints = _get_peak_cpu( - cloudwatch, db_instance_id, start_time, now - ) +def _str(value: object) -> Optional[str]: + """Return value as str only when it is a non-empty string; else None.""" + return value if isinstance(value, str) and value else None - if peak_cpu is not None and peak_cpu >= 5.0: - # CPU active despite zero connections — unusual but skip to avoid FP - continue - # Corroborating signal 2: storage I/O (ReadIOPS + WriteIOPS). - # If connections == 0 but IOPS > 0, a background process or connection - # pooler may be active. If IOPS == 0, it corroborates idle. - has_io, read_iops, write_iops, io_datapoints = _get_storage_io( - cloudwatch, db_instance_id, start_time, now - ) +def _choose_period(idle_days: int) -> int: + """Return a deterministic Period compliant with CloudWatch retention rules. 
- if has_io: - # Storage I/O active despite zero connections — skip to avoid FP - continue - - signals_not_checked = [ - "Planned future usage", - "Disaster recovery intent", - "Seasonal traffic patterns", - "Application deployment cycles", - ( - "Connection poolers or proxies (RDS Proxy, PgBouncer) — " - "may route queries without visible persistent connections" - ), - "External readers or indirect usage patterns", - "Automated backups or snapshots that may justify retention", - ] - - signals = [ - f"Zero database connections for {idle_days} days " - f"({conn_datapoints} of up to {idle_days} daily datapoints)", - f"DatabaseConnections sum: {total_connections}", - f"Instance status is '{status}'", - f"Engine: {engine} {engine_version}", - f"Instance class: {instance_class}", - ] - - cpu_confirmed = False - if peak_cpu is not None: - signals.append( - f"Peak daily CPU utilisation: {peak_cpu:.1f}% " - f"(threshold: 5%) — corroborating idle signal" - ) - cpu_confirmed = True - else: - signals_not_checked.append("CPU utilisation (metric unavailable)") - - io_confirmed = False - if io_datapoints > 0: - signals.append( - f"Storage I/O: ReadIOPS={read_iops}, WriteIOPS={write_iops} " - f"— corroborating idle signal" - ) - io_confirmed = True - else: - signals_not_checked.append("Storage I/O (ReadIOPS / WriteIOPS — no data)") - - if age_days > 0: - signals.append(f"Instance is {age_days} days old") - - # MEDIUM confidence only when all three signals agree: zero connections, - # low peak CPU, and low storage I/O. Any missing or inconclusive - # corroborating signal leaves confidence at LOW. - # Risk mirrors confidence: HIGH for MEDIUM confidence, MEDIUM for LOW. 
- if cpu_confirmed and io_confirmed: - confidence = ConfidenceLevel.MEDIUM - risk = RiskLevel.HIGH - else: - confidence = ConfidenceLevel.LOW - risk = RiskLevel.MEDIUM - - evidence = Evidence( - signals_used=signals, - signals_not_checked=signals_not_checked, - time_window=f"{idle_days} days", - ) + idle_days * 86400 is a multiple of 60, 300, and 3600, satisfying all three + CloudWatch retention constraints for the chosen lookback window. + """ + return idle_days * 86400 - findings.append( - Finding( - provider="aws", - rule_id="aws.rds.instance.idle", - resource_type="aws.rds.instance", - resource_id=db_instance_id, - region=region, - estimated_monthly_cost_usd=total_cost_usd, - title=f"Idle RDS Instance (No Connections for {idle_days}+ Days)", - summary=( - f"RDS instance '{db_instance_id}' ({engine}, {instance_class}) " - f"has had zero database connections for {idle_days}+ days." - ), - reason=f"RDS instance has zero connections for {idle_days}+ days", - risk=risk, - confidence=confidence, - detected_at=now, - evidence=evidence, - details={ - "engine": f"{engine} {engine_version}", - "instance_class": instance_class, - f"connections_{idle_days}d": total_connections, - "connections_datapoints": conn_datapoints, - "peak_cpu_pct": round(peak_cpu, 2) if peak_cpu is not None else None, - "read_iops": read_iops, - "write_iops": write_iops, - "estimated_compute_cost": ( - compute_cost - + " (MySQL/PostgreSQL us-east-1 rate; engine-dependent)" - if compute_cost and "varies" not in compute_cost - else compute_cost - ), - "estimated_storage_cost": ( - f"~${storage_cost_usd:.2f}/month " - "(gp2/gp3 approx ~$0.115/GB; io1/io2 higher; Multi-AZ doubling approximate)" - ), - "multi_az": multi_az, - "allocated_storage_gb": storage_gb, - "age_days": age_days, - "idle_days_threshold": idle_days, - **({"tags": {t["Key"]: t["Value"] for t in tags}} if tags else {}), - }, - ) - ) - except ClientError as e: - code = e.response["Error"]["Code"] - if code in ("UnauthorizedOperation", 
"AccessDenied"): - raise PermissionError( - "Missing required IAM permissions: " - "rds:DescribeDBInstances, cloudwatch:GetMetricStatistics" - ) from e - raise +def _normalize_db_instance(item: object, now_utc: datetime) -> Optional[dict]: + """Normalize a raw DescribeDBInstances item to the canonical field shape. - return findings + Returns None when required identity/state/age fields are absent or invalid — + the caller must skip the item. All rule logic must operate only on the + returned normalized dict. + """ + if not isinstance(item, dict): + return None + + # --- Identity fields (required; absent → skip) --- + db_instance_id = _str(item.get("DBInstanceIdentifier")) + if db_instance_id is None: + return None + + # --- State fields (required; absent → skip) --- + normalized_status = _str(item.get("DBInstanceStatus")) + if normalized_status is None: + return None + + # --- InstanceCreateTime (required; absent, naive, or future → skip) --- + raw_ct = item.get("InstanceCreateTime") + if not isinstance(raw_ct, datetime): + return None + if raw_ct.tzinfo is None: + # Naive datetime — cannot safely compare to UTC; treat as absent → skip. + return None + instance_create_time_utc = raw_ct.astimezone(timezone.utc) + if instance_create_time_utc > now_utc: + # Future InstanceCreateTime is invalid → skip. 
+ return None + age_days = int((now_utc - instance_create_time_utc).total_seconds() // 86400) + + # --- Scope fields (optional → null) --- + db_cluster_identifier = _str(item.get("DBClusterIdentifier")) + read_replica_source_db_instance_identifier = _str( + item.get("ReadReplicaSourceDBInstanceIdentifier") + ) + read_replica_source_db_cluster_identifier = _str( + item.get("ReadReplicaSourceDBClusterIdentifier") + ) + + # --- Core context fields (optional → null / []) --- + engine = _str(item.get("Engine")) + engine_version = _str(item.get("EngineVersion")) + db_instance_class = _str(item.get("DBInstanceClass")) + storage_type = _str(item.get("StorageType")) + dbi_resource_id = _str(item.get("DbiResourceId")) + db_instance_arn = _str(item.get("DBInstanceArn")) + + raw_multi_az = item.get("MultiAZ") + multi_az = raw_multi_az if isinstance(raw_multi_az, bool) else None + + raw_storage = item.get("AllocatedStorage") + allocated_storage_gib = raw_storage if isinstance(raw_storage, int) else None + + raw_tags = item.get("TagList") + tag_set: list = raw_tags if isinstance(raw_tags, list) else [] + + return { + "resource_id": db_instance_id, + "db_instance_id": db_instance_id, + "normalized_status": normalized_status, + "instance_create_time_utc": instance_create_time_utc, + "age_days": age_days, + "db_cluster_identifier": db_cluster_identifier, + "read_replica_source_db_instance_identifier": read_replica_source_db_instance_identifier, + "read_replica_source_db_cluster_identifier": read_replica_source_db_cluster_identifier, + "engine": engine, + "engine_version": engine_version, + "db_instance_class": db_instance_class, + "multi_az": multi_az, + "allocated_storage_gib": allocated_storage_gib, + "storage_type": storage_type, + "dbi_resource_id": dbi_resource_id, + "db_instance_arn": db_instance_arn, + "tag_set": tag_set, + } -def _get_metric_sum( +def _get_database_connections_max( cloudwatch, - namespace: str, - metric_name: str, - dimension_name: str, - dimension_value: 
str, + db_instance_id: str, start_time: datetime, end_time: datetime, -) -> tuple: - """Get sum of a CloudWatch metric over the time period. - - Returns (value, datapoint_count): - - value: 1 if any datapoint has Sum > 0, else 0 - - datapoint_count: number of datapoints returned (0 = no metric visibility) + period: int, +) -> Optional[float]: + """Fetch DatabaseConnections Maximum over the observation window. - Zero datapoints is distinct from all-zero datapoints — the caller should - handle datapoint_count == 0 as "unknown" rather than "confirmed idle". + Returns None if no datapoints (insufficient evidence → caller must SKIP ITEM). + Returns the maximum value (>= 0.0) if datapoints are present. + Raises ClientError / BotoCoreError / PermissionError on API failure (caller → FAIL RULE). """ try: - response = cloudwatch.get_metric_statistics( - Namespace=namespace, - MetricName=metric_name, - Dimensions=[ - { - "Name": dimension_name, - "Value": dimension_value, - } - ], + resp = cloudwatch.get_metric_statistics( + Namespace=_CW_NAMESPACE, + MetricName="DatabaseConnections", + Dimensions=[{"Name": _CW_DIM, "Value": db_instance_id}], StartTime=start_time, EndTime=end_time, - Period=86400, # 1 day in seconds - Statistics=["Sum"], + Period=period, + Statistics=["Maximum"], ) - - datapoints = response.get("Datapoints", []) - count = len(datapoints) - # Use any() instead of sum() — missing datapoints are omitted by - # CloudWatch (not returned as 0), so summing could mask gaps. - # any() is safer: if any single day had connections, it's not idle. 
- if any(dp.get("Sum", 0) > 0 for dp in datapoints): - return 1, count - return 0, count - - except ClientError as e: - if e.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): + except ClientError as exc: + if exc.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): raise PermissionError( - "Missing required IAM permissions: cloudwatch:GetMetricStatistics" - ) from e - # Other errors (throttle, transient): assume connections to avoid false positives - return 1, -1 - - -def _get_peak_cpu( - cloudwatch, db_instance_id: str, start_time: datetime, end_time: datetime -) -> tuple: - """Return (peak_cpu_pct, datapoint_count) for the RDS instance over the window. - - Uses Maximum statistic (not Average) to catch bursty workloads — a single - high-CPU day means the instance was active during that window. + "Missing required IAM permission: cloudwatch:GetMetricStatistics" + ) from exc + raise + except BotoCoreError: + raise - Returns (None, 0) on error — caller treats None as CPU signal unavailable. - """ - try: - response = cloudwatch.get_metric_statistics( - Namespace="AWS/RDS", - MetricName="CPUUtilization", - Dimensions=[{"Name": "DBInstanceIdentifier", "Value": db_instance_id}], - StartTime=start_time, - EndTime=end_time, - Period=86400, - Statistics=["Maximum"], - ) - datapoints = response.get("Datapoints", []) - if not datapoints: - return None, 0 - peak = max(dp["Maximum"] for dp in datapoints) - return peak, len(datapoints) - except ClientError: - return None, 0 + datapoints = resp.get("Datapoints", []) + if not datapoints: + return None # No datapoints → insufficient evidence → SKIP ITEM + return max(dp.get("Maximum", 0.0) for dp in datapoints) -def _get_storage_io( - cloudwatch, db_instance_id: str, start_time: datetime, end_time: datetime -) -> tuple: - """Return (has_io, read_iops_sum, write_iops_sum, datapoint_count). - Checks ReadIOPS and WriteIOPS over the window. 
Any non-zero IOPS means - the storage was active, which is a strong signal of actual database usage - even if DatabaseConnections appears zero (e.g. via connection poolers). +def find_idle_rds_instances( + session: boto3.Session, + region: str, + idle_days_threshold: int = _DEFAULT_IDLE_DAYS_THRESHOLD, +) -> List[Finding]: + rds = session.client("rds", region_name=region) + cloudwatch = session.client("cloudwatch", region_name=region) - datapoint_count is the combined count from both metrics; 0 means no data. - """ try: + paginator = rds.get_paginator("describe_db_instances") + pages = list(paginator.paginate()) + except ClientError as exc: + if exc.response["Error"]["Code"] in ("AccessDenied", "UnauthorizedOperation"): + raise PermissionError( + "Missing required IAM permission: rds:DescribeDBInstances" + ) from exc + raise + except BotoCoreError: + raise - def _fetch(metric_name: str) -> tuple: - response = cloudwatch.get_metric_statistics( - Namespace="AWS/RDS", - MetricName=metric_name, - Dimensions=[{"Name": "DBInstanceIdentifier", "Value": db_instance_id}], - StartTime=start_time, - EndTime=end_time, - Period=86400, - Statistics=["Sum"], - ) - datapoints = response.get("Datapoints", []) - total = sum(dp.get("Sum", 0) for dp in datapoints) - return int(total), len(datapoints) - - read_iops, read_count = _fetch("ReadIOPS") - write_iops, write_count = _fetch("WriteIOPS") - has_io = (read_iops > 0) or (write_iops > 0) - return has_io, read_iops, write_iops, read_count + write_count - - except ClientError: - # On error, assume no IO (don't skip the finding) but return 0 datapoints - # so the caller knows IO was not verified. - return False, 0, 0, 0 - + now = datetime.now(timezone.utc) + window_start = now - timedelta(seconds=idle_days_threshold * 86400) + period = _choose_period(idle_days_threshold) + findings: List[Finding] = [] -def _estimate_monthly_cost(instance_class: str, multi_az: bool) -> str: - """Rough monthly cost estimate based on instance class. 
+ for page in pages: + for raw_item in page.get("DBInstances", []): + # --- Step 1: Normalize --- + n = _normalize_db_instance(raw_item, now) + if n is None: + continue + + # --- Step 2: EXCLUSION RULES --- + + # EXCLUSION: status must be available + if n["normalized_status"] != _ELIGIBLE_STATUS: + continue + + # EXCLUSION: not standalone — cluster member or any form of read replica + if n["db_cluster_identifier"] is not None: + continue + if n["read_replica_source_db_instance_identifier"] is not None: + continue + if n["read_replica_source_db_cluster_identifier"] is not None: + continue + + # EXCLUSION: too young to evaluate + if n["age_days"] < idle_days_threshold: + continue + + # --- Step 3: CloudWatch (FAIL RULE on error; SKIP ITEM if no data) --- + db_connections_max = _get_database_connections_max( + cloudwatch, + n["db_instance_id"], + window_start, + now, + period, + ) - Rates are approximate MySQL/PostgreSQL us-east-1 on-demand pricing. - Oracle, SQL Server, and other engines have different (often higher) costs. 
- """ - cost_map = { - "db.t3.micro": 12, - "db.t3.small": 24, - "db.t3.medium": 49, - "db.t3.large": 97, - "db.t3.xlarge": 194, - "db.t4g.micro": 11, - "db.t4g.small": 22, - "db.t4g.medium": 44, - "db.t4g.large": 88, - "db.t4g.xlarge": 175, - "db.r5.large": 172, - "db.r5.xlarge": 344, - "db.r5.2xlarge": 688, - "db.r6g.large": 155, - "db.r6g.xlarge": 310, - "db.r6i.large": 184, - "db.r6i.xlarge": 368, - "db.r6i.2xlarge": 736, - "db.r7g.large": 175, - "db.r7g.xlarge": 350, - "db.r7g.2xlarge": 700, - "db.m5.large": 125, - "db.m5.xlarge": 250, - "db.m6g.large": 113, - "db.m6g.xlarge": 225, - "db.m6i.large": 139, - "db.m6i.xlarge": 277, - "db.m6i.2xlarge": 554, - "db.m7g.large": 130, - "db.m7g.xlarge": 260, - "db.m7g.2xlarge": 520, - } + if db_connections_max is None: + # No datapoints → insufficient trusted evidence → SKIP ITEM + continue + + # EXCLUSION: observed client connections + if db_connections_max > 0: + continue + + # --- Step 4: EMIT --- + signals_used = [ + f"DB instance Status is '{_ELIGIBLE_STATUS}' (able to accept connections)", + "DB instance is standalone — not a DB cluster member or read replica", + f"Age is {n['age_days']} days, meeting the {idle_days_threshold}-day threshold", + f"DatabaseConnections Maximum was zero across all datapoints in the " + f"{idle_days_threshold}-day observation window " + "(CleanCloud-derived idle heuristic based on observed client network connections)", + ] + + findings.append( + Finding( + provider="aws", + rule_id="aws.rds.instance.idle", + resource_type="aws.rds.instance", + resource_id=n["db_instance_id"], + region=region, + estimated_monthly_cost_usd=None, + title=_FINDING_TITLE, + summary=( + f"RDS instance {n['db_instance_id']} has no observed client " + f"connection activity in the last {idle_days_threshold} days" + ), + reason=( + f"DB instance has no observed client connection activity " + f"via DatabaseConnections Maximum in the last {idle_days_threshold} days" + ), + risk=RiskLevel.MEDIUM, + 
confidence=ConfidenceLevel.MEDIUM, + detected_at=now, + evidence=Evidence( + signals_used=signals_used, + signals_not_checked=list(_SIGNALS_NOT_CHECKED), + time_window=f"{idle_days_threshold} days", + ), + details={ + "evaluation_path": "idle-rds-instance-review-candidate", + "db_instance_id": n["db_instance_id"], + "normalized_status": n["normalized_status"], + "instance_create_time": n["instance_create_time_utc"].isoformat(), + "age_days": n["age_days"], + "idle_days_threshold": idle_days_threshold, + "engine": n["engine"], + "engine_version": n["engine_version"], + "db_instance_class": n["db_instance_class"], + "database_connections_max": db_connections_max, + "db_cluster_identifier": n["db_cluster_identifier"], + "read_replica_source_db_instance_identifier": n[ + "read_replica_source_db_instance_identifier" + ], + "read_replica_source_db_cluster_identifier": n[ + "read_replica_source_db_cluster_identifier" + ], + "multi_az": n["multi_az"], + "allocated_storage_gib": n["allocated_storage_gib"], + "storage_type": n["storage_type"], + "dbi_resource_id": n["dbi_resource_id"], + "db_instance_arn": n["db_instance_arn"], + "tag_set": n["tag_set"], + }, + ) + ) - base_cost = cost_map.get(instance_class) - if base_cost: - total = base_cost * 2 if multi_az else base_cost - return f"~${total}/month (region dependent)" - return "Cost varies by instance class (region dependent)" - - -def _estimate_monthly_cost_usd(instance_class: str, multi_az: bool) -> Optional[float]: - """Numeric monthly cost estimate for aggregation.""" - cost_map = { - "db.t3.micro": 12, - "db.t3.small": 24, - "db.t3.medium": 49, - "db.t3.large": 97, - "db.t3.xlarge": 194, - "db.t4g.micro": 11, - "db.t4g.small": 22, - "db.t4g.medium": 44, - "db.t4g.large": 88, - "db.t4g.xlarge": 175, - "db.r5.large": 172, - "db.r5.xlarge": 344, - "db.r5.2xlarge": 688, - "db.r6g.large": 155, - "db.r6g.xlarge": 310, - "db.r6i.large": 184, - "db.r6i.xlarge": 368, - "db.r6i.2xlarge": 736, - "db.r7g.large": 175, - 
"db.r7g.xlarge": 350, - "db.r7g.2xlarge": 700, - "db.m5.large": 125, - "db.m5.xlarge": 250, - "db.m6g.large": 113, - "db.m6g.xlarge": 225, - "db.m6i.large": 139, - "db.m6i.xlarge": 277, - "db.m6i.2xlarge": 554, - "db.m7g.large": 130, - "db.m7g.xlarge": 260, - "db.m7g.2xlarge": 520, - } - base_cost = cost_map.get(instance_class) - if base_cost: - return float(base_cost * 2 if multi_az else base_cost) - return None + return findings diff --git a/docs/configuration.md b/docs/configuration.md index 9737ff7..7a5259d 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -166,9 +166,9 @@ See [rules.md](rules.md) for the full list of rule IDs and their supported param | Param | Rule ID | Default | Description | |---|---|---|---| -| `idle_days` | `aws.elbv2.load_balancer.idle` | 14 | Days of zero traffic before flagging | -| `idle_days` | `aws.ec2.nat_gateway.idle` | 14 | Days of zero traffic before flagging | -| `idle_days` | `aws.rds.instance.idle` | 14 | Days of no connections before flagging | +| `idle_days_threshold` | `aws.elbv2.load_balancer.idle` | 14 | Days of zero traffic before flagging | +| `idle_days_threshold` | `aws.ec2.nat_gateway.idle` | 14 | Days of zero traffic before flagging | +| `idle_days_threshold` | `aws.rds.instance.idle` | 14 | Days of no connections before flagging | | `idle_days` | `aws.sagemaker.endpoint.idle` | 14 | Days of zero invocations before flagging | | `idle_days` | `aws.sagemaker.notebook.idle` | 14 | Days since last control-plane activity before flagging | | `idle_days` | `azure.aml.compute.idle` | 14 | Days of no runs before flagging | diff --git a/docs/specs/aws/elastic_ip_unattached.md b/docs/specs/aws/elastic_ip_unattached.md new file mode 100644 index 0000000..1555725 --- /dev/null +++ b/docs/specs/aws/elastic_ip_unattached.md @@ -0,0 +1,350 @@ +# aws.ec2.elastic_ip.unattached — Canonical Rule Specification + +## 1. 
Intent + +Detect Elastic IP address records that are currently allocated to the account in the +scanned Region and are not currently associated with an instance or network interface, +so they can be reviewed for possible release if no longer needed. + +This is a **read-only review-candidate rule**. It is not a delete-safe rule and not +proof that release is operationally safe. + +--- + +## 2. AWS API Grounding + +Based on official EC2/VPC API and user-guide behavior. + +### Key DescribeAddresses fields + +| Field | Behaviour | +|---|---| +| `AllocationId` | Unique allocation identifier; present for VPC-domain addresses | +| `PublicIp` | Public IPv4 address string; always present | +| `CarrierIp` | Carrier IP for Wavelength zones; present when applicable | +| `AssociationId` | Present when currently associated with an instance or ENI | +| `InstanceId` | Present when associated with a specific instance | +| `NetworkInterfaceId` | Present when associated with a specific ENI | +| `PrivateIpAddress` | Present when currently associated | +| `Domain` | `"vpc"` or `"standard"` | +| `NetworkBorderGroup` | Network border group the address is in | +| `PublicIpv4Pool` | BYOIP pool identifier | +| `CustomerOwnedIp` | Customer-owned IP for Outposts | +| `CustomerOwnedIpv4Pool` | Customer-owned IP pool | +| `SubnetId` | Subnet for Wavelength addresses | +| `NetworkInterfaceOwnerId` | Owner of the associated ENI | +| `ServiceManaged` | Whether AWS manages the association on behalf of a service | +| `Tags` | Key-value tags | + +### Critical AWS facts + +1. **No `AllocationTime`** — the documented `Address` shape does not include + `AllocationTime`, `AssociationTime`, `DisassociationTime`, or any canonical + `unattached_since`/`allocated_since` timestamp. + +2. **Billing** — AWS charges for all public IPv4 addresses, including Elastic IPs, + whether associated or unassociated. Unattached state alone is not a unique billing + trigger. + +3. 
**DescribeAddresses** is non-paginated; one successful call returns all addresses + for the scanned Region and caller scope. + +4. **Region-specific** — Elastic IPs are Region-scoped. Results from one Region + cannot prove absence from another. + +5. **Association signals** — an address can be associated via `AssociationId`, + `InstanceId`, `NetworkInterfaceId`, or `PrivateIpAddress`. All four must be absent + for the address to be considered currently unattached. + +### Rule-design consequence + +- Current association state is the only baseline eligibility signal this rule can + prove from `DescribeAddresses`. +- No temporal predicate (allocation age, unattached duration) may be required for + baseline eligibility. +- Undocumented fields such as `AllocationTime` must not be used. + +--- + +## 3. Scope + +**Included:** +- All addresses returned by `DescribeAddresses` with a stable `resource_id` +- `currently_associated == False` (all canonical association fields absent) + +**Excluded:** +- Addresses missing `AllocationId`, `PublicIp`, and `CarrierIp` (no stable identity) +- Addresses with any canonical association field present + +--- + +## 4. Canonical Definitions + +| Term | Definition | +|---|---| +| `resource_id` | `AllocationId` → `PublicIp` → `CarrierIp` → absent (skip item if absent) | +| `currently_associated` | `True` when any of `association_id`, `instance_id`, `network_interface_id`, `private_ip_address` is present | +| `currently_unattached` | All four canonical association fields absent | + +--- + +## 5. Signal Model (Strict Separation) + +### Normalization Contract + +All rule logic must operate on normalized fields only. 
+ +**Identity fields:** + +| Field | Derivation | +|---|---| +| `resource_id` | `address.AllocationId` → `address.PublicIp` → `address.CarrierIp` → absent (skip item if absent) | +| `allocation_id` | `address.AllocationId` → `null` | +| `public_ip` | `address.PublicIp` → `null` | +| `carrier_ip` | `address.CarrierIp` → `null` | + +**Association fields (all must be absent for currently_unattached):** + +| Field | Derivation | +|---|---| +| `association_id` | `address.AssociationId` → `null` | +| `instance_id` | `address.InstanceId` → `null` | +| `network_interface_id` | `address.NetworkInterfaceId` → `null` | +| `private_ip_address` | `address.PrivateIpAddress` → `null` | + +**Context fields:** + +| Field | Derivation | +|---|---| +| `domain` | `address.Domain` → `null` | +| `network_interface_owner_id` | `address.NetworkInterfaceOwnerId` → `null` | +| `network_border_group` | `address.NetworkBorderGroup` → `null` | +| `public_ipv4_pool` | `address.PublicIpv4Pool` → `null` | +| `customer_owned_ip` | `address.CustomerOwnedIp` → `null` | +| `customer_owned_ipv4_pool` | `address.CustomerOwnedIpv4Pool` → `null` | +| `subnet_id` | `address.SubnetId` → `null` | +| `service_managed` | `address.ServiceManaged` → `null` | +| `tags` | `address.Tags` → `[]` | + +String-valued fields must be normalized only from non-empty strings. +Malformed or unexpected field types must not be converted into positive eligibility evidence. + +### A. EXCLUSION_RULES + +| Condition | Result | +|---|---| +| `resource_id` absent | **SKIP** (malformed identity) | +| any canonical association field present | **SKIP** (currently associated) | + +There must be **no** exclusion for `service_managed`, tags, `domain`, BYOIP fields, +`network_border_group`, or `public_ipv4_pool`. + +### B. DETECTION_SIGNAL + +| Condition | Result | +|---|---| +| `resource_id` present, all association fields absent | **EMIT** | + +### C. CONTEXTUAL_SIGNALS (non-detecting) + +All context fields are evidence/details only. 
`network_interface_owner_id` and +`service_managed` are contextual and must not affect eligibility. + +--- + +## 6. Evaluation Order (Mandatory) + +1. Call `DescribeAddresses` once for the scanned Region; fail rule on error. +2. Validate that the top-level `Addresses` field is present and iterable; fail rule if not. +3. Normalize each address item; skip items that return `None`. +4. For each normalized address, apply EXCLUSION_RULES sequentially. +5. Emit findings for remaining eligible addresses. + +No raw AWS field access after Step 3. + +--- + +## 7. Confidence Model + +| Condition | Confidence | +|---|---| +| All exclusion checks passed | `HIGH` | + +High confidence refers to current unattached state, not to release safety or +business irrelevance. `DescribeAddresses` deterministically reports association state. + +--- + +## 8. Risk Model + +| Condition | Risk | +|---|---| +| Finding emitted | `LOW` | + +--- + +## 9. Cost Model + +AWS charges for all public IPv4 addresses regardless of association state. Unattached +state alone is not a unique billing trigger. + +- Do not present unattached state as a unique billing trigger. +- Do not hardcode a fixed estimate such as `$3.75/month`. +- `estimated_monthly_cost_usd` must be `None`. + +--- + +## 10. Failure Behavior + +### Required API + +- `ec2:DescribeAddresses` — failure → **FAIL RULE** + +### Response integrity + +- `Addresses` key absent from response → **FAIL RULE** +- `Addresses` value not iterable as a list → **FAIL RULE** + +### Item-level + +- Address missing stable identity (`resource_id` absent) → **SKIP** (not FAIL RULE) +- Malformed contextual fields → **SKIP** that field; never fail the rule + +--- + +## 11. Blind Spots + +Every finding must disclose in `signals_not_checked`: + +1. Future planned attachment or operational reserve intent not known +2. DNS / allowlist / manual failover dependencies +3. Application-level use of the reserved public IP +4. 
Exact monthly pricing from the current pricing page +5. Service-managed lifecycle expectations outside current association state + +--- + +## 12. Evidence Contract + +Every finding **must** include all of the following (null allowed, never omitted): + +| Field | Requirement | +|---|---| +| `evaluation_path` | Exactly `"unattached-eip-review-candidate"` | +| `resource_id` | Always present | +| `allocation_id` | Present or `null` | +| `public_ip` | Present or `null` | +| `carrier_ip` | Present or `null` | +| `domain` | Present or `null` | +| `currently_associated` | Always `false` | +| `association_id` | Always `null` | +| `instance_id` | Always `null` | +| `network_interface_id` | Always `null` | +| `private_ip_address` | Always `null` | + +Optional contextual fields: +- `network_interface_owner_id`, `network_border_group`, `public_ipv4_pool`, + `customer_owned_ip`, `customer_owned_ipv4_pool`, `subnet_id`, `service_managed`, `tags` + +--- + +## 13. Title and Reason Contract + +| Field | Value | +|---|---| +| `title` | `"Unattached Elastic IP review candidate"` | +| `reason` | `"Address has no current association per DescribeAddresses"` | + +**Hard rules:** +- Do NOT call the address "safe to release" +- Do NOT claim an allocation age or unattached duration +- Do NOT use `AllocationTime` as evidence + +--- + +## 14. API and IAM Contract + +**Required:** `ec2:DescribeAddresses` + +### API usage constraints + +- `DescribeAddresses` has no documented pagination; one call defines the full address set +- No undocumented fields (`AllocationTime`, etc.) may be used + +--- + +## 15. Acceptance Scenarios + +### Must emit + +1. VPC EIP with `AllocationId`, `PublicIp`, no association fields → EMIT HIGH +2. Standard-domain address with no `InstanceId` or other association field → EMIT HIGH +3. BYOIP / customer-owned / `service_managed` contextual fields present, no association fields → EMIT +4. 
`CarrierIp` only (no `AllocationId`, no `PublicIp`) → EMIT; `CarrierIp` is `resource_id` + +### Must skip + +1. Address with `AssociationId` → SKIP +2. Address with `NetworkInterfaceId` but no `AssociationId` → SKIP +3. Address with `InstanceId` but no `AssociationId` → SKIP +4. Address with `PrivateIpAddress` but no `AssociationId` → SKIP +5. Address missing `AllocationId`, `PublicIp`, and `CarrierIp` → SKIP + +### Must fail + +1. `DescribeAddresses` unauthorized or request failure → FAIL RULE +2. Response missing `Addresses` key → FAIL RULE +3. Response `Addresses` not a list → FAIL RULE + +### Must NOT happen + +1. Temporal threshold applied to baseline eligibility +2. `AllocationTime` used for any eligibility or evidence logic +3. `$3.75` or any hardcoded cost in `estimated_monthly_cost_usd` +4. `domain == "standard"` used as an exclusion +5. `service_managed` used as an exclusion +6. `AssociationId` as the sole association check (other association fields ignored) + +--- + +## 16. In-File Contract + +``` +Rule: aws.ec2.elastic_ip.unattached + +Intent: + Detect Elastic IP address records that are currently allocated to the account + in the scanned Region and are not currently associated with an instance or + network interface. + +Exclusions: + - resource_id absent (malformed identity) + - any canonical association field present (currently associated) + +Detection: + - resource_id present + - association_id, instance_id, network_interface_id, private_ip_address all absent + +Key rules: + - This is a review-candidate rule, not a delete-safe rule. + - No temporal threshold — current unattached state is the sole eligibility signal. + - Do not use AllocationTime (undocumented field). + - All four canonical association fields must be checked, not only AssociationId. + - Missing/non-iterable Addresses response fails the rule. + - Do not hardcode a fixed monthly cost estimate. 
+ +Blind spots: + - future planned attachment or operational reserve intent not known + - DNS / allowlist / manual failover dependencies + - application-level use of the reserved public IP + - service-managed lifecycle expectations outside current association state + +APIs: + - ec2:DescribeAddresses +``` + +--- + +## 17. Implementation Constants + +No rule-level numeric constants required for baseline eligibility. diff --git a/docs/specs/aws/elb_idle.md b/docs/specs/aws/elb_idle.md new file mode 100644 index 0000000..0510581 --- /dev/null +++ b/docs/specs/aws/elb_idle.md @@ -0,0 +1,364 @@ +# aws.elbv2.alb.idle / aws.elbv2.nlb.idle / aws.elb.clb.idle — Canonical Rule Specification + +## 1. Intent + +Detect ALB, NLB, and CLB load balancers that are at least `idle_days_threshold` days old +and show no trusted CloudWatch evidence of client traffic during the full lookback window, +so they can be reviewed as potential cleanup candidates. + +This is a **read-only review-candidate rule family**. It is not a delete-safe rule family. + +--- + +## 2. AWS API Grounding + +Based on official ELB / ELBv2 API and CloudWatch documentation. + +### Key facts + +1. ELBv2 `DescribeLoadBalancers` returns `LoadBalancerArn`, `LoadBalancerName`, `CreatedTime`, + `Scheme`, `VpcId`, `State`, and `Type` (`application`, `network`, `gateway`). +2. Classic ELB `DescribeLoadBalancers` returns `LoadBalancerName`, `CreatedTime`, `Scheme`, + `VPCId`, `DNSName`, and `Instances`. +3. ALB and CLB metrics are published only when requests flow; missing datapoints may be treated + as zero for ALB and CLB metrics. +4. NLB metrics `NewFlowCount`, `ProcessedBytes`, and `ActiveFlowCount` are documented as always + reported. Missing datapoints for NLB metrics must be treated as incomplete / untrusted — + not as zero. +5. ALB metrics are published under `AWS/ApplicationELB` using `LoadBalancer` dimension. +6. NLB metrics are published under `AWS/NetworkELB` using `LoadBalancer` dimension. +7. 
CLB metrics are published under `AWS/ELB` using `LoadBalancerName` dimension. +8. The ELBv2 CloudWatch dimension value is the ARN suffix strictly after `loadbalancer/`. +9. Gateway Load Balancers (`Type == "gateway"`) are out of scope. +10. `CreatedTime` is a documented field and may be used for age calculation. + +--- + +## 3. Scope and Terminology + +- ALB: ELBv2 `Type == "application"` +- NLB: ELBv2 `Type == "network"` +- CLB: Classic Load Balancer returned by the classic ELB API +- Gateway LBs (`Type == "gateway"`) must be skipped +- "idle over N days" means no trusted CloudWatch client-traffic signal over the full configured window + +--- + +## 4. API and IAM Contract + +**Required:** +- `elbv2:DescribeLoadBalancers` — failure → FAIL RULE for ELBv2 branch +- `elb:DescribeLoadBalancers` — failure → FAIL RULE for CLB branch +- `cloudwatch:GetMetricStatistics` — failure → FAIL RULE for the affected item's branch + +**Contextual (enrichment only; failure does not fail rule):** +- `elbv2:DescribeTargetGroups` +- `elbv2:DescribeTargetHealth` + +**Pagination:** ELBv2 and CLB pagination must be fully exhausted. + +--- + +## 5. Normalization Contract + +All rule logic must operate on normalized fields only. No raw AWS field access after +normalization. 
+ +### ELBv2 Normalized Fields + +| Field | Derivation | +|---|---| +| `resource_id` | `LoadBalancerArn` → absent (skip) | +| `lb_family` | `"alb"` when `Type == "application"`, `"nlb"` when `Type == "network"`, `"unsupported"` otherwise | +| `load_balancer_name` | `LoadBalancerName` → null | +| `load_balancer_arn` | `LoadBalancerArn` → null | +| `created_time` | `CreatedTime` (timezone-aware UTC) → absent (skip) | +| `age_days` | `floor((now_utc - created_time_utc) / 86400)` | +| `scheme` | `Scheme` → null | +| `dns_name` | `DNSName` → null | +| `vpc_id` | `VpcId` → null | +| `state_code` | `State.Code` → null | + +### CLB Normalized Fields + +| Field | Derivation | +|---|---| +| `resource_id` | `LoadBalancerName` → absent (skip) | +| `lb_family` | Always `"clb"` | +| `load_balancer_name` | `LoadBalancerName` → null | +| `load_balancer_arn` | Always null | +| `created_time` | `CreatedTime` (timezone-aware UTC) → absent (skip) | +| `age_days` | `floor((now_utc - created_time_utc) / 86400)` | +| `scheme` | `Scheme` → null | +| `dns_name` | `DNSName` → null | +| `vpc_id` | `VPCId` → null | +| `state_code` | Always null | + +### Backend Context Fields (contextual only; never affect eligibility) + +ALB/NLB: `target_group_count`, `registered_target_count`, `has_registered_targets` +CLB: `registered_instance_count`, `has_registered_instances` + +String fields must be normalized from non-empty strings only. + +--- + +## 6. 
Trusted Traffic-Signal Contract
+
+### 6.1 ALB Traffic Contract
+
+Traffic present if any of:
+- `RequestCount` `Sum > 0`
+- `ProcessedBytes` `Sum > 0`
+- `ActiveConnectionCount` `Sum > 0`
+
+Namespace: `AWS/ApplicationELB`, dimension `LoadBalancer = <arn-suffix-after-loadbalancer/>`
+
+### 6.2 NLB Traffic Contract
+
+Traffic present if any of:
+- `NewFlowCount` `Sum > 0`
+- `ProcessedBytes` `Sum > 0`
+- `ActiveFlowCount` `Maximum > 0`
+
+Namespace: `AWS/NetworkELB`, dimension `LoadBalancer = <arn-suffix-after-loadbalancer/>`
+
+**NLB-specific:** Missing datapoints for any of these three metrics over the full lookback
+window must be treated as incomplete / untrusted → FAIL RULE.
+
+### 6.3 CLB Traffic Contract
+
+Traffic present if any of:
+- `RequestCount` `Sum > 0`
+- `EstimatedProcessedBytes` `Sum > 0`
+
+Namespace: `AWS/ELB`, dimension `LoadBalancerName = <load-balancer-name>`
+
+### 6.4 Metric-Reading Rules
+
+- ALB/CLB: missing datapoints (none reported) may be treated as zero (no traffic).
+- NLB: missing datapoints over the full window must be treated as FAIL RULE.
+- Any required metric read failure (non-permission API error) → FAIL RULE.
+- Metric evaluation is deterministic: positive signal → traffic present; all-zero with
+  complete coverage → zero-traffic candidate; NLB incomplete → FAIL RULE.
+
+---
+
+## 7. Backend Registration Context Contract
+
+Backend registration is contextual only. Zero registered targets/instances increases
+confidence but does not independently qualify a load balancer as idle.
+
+ALBs can be useful with rules performing redirects or fixed responses;
+"no registered targets" must never be treated as equivalent to "unused."
+
+---
+
+## 8. Evaluation Order (Mandatory)
+
+**ELBv2 branch:**
+1. Retrieve and fully paginate ELBv2 load balancers; fail ELBv2 branch on error.
+2. Normalize each ELBv2 item.
+3. Skip items with `lb_family == "unsupported"`.
+4. Skip items without stable identity or without usable `created_time`.
+5. Skip items where `age_days < idle_days_threshold`.
+6. 
Skip items where `state_code` is not `"active"` or `"active_impaired"`. +7. Retrieve CloudWatch traffic signals; fail rule on error. +8. Skip items with trusted traffic present. +9. Enrich with target-group/target-health context (best-effort; failure degrades context, not rule). +10. Emit findings. + +**CLB branch:** +11. Retrieve and fully paginate CLB inventory; fail CLB branch on error. +12. Normalize each CLB item. +13. Skip items without stable identity or without usable `created_time`. +14. Skip items where `age_days < idle_days_threshold`. +15. Retrieve CloudWatch traffic signals; fail rule on error. +16. Skip items with trusted traffic present. +17. Enrich with registered-instance context from normalized item. +18. Emit findings. + +No raw AWS field access after normalization. + +--- + +## 9. Exclusion Rules + +| Condition | Result | +|---|---| +| `resource_id` absent | **SKIP ITEM** | +| `lb_family == "unsupported"` | **SKIP ITEM** | +| `created_time` absent or not safely comparable | **SKIP ITEM** | +| `age_days < idle_days_threshold` | **SKIP ITEM** | +| ELBv2 `state_code` not `"active"` or `"active_impaired"` | **SKIP ITEM** | +| Trusted traffic signal present | **SKIP ITEM** | +| ELBv2 dimension unparsable from ARN | **SKIP ITEM** | + +No exclusion for: registered targets present, zero registered targets, scheme, VPC presence, tags. + +--- + +## 10. Failure Model + +- `elbv2:DescribeLoadBalancers` error → **FAIL RULE** (ELBv2 branch) +- `elb:DescribeLoadBalancers` error → **FAIL RULE** (CLB branch) +- CloudWatch metric error for any evaluated item → **FAIL RULE** +- NLB metric with no datapoints over full window → **FAIL RULE** +- Target-group / target-health enrichment failure → degrade context only (not FAIL RULE) + +--- + +## 11. 
Evidence and Cost Contract + +### 11.1 Required Evidence/Details Fields + +Every emitted finding must include: +- `evaluation_path = "idle-load-balancer-review-candidate"` +- `lb_family` +- `resource_id` +- `load_balancer_name` +- `load_balancer_arn` +- `scheme` +- `dns_name` +- `vpc_id` +- `created_time` +- `age_days` +- `idle_days_threshold` +- `traffic_window_days` +- `traffic_signals_checked` +- `traffic_detected = false` + +Family-specific: +- ALB/NLB: `state_code`, `has_registered_targets`, `registered_target_count`, `target_group_count` +- CLB: `has_registered_instances`, `registered_instance_count` + +### 11.2 Cost Estimation Boundary + +- `estimated_monthly_cost_usd = null` +- Do not hardcode static cost guesses such as `~$16-22/month`. + +--- + +## 12. Confidence Model + +| Condition | Confidence | +|---|---| +| Zero traffic AND no registered targets/instances | `HIGH` | +| Zero traffic AND registered targets/instances still present | `MEDIUM` | + +No LOW-confidence finding may be emitted. Metric failure = FAIL RULE, not LOW finding. + +--- + +## 13. Title and Reason Contract + +| Condition | Title | Reason | +|---|---|---| +| ALB finding | `"Idle ALB review candidate"` | `"ALB has no trusted CloudWatch traffic signal in the last {N} days"` | +| NLB finding | `"Idle NLB review candidate"` | `"NLB has no trusted CloudWatch traffic signal in the last {N} days"` | +| CLB finding | `"Idle CLB review candidate"` | `"CLB has no trusted CloudWatch traffic signal in the last {N} days"` | + +Do NOT claim the load balancer is safe to delete. + +--- + +## 14. Risk Model + +| Condition | Risk | +|---|---| +| Finding emitted | `MEDIUM` | + +--- + +## 15. Acceptance Scenarios + +### Must emit + +1. ALB older than threshold, `state_code == "active"`, no ALB traffic over full window, zero targets → EMIT, HIGH +2. NLB older than threshold, `state_code == "active_impaired"`, zero NLB traffic with valid datapoints, registered targets → EMIT, MEDIUM +3. 
CLB older than threshold, zero CLB traffic, no instances → EMIT, HIGH + +### Must skip + +4. ELBv2 `Type == "gateway"` → SKIP +5. Load balancer younger than threshold → SKIP +6. ALB/NLB with any metric > 0 → SKIP +7. CLB with any metric > 0 → SKIP +8. ELBv2 in `"provisioning"` or `"failed"` state → SKIP +9. ELBv2 with ARN from which CloudWatch dimension cannot be extracted → SKIP + +### Must fail + +10. CloudWatch metric read failure for evaluated item → FAIL RULE +11. Inventory pagination failure → FAIL RULE +12. NLB metric missing datapoints over full window → FAIL RULE + +### Must NOT happen + +1. LOW-confidence finding emitted +2. Metric failure → LOW finding +3. Gateway LB evaluated +4. `estimated_monthly_cost_usd` set to a non-null value +5. `has_traffic=True, fetch_failed=True` producing any finding + +--- + +## 16. In-File Contract + +``` +Rule: aws.elbv2.alb.idle +Rule: aws.elbv2.nlb.idle +Rule: aws.elb.clb.idle + + (spec — docs/specs/aws/elb_idle.md) + +Intent: + Detect ALB, NLB, and CLB load balancers that are at least + idle_days_threshold days old and show no trusted CloudWatch evidence of + client traffic during the full lookback window, so they can be reviewed + as potential cleanup candidates. 
+ +Exclusions: + - resource_id absent (malformed identity) + - lb_family == "unsupported" (gateway LB or unknown type) + - created_time absent or not safely comparable + - age_days < idle_days_threshold (too new to evaluate) + - ELBv2 state_code not "active" or "active_impaired" + - trusted traffic present (any CloudWatch signal > 0) + - ELBv2 ARN dimension unparsable + +Detection: + - resource_id present, lb_family in {"alb","nlb","clb"} + - age_days >= idle_days_threshold + - ELBv2: state_code "active" or "active_impaired" + - all traffic signals absent during full lookback window + +Key rules: + - ALB: RequestCount Sum>0, ProcessedBytes Sum>0, or ActiveConnectionCount Sum>0 + - NLB: NewFlowCount Sum>0, ProcessedBytes Sum>0, or ActiveFlowCount Maximum>0 + - NLB: missing datapoints over full window = FAIL RULE (not zero) + - CLB: RequestCount Sum>0 or EstimatedProcessedBytes Sum>0 + - Any metric read failure = FAIL RULE; no LOW-confidence path + - ELBv2 dimension strictly from ARN suffix after loadbalancer/ + - Backend registration is contextual only + - estimated_monthly_cost_usd = None + +Blind spots: + - planned future usage or blue/green staging + - seasonal traffic patterns outside the current lookback window + - DNS / allowlist / manual failover dependencies + - NLB traffic rejected by security groups (not in CloudWatch) + +APIs: + - elbv2:DescribeLoadBalancers + - elb:DescribeLoadBalancers + - cloudwatch:GetMetricStatistics + - elbv2:DescribeTargetGroups (contextual) + - elbv2:DescribeTargetHealth (contextual) +``` + +--- + +## 17. Implementation Constants + +- `_DEFAULT_IDLE_DAYS_THRESHOLD = 14` diff --git a/docs/specs/aws/eni_detached.md b/docs/specs/aws/eni_detached.md new file mode 100644 index 0000000..afb2aff --- /dev/null +++ b/docs/specs/aws/eni_detached.md @@ -0,0 +1,310 @@ +# aws.ec2.eni.detached — Canonical Rule Specification + +## 1. 
Intent + +Detect network interfaces that are currently not attached according to the EC2 +`DescribeNetworkInterfaces` contract, so they can be reviewed as possible cleanup +candidates if no longer needed. + +This is a **read-only review-candidate rule**. It is not a delete-safe rule. + +--- + +## 2. AWS API Grounding + +Based on official EC2 API and User Guide. + +### Key facts + +1. `DescribeNetworkInterfaces` is the canonical API for enumerating ENIs in the scanned + Region/account scope; AWS strongly recommends paginated requests. +2. Top-level `Status` valid values: `available | associated | attaching | in-use | detaching`. +3. AWS explicitly states: if an ENI is not attached, `Status == "available"`. +4. `Attachment` is optional; `Attachment.Status` valid values: `attaching | attached | detaching | detached`. +5. The documented shape does **not** include `CreateTime`, `DetachTime`, or any + `detached_since` / `allocated_since` timestamp. +6. Requester-managed ENIs are created by AWS services on your behalf; if a service + detached an ENI but did not delete it, you can delete the detached ENI. + +### Rule-design consequences + +- Current not-attached state must be determined from documented current-state fields only. +- No temporal inference (age, detach duration) may be used. +- Top-level `Status` is the canonical state authority. +- `requesterManaged`, `operator.managed`, `interfaceType`, and `description` are contextual + only — not eligibility gates. + +--- + +## 3. Scope + +- "Not currently attached" means `Status == "available"` per the documented EC2 contract. +- The rule is evaluated independently per Region. + +--- + +## 4. API and IAM Contract + +**Required:** `ec2:DescribeNetworkInterfaces` — failure → FAIL RULE + +**Pagination:** Must be fully exhausted; no early exit. + +--- + +## 5. Normalization Contract + +All rule logic must operate on normalized fields only. No raw AWS field access after +normalization. 
+ +### Identity fields + +| Field | Derivation | +|---|---| +| `resource_id` | `NetworkInterfaceId` → absent (skip) | +| `network_interface_id` | `NetworkInterfaceId` → absent (skip) | + +### State fields + +| Field | Derivation | +|---|---| +| `normalized_status` | `Status` → absent | + +### Attachment fields + +| Field | Derivation | +|---|---| +| `attachment_status` | `Attachment.Status` → null | +| `attachment_id` | `Attachment.AttachmentId` → null | +| `attachment_instance_id` | `Attachment.InstanceId` → null | +| `attachment_instance_owner_id` | `Attachment.InstanceOwnerId` → null | + +### Ownership / service-context fields + +| Field | Derivation | +|---|---| +| `interface_type` | `InterfaceType` → null | +| `requester_managed` | `RequesterManaged` (bool only) → null | +| `operator_managed` | `Operator.Managed` (bool only) → null | +| `operator_principal` | `Operator.Principal` → null | + +### Network / resource-metadata fields + +| Field | Derivation | +|---|---| +| `description` | `Description` → null | +| `availability_zone` | `AvailabilityZone` → null | +| `subnet_id` | `SubnetId` → null | +| `vpc_id` | `VpcId` → null | +| `private_ip_address` | `PrivateIpAddress` → null | +| `public_ip` | `Association.PublicIp` → null | +| `tag_set` | `TagSet` → `[]` | + +Normalization requirements: +- String fields: normalized only from non-empty strings. +- Boolean fields: normalized only from actual `bool` values. +- Malformed contextual fields must not produce positive eligibility evidence. + +--- + +## 6. Current Attachment-State Determination + +Top-level `normalized_status` is the **sole** state authority. 
+ +| `normalized_status` | Eligibility | +|---|---| +| `"available"` | **ELIGIBLE** (not currently attached) | +| `"in-use"` | SKIP | +| `"attaching"` | SKIP | +| `"detaching"` | SKIP | +| `"associated"` | SKIP | + +**Attachment consistency check:** +- If `normalized_status == "available"` and `attachment_status` is `"attached"`, + `"attaching"`, or `"detaching"` → structural inconsistency → **SKIP ITEM**. +- `attachment_status` is validation only; it does not override `normalized_status`. + +--- + +## 7. Service-Managed / Requester-Managed Handling + +`requester_managed`, `operator_managed`, and `interface_type` are contextual only. +None of them exclude an ENI from evaluation. AWS documents that if a service detached +an ENI and did not delete it, the ENI is a valid deletion candidate. + +--- + +## 8. Evaluation Order (Mandatory) + +1. Retrieve and fully paginate `DescribeNetworkInterfaces`; fail rule on error. +2. Normalize each ENI item; skip non-dict or identity-absent items. +3. Skip items with absent `normalized_status`. +4. Skip items where `normalized_status != "available"`. +5. Skip items where `attachment_status` conflicts with the available state. +6. Emit findings for remaining items. + +No raw AWS field access after Step 2. + +--- + +## 9. Exclusion Rules + +| Condition | Result | +|---|---| +| `network_interface_id` absent | **SKIP ITEM** | +| `normalized_status` absent | **SKIP ITEM** | +| `normalized_status != "available"` | **SKIP ITEM** | +| `normalized_status == "available"` and `attachment_status` in `{"attached","attaching","detaching"}` | **SKIP ITEM** | + +No exclusion for: `requester_managed`, `operator_managed`, `interface_type`, tags, description. + +--- + +## 10. Failure Model + +- `DescribeNetworkInterfaces` request/pagination error → **FAIL RULE** +- Non-dict ENI item → SKIP ITEM (not FAIL RULE) +- Missing identity → SKIP ITEM (not FAIL RULE) + +--- + +## 11. 
Evidence and Cost Contract + +### 11.1 Required Evidence/Details Fields + +| Field | Requirement | +|---|---| +| `evaluation_path` | Exactly `"detached-eni-review-candidate"` | +| `network_interface_id` | Always present | +| `normalized_status` | Always `"available"` | +| `attachment_status` | Present or null | +| `interface_type` | Present or null | +| `requester_managed` | Present or null | +| `operator_managed` | Present or null | +| `operator_principal` | Present or null | +| `availability_zone` | Present or null | +| `subnet_id` | Present or null | +| `vpc_id` | Present or null | +| `private_ip_address` | Present or null | +| `public_ip` | Present or null | + +Optional: `attachment_id`, `attachment_instance_id`, `attachment_instance_owner_id`, +`description`, `tag_set`. + +### 11.2 Cost Estimation Boundary + +- `estimated_monthly_cost_usd = null` +- Do not hardcode a generic detached-ENI monthly cost estimate. + +--- + +## 12. Confidence Model + +| Condition | Confidence | +|---|---| +| `normalized_status == "available"` and no structural conflict | `HIGH` | + +High confidence refers to current not-attached state, not delete safety. + +--- + +## 13. Title and Reason Contract + +| Field | Value | +|---|---| +| `title` | `"Detached ENI review candidate"` | +| `reason` | `"ENI Status is 'available' — not currently attached per DescribeNetworkInterfaces"` | + +Do NOT claim the ENI is safe to delete. + +--- + +## 14. Risk Model + +| Condition | Risk | +|---|---| +| Finding emitted | `LOW` | + +--- + +## 15. Acceptance Scenarios + +### Must emit + +1. ENI with `Status == "available"`, no attachment object → EMIT HIGH +2. ENI with `Status == "available"`, `Attachment.Status == "detached"` → EMIT HIGH +3. Requester-managed ENI with `Status == "available"` → EMIT (include context) +4. Operator-managed ENI with `Status == "available"` → EMIT (include context) +5. Any `interface_type` value, `Status == "available"` → EMIT (no type exclusion) + +### Must skip + +6. 
ENI with `Status == "in-use"` → SKIP +7. ENI with `Status == "attaching"`, `"detaching"`, or `"associated"` → SKIP +8. ENI with `Status == "available"` and `Attachment.Status == "attached"` → SKIP +9. ENI missing `NetworkInterfaceId` → SKIP +10. ENI missing `Status` → SKIP + +### Must fail + +11. `DescribeNetworkInterfaces` request/pagination failure → FAIL RULE + +### Must NOT happen + +1. Temporal threshold applied to eligibility +2. `CreateTime` or age used for any eligibility or evidence logic +3. `interface_type` used as an exclusion +4. `requester_managed == true` used as an exclusion +5. MEDIUM or LOW confidence for a valid not-attached ENI +6. Hardcoded cost estimate in `estimated_monthly_cost_usd` + +--- + +## 16. In-File Contract + +``` +Rule: aws.ec2.eni.detached + + (spec — docs/specs/aws/eni_detached.md) + +Intent: + Detect network interfaces that are currently not attached according to the + EC2 DescribeNetworkInterfaces contract, so they can be reviewed as possible + cleanup candidates if no longer needed. + +Exclusions: + - network_interface_id absent (malformed identity) + - normalized_status absent (missing current-state signal) + - normalized_status != "available" (attached or other non-eligible state) + - structural inconsistency: normalized_status == "available" but + attachment_status in {"attached","attaching","detaching"} + +Detection: + - network_interface_id present + - normalized_status == "available" + - attachment_status absent, null, or "detached" + +Key rules: + - Top-level Status is the sole state authority; attachment_status is validation only. + - No temporal threshold — current not-attached state is the sole eligibility signal. + - No exclusion for interface_type, requester_managed, or operator_managed. + - Do not use CreateTime or any age/duration field for eligibility. + - estimated_monthly_cost_usd = None. + - Confidence: HIGH. + - Risk: LOW. 
+ +Blind spots: + - how long the ENI has been in a not-currently-attached state + - previous attachment history + - whether an AWS service expects to recycle or clean up this ENI + - application, failover, or operational intent + - exact pricing impact + +APIs: + - ec2:DescribeNetworkInterfaces +``` + +--- + +## 17. Implementation Constants + +No rule-level numeric constants required for baseline eligibility. diff --git a/docs/specs/aws/nat_gateway_idle.md b/docs/specs/aws/nat_gateway_idle.md new file mode 100644 index 0000000..bf934fb --- /dev/null +++ b/docs/specs/aws/nat_gateway_idle.md @@ -0,0 +1,347 @@ +# aws.ec2.nat_gateway.idle — Canonical Rule Specification + +## 1. Intent + +Detect NAT Gateways that are currently `available`, old enough to evaluate, and show no +trusted CloudWatch traffic/activity evidence during the configured observation window, so +they can be reviewed as possible cleanup candidates. + +This is a **read-only review-candidate rule**. It is not a delete-safe rule. + +--- + +## 2. AWS API Grounding + +Based on official EC2/VPC API and CloudWatch documentation. + +### Key facts + +1. `DescribeNatGateways` is the canonical API for enumerating NAT Gateways in the scanned + Region/account scope and supports pagination. +2. `NatGateway.State` valid values: `pending | failed | available | deleting | deleted`. +3. AWS documents that `available` means the NAT Gateway is able to process traffic and that + this status remains until you delete it; it does not indicate usage. +4. `NatGateway.CreateTime` is a documented timestamp field. +5. NAT Gateways have `ConnectivityType` values `public | private`. +6. AWS documents NAT Gateway CloudWatch metrics in namespace `AWS/NATGateway` with dimension + `NatGatewayId`. +7. Required CloudWatch metrics and statistics: + - `BytesOutToDestination` → `Sum` + - `BytesInFromSource` → `Sum` + - `BytesInFromDestination` → `Sum` + - `BytesOutToSource` → `Sum` + - `ActiveConnectionCount` → `Maximum` +8. 
AWS states `ActiveConnectionCount == 0` indicates no active TCP connections. +9. AWS pricing: charged per hour available and per GB processed. No canonical fixed monthly + USD value exists in the product docs used by this rule. +10. `GetMetricStatistics` does not guarantee ordered datapoints. Missing datapoints must not + be assumed to mean zero activity. + +### Rule-design consequences + +- Only `available` NAT Gateways are eligible. +- Age thresholding is valid because `CreateTime` is documented. +- CloudWatch is the sole trusted activity source; missing datapoints → SKIP ITEM. +- Route-table references are contextual only, not eligibility gates. + +--- + +## 3. Scope + +- "idle" is a CleanCloud-derived heuristic: no trusted CloudWatch activity over the full + observation window. +- `age_days = floor((now_utc - create_time_utc) / 86400)` +- Observation window: `now_utc - idle_days_threshold * 86400` → `now_utc` +- Rule is evaluated independently per Region. + +--- + +## 4. API and IAM Contract + +**Required:** `ec2:DescribeNatGateways` — failure → FAIL RULE +**Required:** `cloudwatch:GetMetricStatistics` — failure → FAIL RULE +**Optional:** `ec2:DescribeRouteTables` — failure → degrade context, do not fail rule + +**Pagination:** `DescribeNatGateways` must be fully exhausted; no early exit. + +--- + +## 5. Normalization Contract + +All rule logic must operate on normalized fields only. No raw AWS field access after +normalization. 
+ +### Identity fields + +| Field | Derivation | +|---|---| +| `resource_id` | `NatGatewayId` → absent (skip) | +| `nat_gateway_id` | `NatGatewayId` → absent (skip) | + +### State / age fields + +| Field | Derivation | +|---|---| +| `normalized_state` | `State` → absent (skip) | +| `create_time_utc` | `CreateTime` (timezone-aware UTC only) → absent (skip) | +| `age_days` | `floor((now_utc - create_time_utc) / 86400)` if valid and not future → absent (skip) | + +### Core context fields + +| Field | Derivation | +|---|---| +| `connectivity_type` | `ConnectivityType` → null | +| `availability_mode` | `AvailabilityMode` → null | +| `vpc_id` | `VpcId` → null | +| `subnet_id` | `SubnetId` → null | +| `nat_gateway_addresses` | `NatGatewayAddresses` → `[]` | +| `attached_appliances` | `AttachedAppliances` → `[]` | +| `auto_scaling_ips` | `AutoScalingIps` → null | +| `auto_provision_zones` | `AutoProvisionZones` → null | +| `tag_set` | `Tags` → `[]` | + +Normalization requirements: +- String fields: normalized only from non-empty strings. +- Timestamp: timezone-aware UTC only; naive datetime → absent (skip). +- Future `CreateTime` → absent (skip). +- Malformed contextual fields must not produce positive idle evidence. + +--- + +## 6. CloudWatch Traffic Contract + +### 6.1 Required Metrics + +| Metric | Statistic | Activity if | +|---|---|---| +| `BytesOutToDestination` | `Sum` | `Sum > 0` | +| `BytesInFromSource` | `Sum` | `Sum > 0` | +| `BytesInFromDestination` | `Sum` | `Sum > 0` | +| `BytesOutToSource` | `Sum` | `Sum > 0` | +| `ActiveConnectionCount` | `Maximum` | `Maximum > 0` | + +Namespace: `AWS/NATGateway`, dimension `NatGatewayId = ` + +### 6.2 Datapoint Completeness + +- Missing datapoints for any required metric must not be treated as zero. +- If any required metric returns no datapoints → **SKIP ITEM** (insufficient evidence). +- If any required metric request fails → **FAIL RULE**. 
+ +### 6.3 Period Selection + +Period must be chosen deterministically from the configured lookback age: + +| Window age | Period requirement | +|---|---| +| < 15 days | Multiple of 60 seconds | +| 15–63 days | Multiple of 300 seconds | +| > 63 days | Multiple of 3600 seconds | + +Using `idle_days_threshold * 86400` as the Period satisfies all three constraints (86400 is a +multiple of 60, 300, and 3600) and produces a single full-window aggregate bucket. + +--- + +## 7. Route-Table Handling + +Route-table references are contextual only. + +- A route targeting `nat-gateway-id` may be surfaced as evidence. +- Route-table presence must not suppress an otherwise valid idle finding. +- Route-table absence must not compensate for missing or incomplete CloudWatch evidence. +- `DescribeRouteTables` failure → degrade context, do not fail rule. + +--- + +## 8. Evaluation Order (Mandatory) + +1. Retrieve and fully paginate `DescribeNatGateways`; fail rule on error. +2. Normalize each item. +3. Skip items with absent identity, state, `create_time_utc`, or `age_days`. +4. Skip items where `normalized_state != "available"`. +5. Skip items where `age_days < idle_days_threshold`. +6. Retrieve required CloudWatch metrics; fail rule on API error. +7. Skip items where any required metric returns no datapoints. +8. Skip items where any metric shows activity (`> 0`). +9. Retrieve route-table context (best-effort). +10. Emit findings. + +No raw AWS field access after Step 2. + +--- + +## 9. 
Exclusion Rules + +| Condition | Result | +|---|---| +| `nat_gateway_id` absent | **SKIP ITEM** | +| `normalized_state` absent | **SKIP ITEM** | +| `normalized_state != "available"` | **SKIP ITEM** | +| `create_time_utc` absent / naive / future | **SKIP ITEM** | +| `age_days < idle_days_threshold` | **SKIP ITEM** | +| Any required metric has no datapoints | **SKIP ITEM** | +| Any required metric shows activity | **SKIP ITEM** | + +No exclusion for: `connectivity_type`, `availability_mode`, tags, route-table presence. + +--- + +## 10. Failure Model + +- `DescribeNatGateways` error → **FAIL RULE** +- CloudWatch metric API error → **FAIL RULE** +- `DescribeRouteTables` error → degrade context only + +--- + +## 11. Evidence and Cost Contract + +### 11.1 Required Evidence/Details Fields + +| Field | Requirement | +|---|---| +| `evaluation_path` | `"idle-nat-gateway-review-candidate"` | +| `nat_gateway_id` | Always present | +| `normalized_state` | Always `"available"` | +| `create_time` | ISO 8601 UTC string | +| `age_days` | Integer | +| `idle_days_threshold` | Integer | +| `connectivity_type` | Present or null | +| `availability_mode` | Present or null | +| `vpc_id` | Present or null | +| `subnet_id` | Present or null | +| `bytes_out_to_destination` | Numeric (0.0 if metric zero) | +| `bytes_in_from_source` | Numeric | +| `bytes_in_from_destination` | Numeric | +| `bytes_out_to_source` | Numeric | +| `active_connection_count_max` | Numeric | + +Optional: `nat_gateway_addresses`, `attached_appliances`, `route_table_referenced`, +`auto_scaling_ips`, `auto_provision_zones`, `tag_set`. + +### 11.2 Cost Estimation Boundary + +- `estimated_monthly_cost_usd = null` +- Do not hardcode a fixed NAT Gateway monthly cost estimate. + +--- + +## 12. 
Confidence Model + +| Condition | Confidence | +|---|---| +| Zero traffic AND route-table confirms no reference | `HIGH` | +| Zero traffic AND route-table referenced OR unavailable | `MEDIUM` | + +No LOW-confidence finding may be emitted. Metric failure = FAIL RULE. + +--- + +## 13. Title and Reason Contract + +| Field | Value | +|---|---| +| `title` | `"Idle NAT Gateway review candidate"` | +| `reason` | `"NAT Gateway has no trusted CloudWatch traffic signal in the last {N} days"` | + +Do NOT claim the NAT Gateway is safe to delete. + +--- + +## 14. Risk Model + +| Condition | Risk | +|---|---| +| Finding emitted | `MEDIUM` | + +--- + +## 15. Acceptance Scenarios + +### Must emit + +1. Available, old enough, all metrics zero, no route-table reference → EMIT HIGH +2. Available, old enough, all metrics zero, route-table still references → EMIT MEDIUM +3. Available, old enough, all metrics zero, route-table lookup failed → EMIT MEDIUM + +### Must skip + +4. State `pending`, `failed`, `deleting`, or `deleted` → SKIP +5. Available but younger than threshold → SKIP +6. Any byte metric `Sum > 0` → SKIP +7. `ActiveConnectionCount Maximum > 0` → SKIP +8. Absent/naive/future `CreateTime` → SKIP +9. Any required metric returns no datapoints → SKIP + +### Must fail + +10. `DescribeNatGateways` failure → FAIL RULE +11. CloudWatch metric fetch failure → FAIL RULE + +### Must NOT happen + +1. LOW-confidence finding emitted +2. CloudWatch metric failure → LOW-confidence finding +3. Missing datapoints treated as zero activity +4. `estimated_monthly_cost_usd` set to non-null +5. Route-table absence used as traffic evidence substitute + +--- + +## 16. 
In-File Contract + +``` +Rule: aws.ec2.nat_gateway.idle + + (spec — docs/specs/aws/nat_gateway_idle.md) + +Intent: + Detect NAT Gateways that are currently available, old enough to evaluate, + and show no trusted CloudWatch traffic/activity evidence during the + configured observation window, so they can be reviewed as possible cleanup + candidates. + +Exclusions: + - nat_gateway_id absent (malformed identity) + - normalized_state absent (missing current-state signal) + - normalized_state != "available" + - create_time_utc absent, naive, or in the future + - age_days < idle_days_threshold (too new to evaluate) + - any required CloudWatch metric has no datapoints (insufficient evidence) + - any required metric shows activity > 0 + +Detection: + - nat_gateway_id present, normalized_state == "available" + - age_days >= idle_days_threshold + - all 5 required CloudWatch metrics return datapoints and are all zero + +Key rules: + - Missing CloudWatch datapoints → SKIP ITEM (not zero). + - CloudWatch API failure → FAIL RULE (not LOW-confidence finding). + - 5 required metrics: BytesOutToDestination, BytesInFromSource, + BytesInFromDestination, BytesOutToSource (Sum), ActiveConnectionCount (Maximum). + - Route-table context is contextual only; absence does not substitute + for CloudWatch evidence. + - Naive CreateTime → SKIP ITEM. + - estimated_monthly_cost_usd = None. + - Confidence: HIGH (no route ref) or MEDIUM (route ref or unavailable). + - Risk: MEDIUM. + +Blind spots: + - planned future usage or DR/failover intent + - seasonal or cyclical usage outside the observation window + - organizational ownership or business intent + - exact region-specific pricing impact + +APIs: + - ec2:DescribeNatGateways + - cloudwatch:GetMetricStatistics + - ec2:DescribeRouteTables (contextual) +``` + +--- + +## 17. 
Implementation Constants + +- `_DEFAULT_IDLE_DAYS_THRESHOLD = 14` diff --git a/docs/specs/aws/rds_idle.md b/docs/specs/aws/rds_idle.md new file mode 100644 index 0000000..c0ecdf3 --- /dev/null +++ b/docs/specs/aws/rds_idle.md @@ -0,0 +1,313 @@ +# aws.rds.instance.idle — Canonical Rule Specification + +## 1. Intent + +Detect provisioned standalone DB instances that are currently `available`, old enough to +evaluate, and show no trusted CloudWatch client-connection activity for the configured +observation window, so they can be reviewed as possible cleanup candidates. + +This is a **CleanCloud-derived review heuristic**, not an AWS-native DB instance state. +It is a **read-only review-candidate rule** — not a delete-safe rule. + +--- + +## 2. AWS API Grounding + +Based on official RDS, CloudWatch, and pricing documentation. + +### Key facts + +1. `DescribeDBInstances` is the canonical API for enumerating provisioned DB instances in + the scanned Region/account scope and supports pagination. +2. AWS explicitly notes that `DescribeDBInstances` can also return Amazon Neptune and Amazon + DocumentDB DB instances. +3. `DBInstance.InstanceCreateTime` is a documented timestamp field. +4. `DBInstance.DBInstanceStatus` is a documented state field with many values including + `available`, `creating`, `starting`, `stopped`, `stopping`, `backing-up`, `modifying`. +5. The RDS status guide states that `available` DB instances are billed. +6. `DBInstance.ReadReplicaSourceDBInstanceIdentifier`, + `DBInstance.ReadReplicaSourceDBClusterIdentifier`, and `DBInstance.DBClusterIdentifier` + are documented scope fields. +7. RDS publishes instance-level metrics in CloudWatch namespace `AWS/RDS`. +8. `DatabaseConnections` is the number of client network connections to the DB instance. +9. 
AWS explicitly states that `DatabaseConnections` does **not** include: + - sessions that no longer have a network connection but which the database hasn't cleaned up + - sessions created by the database engine for its own purposes + - sessions created by the database engine's parallel execution capabilities + - sessions created by the database engine job scheduler + - Amazon RDS connections +10. CloudWatch `GetMetricStatistics` uses inclusive `StartTime`, exclusive `EndTime`, rounds + `StartTime` based on lookback age, does not guarantee datapoint order, and imposes + retention / `Period` constraints. +11. AWS pricing docs state that billing for DB instance hours starts when a DB instance + becomes available and continues while it is running in an available state. +12. Fixed monthly USD cost estimates are not canonical from AWS docs. + +### Implications + +- Only `available` DB instances are eligible. +- Age thresholding is supportable because `InstanceCreateTime` is documented. +- `DatabaseConnections` Maximum is the sole required activity metric for this rule. +- `DatabaseConnections == 0` does not prove total absence of all engine activity; it only + proves absence of observed client network connections in the metric contract. +- Connection pooling and proxy layers (RDS Proxy, PgBouncer, application connection pools) + can reduce the reliability of instance-level observed client connection counts. +- `estimated_monthly_cost_usd = null`. + +--- + +## 3. Scope and Terminology + +- **"DB instance"** — an item returned by `DescribeDBInstances`. +- **"standalone"** — not a read replica of another DB instance, not a read replica of a + DB cluster, and not a member of a DB cluster. +- **"idle"** — no observed client connection activity via trusted CloudWatch + `DatabaseConnections` metric evidence for the full configured observation window. +- `idle_days_threshold` — operator-configurable, default 14. 
+- `observation_window_start_utc = now_utc − idle_days_threshold × 86400 seconds` +- `observation_window_end_utc = now_utc` +- `age_days = floor((now_utc − instance_create_time_utc) / 86400 seconds)` +- The rule is evaluated independently per Region. + +**Scope boundary:** standalone provisioned DB instances only. Read replicas and cluster +members are out of scope. + +--- + +## 4. Canonical Rule Statement + +A DB instance is eligible only when **all** of the following are true: + +- Stable DB instance identity exists +- `DBInstanceStatus == "available"` +- The instance is standalone +- `age_days >= idle_days_threshold` +- All `DatabaseConnections Maximum` datapoints in the observation window are exactly zero + +No additional predicate may be required for baseline eligibility, including: +CPU utilisation thresholds, storage I/O thresholds, engine family, instance class, +Multi-AZ setting, allocated storage size, or tag presence/absence. + +--- + +## 5. Normalization Contract + +All rule logic must operate on normalized fields only. 
+ +| Canonical field | Source field | Absent / invalid | +|---|---|---| +| `resource_id` | `DBInstanceIdentifier` | skip item | +| `db_instance_id` | `DBInstanceIdentifier` | skip item | +| `normalized_status` | `DBInstanceStatus` | skip item | +| `instance_create_time_utc` | `InstanceCreateTime` (tz-aware UTC) | skip item | +| `age_days` | floor((now − create_time) / 86400) | skip item | +| `db_cluster_identifier` | `DBClusterIdentifier` | null | +| `read_replica_source_db_instance_identifier` | `ReadReplicaSourceDBInstanceIdentifier` | null | +| `read_replica_source_db_cluster_identifier` | `ReadReplicaSourceDBClusterIdentifier` | null | +| `engine` | `Engine` | null | +| `engine_version` | `EngineVersion` | null | +| `db_instance_class` | `DBInstanceClass` | null | +| `multi_az` | `MultiAZ` (bool only) | null | +| `allocated_storage_gib` | `AllocatedStorage` (int only) | null | +| `storage_type` | `StorageType` | null | +| `dbi_resource_id` | `DbiResourceId` | null | +| `db_instance_arn` | `DBInstanceArn` | null | +| `tag_set` | `TagList` (list only) | `[]` | + +Normalization requirements: +- String-valued fields: normalize only from non-empty strings. +- Timestamp fields: must be timezone-aware UTC before use; naive → skip item. +- Future `InstanceCreateTime` → skip item. + +--- + +## 6. Idle-Activity Determination + +CloudWatch is the **sole trusted activity source** for this rule. + +**Required metric:** + +| Field | Value | +|---|---| +| Namespace | `AWS/RDS` | +| Dimension | `DBInstanceIdentifier = db_instance_id` | +| Metric | `DatabaseConnections` | +| Statistic | `Maximum` | +| Period | `idle_days_threshold × 86400` (satisfies all CloudWatch retention constraints) | + +**Interpretation:** + +- If `DatabaseConnections Maximum > 0` anywhere in the observation window → **not idle** (skip item). +- The DB instance is idle only when datapoints exist and all `Maximum` values are exactly `0`. 
+ +**Datapoint completeness:** + +- Missing datapoints **must not** be interpreted as zero connections. +- If `DatabaseConnections` returns no datapoints → **SKIP ITEM** (insufficient evidence). +- If `DatabaseConnections` retrieval fails → **FAIL RULE**. + +--- + +## 7. Pricing / Cost Boundary + +- `estimated_monthly_cost_usd = null` — no hardcoded per-engine or per-class estimates. + +--- + +## 8. Deterministic Evaluation Order + +1. Retrieve and fully paginate `DescribeDBInstances`. +2. Normalize each item. +3. For each normalized item: + - `db_instance_id` absent → **SKIP ITEM** + - `normalized_status` absent → **SKIP ITEM** + - `normalized_status != "available"` → **SKIP ITEM** + - `db_cluster_identifier` present → **SKIP ITEM** + - `read_replica_source_db_instance_identifier` present → **SKIP ITEM** + - `read_replica_source_db_cluster_identifier` present → **SKIP ITEM** + - `instance_create_time_utc` absent/invalid/future → **SKIP ITEM** + - `age_days < idle_days_threshold` → **SKIP ITEM** + - Retrieve `DatabaseConnections Maximum` + - API failure → **FAIL RULE** + - No datapoints → **SKIP ITEM** + - Any `Maximum > 0` → **SKIP ITEM** + - Otherwise → **EMIT** + +--- + +## 9. Exclusion Rules + +1. `db_instance_id` absent → malformed identity +2. `normalized_status` absent → missing state signal +3. `normalized_status != "available"` → not currently evaluable +4. `db_cluster_identifier` present → cluster member (out of scope) +5. `read_replica_source_db_instance_identifier` present → DB instance read replica +6. `read_replica_source_db_cluster_identifier` present → cross-cluster read replica +7. `instance_create_time_utc` absent/naive/future → missing/invalid age source +8. `age_days < idle_days_threshold` → too young +9. `DatabaseConnections` returns no datapoints → insufficient trusted evidence +10. Any `DatabaseConnections Maximum > 0` → observed client connections + +--- + +## 10. 
Failure Model + +**Rule-level failures (FAIL RULE):** +- `DescribeDBInstances` request/pagination failure +- `DatabaseConnections` CloudWatch retrieval failure +- Permission failure for required APIs + +**Item-level skips (SKIP ITEM):** +- Missing identity, status, or create-time +- Non-available status +- Replica / cluster-member scope exclusions +- Too young +- Insufficient CloudWatch datapoints +- Observed client connections + +--- + +## 11. Evidence / Details Contract + +### Required details fields + +``` +evaluation_path = "idle-rds-instance-review-candidate" +db_instance_id +normalized_status = "available" +instance_create_time (ISO-8601 UTC) +age_days +idle_days_threshold +engine +engine_version +db_instance_class +database_connections_max +``` + +### Optional context fields + +``` +db_cluster_identifier +read_replica_source_db_instance_identifier +read_replica_source_db_cluster_identifier +multi_az +allocated_storage_gib +storage_type +dbi_resource_id +db_instance_arn +tag_set +``` + +### Required evidence wording + +**Signals used** must state: +- DB instance Status is `available` +- The DB instance is standalone (not a read replica or cluster member) +- The DB instance age met the configured threshold +- `DatabaseConnections` Maximum was zero across the observation window +- The finding is based on a CleanCloud-derived idle heuristic over observed client network connections + +**Signals not checked** must state major blind spots: +- Sessions without network connections that the database hasn't cleaned up +- Sessions created by the database engine for its own purposes +- Sessions created by parallel execution capabilities or job schedulers +- Amazon RDS connections +- RDS Proxy, PgBouncer, and application connection pools that can hide real usage while keeping observed client connection counts low or zero +- Planned future usage or disaster recovery intent +- Exact region-specific pricing impact + +--- + +## 12. 
Confidence and Risk + +| Condition | Confidence | Risk | +|---|---|---| +| Datapoints present, all `Maximum == 0`, all gates satisfied | `MEDIUM` | `MEDIUM` | + +- **Do not** emit LOW-confidence findings when required metric data is unavailable — SKIP ITEM or FAIL RULE instead. +- `DatabaseConnections` has documented blind spots (§2 item 9), so `MEDIUM` (not `HIGH`) is the ceiling. + +--- + +## 13. Non-Goals / Blind Spots + +This rule does not prove: +- The DB instance is safe to delete +- The DB instance has no engine-internal activity +- The DB instance had no uncounted sessions +- The DB instance will not be used again +- CPU or storage I/O was zero +- Backup, snapshot, or retention needs have been evaluated + +--- + +## 14. Acceptance Scenarios + +| # | Scenario | Expected | +|---|---|---| +| 1 | Standalone `available` instance, old enough, `DatabaseConnections Maximum == 0` across all datapoints | EMIT — confidence MEDIUM | +| 2 | Instance status not `available` | SKIP ITEM | +| 3 | DB instance read replica (`ReadReplicaSourceDBInstanceIdentifier` set) | SKIP ITEM | +| 4 | Cross-cluster read replica (`ReadReplicaSourceDBClusterIdentifier` set) | SKIP ITEM | +| 5 | DB cluster member (`DBClusterIdentifier` set) | SKIP ITEM | +| 6 | Younger than `idle_days_threshold` | SKIP ITEM | +| 7 | Any `DatabaseConnections Maximum > 0` | SKIP ITEM | +| 8 | `DatabaseConnections` returns no datapoints | SKIP ITEM | +| 9 | Missing/naive/future `InstanceCreateTime` | SKIP ITEM | +| 10 | `DescribeDBInstances` fails | FAIL RULE | +| 11 | `DatabaseConnections` retrieval fails | FAIL RULE | + +--- + +## 15. Implementation Constraints + +- Use `DescribeDBInstances` as the sole required inventory source. +- Use `DatabaseConnections Maximum` as the sole required activity metric. +- Exhaust pagination; no early exit. +- Use top-level `DBInstanceStatus` as the canonical state signal. +- Use documented `InstanceCreateTime` for age gating; naive → skip. 
+- Do not interpret missing datapoints as zero connections. +- Do not emit LOW-confidence findings when required CloudWatch data is absent. +- Do not require CPU or I/O metrics for baseline eligibility. +- Do not hardcode engine/class/storage monthly cost estimates. +- `estimated_monthly_cost_usd = null`. diff --git a/pyproject.toml b/pyproject.toml index b84c282..9df6169 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "cleancloud" -version = "1.19.0" +version = "1.20.0" description = "Read-only cloud hygiene for AWS, Azure, and GCP. Multi-account org scanning, CI/CD enforcement, and deterministic cost modeling. No agents, no telemetry." readme = "README.md" requires-python = ">=3.10" diff --git a/tests/cleancloud/providers/aws/test_aws_elastic_ip_unattached.py b/tests/cleancloud/providers/aws/test_aws_elastic_ip_unattached.py index 4550763..17cbf8b 100644 --- a/tests/cleancloud/providers/aws/test_aws_elastic_ip_unattached.py +++ b/tests/cleancloud/providers/aws/test_aws_elastic_ip_unattached.py @@ -1,158 +1,487 @@ -from datetime import datetime, timedelta, timezone +import pytest +from botocore.exceptions import BotoCoreError, ClientError from cleancloud.core.confidence import ConfidenceLevel -from cleancloud.providers.aws.rules.elastic_ip_unattached import ( - find_unattached_elastic_ips, -) - - -def test_find_unattached_elastic_ips(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - old_date = datetime.now(timezone.utc) - timedelta(days=60) - recent_date = datetime.now(timezone.utc) - timedelta(days=10) - - # Mock describe_addresses (non-paginated by AWS) - ec2.describe_addresses.return_value = { - "Addresses": [ - { - "AllocationId": "eipalloc-1", - "PublicIp": "203.0.113.1", - "Domain": "vpc", - "AllocationTime": old_date, - # No AssociationId = unattached - }, - { - "AllocationId": "eipalloc-2", - "PublicIp": "203.0.113.2", - "Domain": "vpc", - 
"AllocationTime": old_date, - "AssociationId": "eipassoc-123", # attached - "InstanceId": "i-123", - }, - { - "AllocationId": "eipalloc-3", - "PublicIp": "203.0.113.3", - "Domain": "vpc", - "AllocationTime": recent_date, # too young - # No AssociationId = unattached but recent - }, - { - # No AllocationId (genuine EC2-Classic EIP — identified by PublicIp) - "PublicIp": "203.0.113.4", - "Domain": "standard", - # No AllocationTime - # No AssociationId = unattached - }, - { - "AllocationId": "eipalloc-5", - "PublicIp": "203.0.113.5", - "Domain": "vpc", - # No AllocationTime (VPC EIP missing timestamp — should be skipped) - # No AssociationId = unattached - }, - ] - } - - findings = find_unattached_elastic_ips(mock_boto3_session, region) - eip_ids = {f.resource_id for f in findings} - findings_by_id = {f.resource_id: f for f in findings} - - # Positive: old (60 days) unattached EIP - assert "eipalloc-1" in eip_ids - - # Positive: classic EIP (domain=standard) without AllocationTime (flagged conservatively) - # Uses PublicIp as resource_id since Classic EIPs have no AllocationId - assert "203.0.113.4" in eip_ids - - # Negative: attached EIP - assert "eipalloc-2" not in eip_ids - - # Negative: unattached but too young (10 days < 30 day threshold) - assert "eipalloc-3" not in eip_ids - - # Negative: VPC EIP without AllocationTime — cannot determine age, skip - assert "eipalloc-5" not in eip_ids - - assert len(findings) == 2 - - # Verify cost estimate ($3.75/month for each EIP) - for f in findings: - assert f.estimated_monthly_cost_usd == 3.75 - - # Verify title includes "(Review Recommended)" - for f in findings: - assert f.title == "Unattached Elastic IP (Review Recommended)" - - # Verify confidence is HIGH for all findings - for f in findings: - assert f.confidence == ConfidenceLevel.HIGH - - # Verify VPC EIP details and wording - f1 = findings_by_id["eipalloc-1"] - assert f1.details["is_classic"] is False - assert f1.details["age_days"] == 60 - assert "allocation_time" 
in f1.details - assert "allocated" in f1.summary and "currently unattached" in f1.summary - assert "allocated" in f1.evidence.signals_used[1] - - # Verify Classic EIP details, wording, and PublicIp fallback for resource_id - f4 = findings_by_id["203.0.113.4"] - assert f4.resource_id == "203.0.113.4" - assert f4.details["is_classic"] is True - assert "age_days" not in f4.details - assert "allocation_time" not in f4.details - assert "Classic" in f4.summary - assert any("Classic EIP" in s for s in f4.evidence.signals_used) - assert any("deprecated" in s for s in f4.evidence.signals_used) - assert f4.evidence.time_window == "Unknown (Classic EIP, no AllocationTime)" - - -def test_find_unattached_elastic_ips_custom_threshold(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - date_45_days_ago = datetime.now(timezone.utc) - timedelta(days=45) - - ec2.describe_addresses.return_value = { - "Addresses": [ - { - "AllocationId": "eipalloc-5", - "PublicIp": "203.0.113.5", - "Domain": "vpc", - "AllocationTime": date_45_days_ago, - } +from cleancloud.core.risk import RiskLevel +from cleancloud.providers.aws.rules.elastic_ip_unattached import find_unattached_elastic_ips + +# --------------------------------------------------------------------------- +# Test helpers +# --------------------------------------------------------------------------- + +_REGION = "us-east-1" + + +def _eip( + allocation_id: str | None = "eipalloc-001", + public_ip: str | None = "203.0.113.1", + domain: str | None = "vpc", + association_id: str | None = None, + instance_id: str | None = None, + network_interface_id: str | None = None, + private_ip_address: str | None = None, + carrier_ip: str | None = None, + tags: list | None = None, + **extra, +) -> dict: + raw: dict = {} + if allocation_id is not None: + raw["AllocationId"] = allocation_id + if public_ip is not None: + raw["PublicIp"] = public_ip + if domain is not None: + raw["Domain"] = domain + if association_id is not 
None: + raw["AssociationId"] = association_id + if instance_id is not None: + raw["InstanceId"] = instance_id + if network_interface_id is not None: + raw["NetworkInterfaceId"] = network_interface_id + if private_ip_address is not None: + raw["PrivateIpAddress"] = private_ip_address + if carrier_ip is not None: + raw["CarrierIp"] = carrier_ip + if tags is not None: + raw["Tags"] = tags + raw.update(extra) + return raw + + +def _run(mock_boto3_session, addresses: list) -> list: + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": addresses} + return find_unattached_elastic_ips(mock_boto3_session, _REGION) + + +# --------------------------------------------------------------------------- +# TestMustEmit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_vpc_eip_no_association_fields(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert len(findings) == 1 + f = findings[0] + assert f.resource_id == "eipalloc-001" + assert f.rule_id == "aws.ec2.elastic_ip.unattached" + assert f.provider == "aws" + assert f.resource_type == "aws.ec2.elastic_ip" + assert f.region == _REGION + + def test_standard_domain_no_association_fields(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip="203.0.113.10", domain="standard")], + ) + assert len(findings) == 1 + assert findings[0].resource_id == "203.0.113.10" + + def test_carrier_ip_only_as_resource_id(self, mock_boto3_session): + """When only CarrierIp is present, it becomes the resource_id.""" + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip=None, carrier_ip="203.0.113.20")], + ) + assert len(findings) == 1 + assert findings[0].resource_id == "203.0.113.20" + + def test_byoip_and_service_managed_contextual_only(self, mock_boto3_session): + """BYOIP / service_managed fields are contextual and must not suppress the finding.""" + findings = 
_run( + mock_boto3_session, + [ + _eip( + PublicIpv4Pool="ipv4pool-ec2-abc", + ServiceManaged=True, + CustomerOwnedIp="10.0.0.5", + CustomerOwnedIpv4Pool="coip-pool-001", + ) + ], + ) + assert len(findings) == 1 + + def test_service_managed_false_still_emits(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(ServiceManaged=False)]) + assert len(findings) == 1 + + def test_tags_present_still_emits(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(tags=[{"Key": "env", "Value": "prod"}])], + ) + assert len(findings) == 1 + + def test_multiple_unattached_all_emitted(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [ + _eip(allocation_id="eipalloc-a", public_ip="1.2.3.4"), + _eip(allocation_id="eipalloc-b", public_ip="1.2.3.5"), + ], + ) + assert {f.resource_id for f in findings} == {"eipalloc-a", "eipalloc-b"} + + def test_empty_addresses_returns_empty(self, mock_boto3_session): + findings = _run(mock_boto3_session, []) + assert findings == [] + + +# --------------------------------------------------------------------------- +# TestMustSkip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_association_id_present(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(association_id="eipassoc-123")], + ) + assert findings == [] + + def test_instance_id_present_no_association_id(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(instance_id="i-abc")]) + assert findings == [] + + def test_network_interface_id_present_no_association_id(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(network_interface_id="eni-abc")]) + assert findings == [] + + def test_private_ip_address_present_no_association_id(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(private_ip_address="10.0.0.5")]) + assert findings == [] + + def test_missing_all_identity_fields(self, 
mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip=None, carrier_ip=None)], + ) + assert findings == [] + + def test_mixed_attached_and_unattached(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [ + _eip(allocation_id="eipalloc-attached", association_id="eipassoc-x"), + _eip(allocation_id="eipalloc-free"), + ], + ) + assert len(findings) == 1 + assert findings[0].resource_id == "eipalloc-free" + + def test_non_dict_item_skipped_not_raised(self, mock_boto3_session): + """Non-dict items in Addresses must be skipped, not raise.""" + valid = _eip() + for bad in (None, "string", 42, ["list"]): + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": [bad, valid]} + findings = find_unattached_elastic_ips(mock_boto3_session, _REGION) + assert len(findings) == 1, f"Expected 1 finding with bad item={bad!r}" + + def test_all_four_association_fields_each_independently_skip(self, mock_boto3_session): + """Each of the four association fields independently triggers SKIP.""" + for field, value in [ + ("association_id", "eipassoc-1"), + ("instance_id", "i-001"), + ("network_interface_id", "eni-001"), + ("private_ip_address", "10.0.0.1"), + ]: + findings = _run(mock_boto3_session, [_eip(**{field: value})]) + assert findings == [], f"Expected SKIP when {field} is present" + + +# --------------------------------------------------------------------------- +# TestMustFailRule +# --------------------------------------------------------------------------- + + +class TestMustFailRule: + def test_describe_addresses_unauthorized(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.side_effect = ClientError( + {"Error": {"Code": "UnauthorizedOperation", "Message": "denied"}}, + "DescribeAddresses", + ) + with pytest.raises(PermissionError, match="ec2:DescribeAddresses"): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_describe_addresses_access_denied(self, 
mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "denied"}}, + "DescribeAddresses", + ) + with pytest.raises(PermissionError, match="ec2:DescribeAddresses"): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_describe_addresses_client_error_propagates(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.side_effect = ClientError( + {"Error": {"Code": "RequestExpired", "Message": "expired"}}, + "DescribeAddresses", + ) + with pytest.raises(ClientError): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_describe_addresses_botocore_error_propagates(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.side_effect = BotoCoreError() + with pytest.raises(BotoCoreError): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_addresses_key_absent_fails_rule(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.return_value = {} + with pytest.raises(RuntimeError, match="Addresses"): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_addresses_not_a_list_fails_rule(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": "bad"} + with pytest.raises(RuntimeError, match="Addresses"): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + def test_addresses_none_fails_rule(self, mock_boto3_session): + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": None} + with pytest.raises(RuntimeError, match="Addresses"): + find_unattached_elastic_ips(mock_boto3_session, _REGION) + + +# --------------------------------------------------------------------------- +# TestNormalization +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_allocation_id_is_preferred_resource_id(self, mock_boto3_session): + findings = _run( + 
mock_boto3_session, + [_eip(allocation_id="eipalloc-pref", public_ip="1.2.3.4")], + ) + assert findings[0].resource_id == "eipalloc-pref" + + def test_public_ip_fallback_when_no_allocation_id(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip="5.6.7.8")], + ) + assert findings[0].resource_id == "5.6.7.8" + + def test_carrier_ip_fallback_when_no_allocation_or_public(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip=None, carrier_ip="9.10.11.12")], + ) + assert findings[0].resource_id == "9.10.11.12" + + def test_domain_absent_is_null_in_details(self, mock_boto3_session): + raw = {"AllocationId": "eipalloc-nd", "PublicIp": "1.2.3.4"} + _run(mock_boto3_session, [raw]) + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": [raw]} + findings = find_unattached_elastic_ips(mock_boto3_session, _REGION) + assert findings[0].details["domain"] is None + + def test_empty_string_fields_treated_as_absent(self, mock_boto3_session): + """Empty string AllocationId must not be used as resource_id.""" + raw = {"AllocationId": "", "PublicIp": "1.2.3.4"} + mock_boto3_session._ec2.describe_addresses.return_value = {"Addresses": [raw]} + findings = find_unattached_elastic_ips(mock_boto3_session, _REGION) + assert findings[0].resource_id == "1.2.3.4" + + def test_optional_context_fields_captured(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [ + _eip( + NetworkBorderGroup="us-east-1-wl1-bos-wlz-1", + PublicIpv4Pool="ipv4pool-ec2-abc", + SubnetId="subnet-xyz", + NetworkInterfaceOwnerId="111122223333", + CustomerOwnedIp="10.0.0.5", + CustomerOwnedIpv4Pool="coip-001", + ) + ], + ) + d = findings[0].details + assert d["network_border_group"] == "us-east-1-wl1-bos-wlz-1" + assert d["public_ipv4_pool"] == "ipv4pool-ec2-abc" + assert d["subnet_id"] == "subnet-xyz" + assert d["network_interface_owner_id"] == "111122223333" + assert 
d["customer_owned_ip"] == "10.0.0.5" + assert d["customer_owned_ipv4_pool"] == "coip-001" + + def test_service_managed_string_enum_captured(self, mock_boto3_session): + """ServiceManaged is a string enum — captured as string context.""" + for value in ("alb", "nlb", "rnat", "rds"): + findings = _run(mock_boto3_session, [_eip(ServiceManaged=value)]) + assert findings[0].details["service_managed"] == value + + def test_service_managed_non_string_not_in_details(self, mock_boto3_session): + """Non-string values (e.g. bool) must not be treated as valid string enum.""" + for bad in (True, False, 1, None): + findings = _run(mock_boto3_session, [_eip(ServiceManaged=bad)]) + assert "service_managed" not in findings[0].details + + def test_service_managed_empty_string_not_in_details(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(ServiceManaged="")]) + assert "service_managed" not in findings[0].details + + def test_tags_normalized_to_dict(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(tags=[{"Key": "env", "Value": "prod"}, {"Key": "team", "Value": "ops"}])], + ) + assert findings[0].details["tags"] == {"env": "prod", "team": "ops"} + + def test_empty_tags_not_in_details(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip(tags=[])]) + assert "tags" not in findings[0].details + + def test_allocation_id_null_in_details_when_absent(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip="1.2.3.4")], + ) + assert findings[0].details["allocation_id"] is None + + def test_carrier_ip_null_in_details_when_absent(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].details["carrier_ip"] is None + + +# --------------------------------------------------------------------------- +# TestConfidenceModel +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def 
test_always_high_confidence(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].confidence == ConfidenceLevel.HIGH + + def test_standard_domain_also_high(self, mock_boto3_session): + findings = _run( + mock_boto3_session, + [_eip(allocation_id=None, public_ip="1.2.3.4", domain="standard")], + ) + assert findings[0].confidence == ConfidenceLevel.HIGH + + +# --------------------------------------------------------------------------- +# TestRiskModel +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_risk_is_low(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].risk == RiskLevel.LOW + + +# --------------------------------------------------------------------------- +# TestCostModel +# --------------------------------------------------------------------------- + + +class TestCostModel: + def test_estimated_monthly_cost_always_none(self, mock_boto3_session): + """No hardcoded cost estimate allowed — must be None.""" + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].estimated_monthly_cost_usd is None + + +# --------------------------------------------------------------------------- +# TestEvidenceContract +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def test_all_required_fields_present(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + d = findings[0].details + required = [ + "evaluation_path", + "resource_id", + "allocation_id", + "public_ip", + "carrier_ip", + "domain", + "currently_associated", + "association_id", + "instance_id", + "network_interface_id", + "private_ip_address", ] - } + for field in required: + assert field in d, f"Missing required field: {field}" + + def test_evaluation_path_exact(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert 
findings[0].details["evaluation_path"] == "unattached-eip-review-candidate" + + def test_currently_associated_always_false(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].details["currently_associated"] is False + + def test_association_fields_always_null(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + d = findings[0].details + assert d["association_id"] is None + assert d["instance_id"] is None + assert d["network_interface_id"] is None + assert d["private_ip_address"] is None + + def test_signals_used_mention_not_associated(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + combined = " ".join(findings[0].evidence.signals_used).lower() + assert "not associated" in combined or "currently not associated" in combined + + def test_signals_used_mention_allocated(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + combined = " ".join(findings[0].evidence.signals_used).lower() + assert "allocated" in combined + + def test_signals_used_mention_aws_recommends(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + combined = " ".join(findings[0].evidence.signals_used).lower() + assert "aws recommends" in combined or "recommends" in combined + + def test_signals_not_checked_include_blind_spots(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + snc = " ".join(findings[0].evidence.signals_not_checked).lower() + assert "dns" in snc or "failover" in snc + assert "application" in snc or "app" in snc + assert "service-managed" in snc or "service managed" in snc - # Test with custom 60-day threshold - findings = find_unattached_elastic_ips(mock_boto3_session, region, days_unattached=60) - eip_ids = {f.resource_id for f in findings} + def test_time_window_is_none(self, mock_boto3_session): + """No temporal threshold — time_window must be None.""" + findings = _run(mock_boto3_session, [_eip()]) + assert 
findings[0].evidence.time_window is None - # Should NOT be detected (45 days < 60 days threshold) - assert "eipalloc-5" not in eip_ids - # Test with custom 30-day threshold (default) - findings = find_unattached_elastic_ips(mock_boto3_session, region, days_unattached=30) - eip_ids = {f.resource_id for f in findings} +# --------------------------------------------------------------------------- +# TestTitleAndReasonContract +# --------------------------------------------------------------------------- - # Should be detected (45 days >= 30 days threshold) - assert "eipalloc-5" in eip_ids - # Verify wording uses allocation age, not unattached duration - f = findings[0] - assert "allocated" in f.summary - assert "currently unattached" in f.summary - assert f.evidence.time_window == "30 days since allocation" +class TestTitleAndReasonContract: + def test_title(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].title == "Unattached Elastic IP review candidate" + def test_reason(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert findings[0].reason == "Address has no current association per DescribeAddresses" -def test_find_unattached_elastic_ips_empty(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 + def test_summary_contains_resource_id(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert "eipalloc-001" in findings[0].summary - ec2.describe_addresses.return_value = {"Addresses": []} + def test_title_does_not_claim_safe_to_release(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + combined = (findings[0].title + findings[0].summary + findings[0].reason).lower() + assert "safe to release" not in combined - findings = find_unattached_elastic_ips(mock_boto3_session, region) + def test_no_allocation_age_in_title_or_reason(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + combined = 
(findings[0].title + findings[0].summary + findings[0].reason).lower() + assert "days ago" not in combined + assert "allocated" not in combined or "no longer needed" in findings[0].summary.lower() - assert len(findings) == 0 + def test_no_hardcoded_cost_in_summary(self, mock_boto3_session): + findings = _run(mock_boto3_session, [_eip()]) + assert "$3.75" not in findings[0].summary + assert "$3.75" not in findings[0].reason diff --git a/tests/cleancloud/providers/aws/test_aws_elb_idle.py b/tests/cleancloud/providers/aws/test_aws_elb_idle.py index afd0811..807cede 100644 --- a/tests/cleancloud/providers/aws/test_aws_elb_idle.py +++ b/tests/cleancloud/providers/aws/test_aws_elb_idle.py @@ -1,14 +1,36 @@ +"""Tests for cleancloud/providers/aws/rules/elb_idle.py + +Covers all spec acceptance scenarios: + Must emit / Must skip / Must fail / Normalization / Traffic signals / + Confidence model / Cost model / Evidence contract / Title-and-reason contract / + Backend enrichment / Pagination / NLB missing-datapoints behaviour +""" + from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock +import pytest +from botocore.exceptions import BotoCoreError, ClientError + from cleancloud.providers.aws.rules.elb_idle import find_idle_load_balancers +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + +_REGION = "us-east-1" +_ACCOUNT = "123456789012" +_THRESHOLD = 14 + + +def _now(): + return datetime.now(timezone.utc) + def _make_session(elbv2, elb, cloudwatch): - """Create a mock session that returns the given clients.""" session = MagicMock() - def client_side_effect(service_name, *args, **kwargs): + def _client(service_name, *args, **kwargs): if service_name == "elbv2": return elbv2 if service_name == "elb": @@ -17,242 +39,1298 @@ def client_side_effect(service_name, *args, **kwargs): return cloudwatch raise 
ValueError(f"Unexpected service: {service_name}") - session.client.side_effect = client_side_effect + session.client.side_effect = _client return session -def _make_elbv2_lb( +def _elbv2_lb( name="test-alb", lb_type="application", - age_days=30, + age_days=20, state="active", + arn_suffix=None, ): - now = datetime.now(timezone.utc) - arn = f"arn:aws:elasticloadbalancing:us-east-1:123456789012:loadbalancer/app/{name}/abc123" + suffix = arn_suffix if arn_suffix is not None else f"app/{name}/abc123" + arn = f"arn:aws:elasticloadbalancing:{_REGION}:{_ACCOUNT}:loadbalancer/{suffix}" if lb_type == "network": - arn = f"arn:aws:elasticloadbalancing:us-east-1:123456789012:loadbalancer/net/{name}/abc123" + suffix = arn_suffix if arn_suffix is not None else f"net/{name}/abc123" + arn = f"arn:aws:elasticloadbalancing:{_REGION}:{_ACCOUNT}:loadbalancer/{suffix}" return { "LoadBalancerArn": arn, "LoadBalancerName": name, "Type": lb_type, - "CreatedTime": now - timedelta(days=age_days), + "CreatedTime": _now() - timedelta(days=age_days), "State": {"Code": state}, - "DNSName": f"{name}.us-east-1.elb.amazonaws.com", + "DNSName": f"{name}.{_REGION}.elb.amazonaws.com", "VpcId": "vpc-12345", + "Scheme": "internet-facing", } -def _make_clb(name="test-clb", age_days=30, instances=None): - now = datetime.now(timezone.utc) +def _clb(name="test-clb", age_days=20, instances=None): return { "LoadBalancerName": name, - "CreatedTime": now - timedelta(days=age_days), - "DNSName": f"{name}.us-east-1.elb.amazonaws.com", + "CreatedTime": _now() - timedelta(days=age_days), + "DNSName": f"{name}.{_REGION}.elb.amazonaws.com", "VPCId": "vpc-12345", - "Instances": instances or [], + "Scheme": "internet-facing", + "Instances": instances if instances is not None else [], } -def test_idle_alb_detected(): - """Idle ALB with zero requests and no targets should be flagged as HIGH confidence.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() +def _setup_elbv2(elbv2, lbs, tg_pages=None, 
target_health=None): + """Configure elbv2 mock with LB and target-group paginators.""" + lb_pag = MagicMock() + lb_pag.paginate.return_value = [{"LoadBalancers": lbs}] - # ALB setup - paginator = elbv2.get_paginator.return_value - paginator.paginate.return_value = [ - {"LoadBalancers": [_make_elbv2_lb(name="idle-alb", age_days=30)]} - ] + tg_pag = MagicMock() + tg_pag.paginate.return_value = tg_pages if tg_pages is not None else [{"TargetGroups": []}] - elbv2.describe_target_groups.return_value = {"TargetGroups": []} - cloudwatch.get_metric_statistics.return_value = {"Datapoints": []} + def _pag(name): + if name == "describe_load_balancers": + return lb_pag + if name == "describe_target_groups": + return tg_pag + raise ValueError(f"Unexpected paginator: {name}") - # CLB setup - empty - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": []}] + elbv2.get_paginator.side_effect = _pag - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + if target_health is not None: + elbv2.describe_target_health.return_value = target_health + else: + elbv2.describe_target_health.return_value = {"TargetHealthDescriptions": []} - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "aws.elbv2.alb.idle" - assert f.resource_type == "aws.elbv2.load_balancer" - assert f.confidence.value == "high" - assert f.risk.value == "medium" - assert f.details["type"] == "application" - assert f.details["has_targets"] is False - assert "idle-alb" in f.resource_id - assert f.estimated_monthly_cost_usd == 18.0 +def _setup_clb(elb, lbs): + pag = elb.get_paginator.return_value + pag.paginate.return_value = [{"LoadBalancerDescriptions": lbs}] -def test_active_alb_skipped(): - """ALB with traffic should NOT be flagged.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() - paginator = elbv2.get_paginator.return_value - paginator.paginate.return_value = [ 
- {"LoadBalancers": [_make_elbv2_lb(name="active-alb", age_days=30)]} - ] +def _cw_no_traffic(): + """CloudWatch mock returning empty datapoints for all metrics.""" + cw = MagicMock() + cw.get_metric_statistics.return_value = {"Datapoints": []} + return cw - # Has traffic - cloudwatch.get_metric_statistics.return_value = {"Datapoints": [{"Sum": 1000}]} - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": []}] +def _cw_nlb_zero_traffic(num_datapoints=None): + """NLB needs enough zero-valued datapoints to satisfy full-window completeness. - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + Spec requires at least expected_days - 1 datapoints. Default to _THRESHOLD + datapoints so the completeness check (>= _THRESHOLD - 1) passes. + """ + n = num_datapoints if num_datapoints is not None else _THRESHOLD + cw = MagicMock() - assert len(findings) == 0 + def _side(**kwargs): + stat = kwargs.get("Statistics", ["Sum"])[0] + return {"Datapoints": [{stat: 0}] * n} + cw.get_metric_statistics.side_effect = _side + return cw -def test_idle_nlb_detected_unhealthy_targets(): - """Idle NLB with zero flows and only unhealthy targets should be MEDIUM confidence. - Unhealthy targets are still *registered* targets — treating them as absent would - produce a false HIGH-confidence finding. has_targets=True → MEDIUM confidence. 
+def _cw_metric_with_signal( + trigger_metric: str, trigger_stat: str = "Sum", trigger_value: float = 100.0 +): + """CloudWatch mock that returns traffic only for the specified metric.""" + cw = MagicMock() + + def _side(**kwargs): + if kwargs.get("MetricName") == trigger_metric: + return {"Datapoints": [{trigger_stat: trigger_value}]} + stat = kwargs.get("Statistics", ["Sum"])[0] + return {"Datapoints": [{stat: 0}]} + + cw.get_metric_statistics.side_effect = _side + return cw + + +def _cw_nlb_missing_metric(missing_metric: str): + """NLB CloudWatch mock where one metric returns no datapoints (FAIL RULE). + + Non-missing metrics return full-window coverage (_THRESHOLD datapoints) + so the completeness check passes for those metrics before we reach the + missing one. """ - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() - - nlb = _make_elbv2_lb(name="idle-nlb", lb_type="network", age_days=20) - paginator = elbv2.get_paginator.return_value - paginator.paginate.return_value = [{"LoadBalancers": [nlb]}] - - elbv2.describe_target_groups.return_value = {"TargetGroups": [{"TargetGroupArn": "arn:tg"}]} - elbv2.describe_target_health.return_value = { - "TargetHealthDescriptions": [ - {"Target": {"Id": "i-123"}, "TargetHealth": {"State": "unhealthy"}} - ] + cw = MagicMock() + + def _side(**kwargs): + metric = kwargs.get("MetricName", "") + stat = kwargs.get("Statistics", ["Sum"])[0] + if metric == missing_metric: + return {"Datapoints": []} + return {"Datapoints": [{stat: 0}] * _THRESHOLD} + + cw.get_metric_statistics.side_effect = _side + return cw + + +def _cw_error(metric_name: str = None): + """CloudWatch mock that raises ClientError for the given metric (or all).""" + cw = MagicMock() + err = ClientError( + {"Error": {"Code": "ThrottlingException", "Message": "x"}}, "GetMetricStatistics" + ) + + def _side(**kwargs): + if metric_name is None or kwargs.get("MetricName") == metric_name: + raise err + stat = kwargs.get("Statistics", ["Sum"])[0] + return 
{"Datapoints": [{stat: 0}]} + + cw.get_metric_statistics.side_effect = _side + return cw + + +def _run(session, threshold=_THRESHOLD): + return find_idle_load_balancers(session, _REGION, idle_days_threshold=threshold) + + +# --------------------------------------------------------------------------- +# TestMustEmit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_idle_alb_zero_targets_emits_high(self): + """ALB older than threshold, active, no traffic, no targets → EMIT HIGH.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="idle-alb", age_days=20)]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + f = findings[0] + assert f.rule_id == "aws.elbv2.alb.idle" + assert f.confidence.value == "high" + assert f.risk.value == "medium" + + def test_idle_alb_with_targets_emits_medium(self): + """ALB older than threshold, no traffic, but registered targets → EMIT MEDIUM.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2( + elbv2, + [_elbv2_lb(name="idle-alb-targets", age_days=20)], + tg_pages=[{"TargetGroups": [{"TargetGroupArn": "arn:tg:1"}]}], + target_health={"TargetHealthDescriptions": [{"Target": {"Id": "i-1"}}]}, + ) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + assert findings[0].confidence.value == "medium" + assert findings[0].details["has_registered_targets"] is True + + def test_idle_nlb_active_impaired_zero_traffic_with_targets_emits_medium(self): + """NLB in active_impaired state, zero NLB traffic with valid datapoints, has targets → EMIT MEDIUM.""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + nlb = _elbv2_lb(name="idle-nlb", lb_type="network", age_days=20, state="active_impaired") + _setup_elbv2( + elbv2, + [nlb], + tg_pages=[{"TargetGroups": [{"TargetGroupArn": 
"arn:tg:1"}]}], + target_health={"TargetHealthDescriptions": [{"Target": {"Id": "i-1"}}]}, + ) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + f = findings[0] + assert f.rule_id == "aws.elbv2.nlb.idle" + assert f.confidence.value == "medium" + + def test_idle_nlb_no_targets_emits_high(self): + """NLB older than threshold, zero NLB traffic with valid datapoints, no targets → EMIT HIGH.""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + nlb = _elbv2_lb(name="idle-nlb", lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + assert findings[0].rule_id == "aws.elbv2.nlb.idle" + assert findings[0].confidence.value == "high" + + def test_idle_clb_no_instances_emits_high(self): + """CLB older than threshold, no traffic, no instances → EMIT HIGH.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(name="idle-clb", age_days=20)]) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + f = findings[0] + assert f.rule_id == "aws.elb.clb.idle" + assert f.resource_type == "aws.elb.load_balancer" + assert f.resource_id == "idle-clb" + assert f.confidence.value == "high" + + def test_idle_clb_with_instances_emits_medium(self): + """CLB no traffic but has registered instances → EMIT MEDIUM.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(name="idle-clb", age_days=20, instances=[{"InstanceId": "i-1"}])]) + + findings = _run(_make_session(elbv2, elb, cw)) + + assert len(findings) == 1 + assert findings[0].confidence.value == "medium" + assert findings[0].details["registered_instance_count"] == 1 + + +# --------------------------------------------------------------------------- +# TestMustSkip +# 
--------------------------------------------------------------------------- + + +class TestMustSkip: + def test_gateway_lb_skipped(self): + """ELBv2 with Type='gateway' must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb = _elbv2_lb(name="gwlb", lb_type="gateway", age_days=20) + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_unknown_type_skipped(self): + """ELBv2 with an unrecognised Type must be skipped as unsupported.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb = _elbv2_lb(name="mystery", lb_type="classic_compat", age_days=20) + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_alb_younger_than_threshold_skipped(self): + """ALB younger than idle_days_threshold is skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="new-alb", age_days=5)]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_clb_younger_than_threshold_skipped(self): + """CLB younger than idle_days_threshold is skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(name="new-clb", age_days=3)]) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_alb_in_provisioning_state_skipped(self): + """ELBv2 in 'provisioning' state must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="prov-alb", age_days=20, state="provisioning")]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_alb_in_failed_state_skipped(self): + """ELBv2 in 'failed' state must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + 
_setup_elbv2(elbv2, [_elbv2_lb(name="failed-alb", age_days=20, state="failed")]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_elbv2_unparsable_arn_dimension_skipped(self): + """ELBv2 ARN without 'loadbalancer/' cannot yield a CW dimension → SKIP ITEM.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + # Build a LB with a raw bad ARN + lb = { + "LoadBalancerArn": "bad-arn-no-loadbalancer-segment", + "LoadBalancerName": "bad-lb", + "Type": "application", + "CreatedTime": _now() - timedelta(days=20), + "State": {"Code": "active"}, + "DNSName": "bad.dns", + "VpcId": "vpc-1", + "Scheme": "internet-facing", + } + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_elbv2_missing_arn_skipped(self): + """ELBv2 without LoadBalancerArn must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb = { + "LoadBalancerName": "no-arn", + "Type": "application", + "CreatedTime": _now() - timedelta(days=20), + "State": {"Code": "active"}, + } + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_elbv2_missing_created_time_skipped(self): + """ELBv2 without CreatedTime must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb = { + "LoadBalancerArn": f"arn:aws:elasticloadbalancing:{_REGION}:{_ACCOUNT}:loadbalancer/app/no-time/abc", + "LoadBalancerName": "no-time", + "Type": "application", + "State": {"Code": "active"}, + } + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_clb_missing_name_skipped(self): + """CLB without LoadBalancerName must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + lb = {"CreatedTime": _now() - timedelta(days=20), 
"Instances": []} + _setup_clb(elb, [lb]) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_clb_missing_created_time_skipped(self): + """CLB without CreatedTime must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + lb = {"LoadBalancerName": "no-time", "Instances": []} + _setup_clb(elb, [lb]) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_non_dict_elbv2_item_skipped(self): + """Non-dict ELBv2 item must be skipped without raising.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb_pag = MagicMock() + lb_pag.paginate.return_value = [{"LoadBalancers": ["not-a-dict"]}] + tg_pag = MagicMock() + tg_pag.paginate.return_value = [{"TargetGroups": []}] + elbv2.get_paginator.side_effect = lambda n: ( + lb_pag if n == "describe_load_balancers" else tg_pag + ) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_non_dict_clb_item_skipped(self): + """Non-dict CLB item must be skipped without raising.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + pag = elb.get_paginator.return_value + pag.paginate.return_value = [{"LoadBalancerDescriptions": ["not-a-dict"]}] + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + +# --------------------------------------------------------------------------- +# TestTrafficSignals +# --------------------------------------------------------------------------- + + +class TestTrafficSignals: + """Each traffic metric independently causes a skip when > 0.""" + + # --- ALB --- + + def test_alb_request_count_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_metric_with_signal("RequestCount") + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + def 
test_alb_processed_bytes_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_metric_with_signal("ProcessedBytes") + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + def test_alb_active_connection_count_triggers_skip(self): + """ActiveConnectionCount is the third ALB signal; > 0 must prevent emission.""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_metric_with_signal("ActiveConnectionCount") + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + # --- NLB --- + + @staticmethod + def _nlb_traffic_cw(trigger_metric, trigger_stat): + """Return full-window datapoints for all metrics; trigger metric has traffic.""" + cw = MagicMock() + + def _side(**kwargs): + metric = kwargs.get("MetricName", "") + stat = kwargs.get("Statistics", ["Sum"])[0] + if metric == trigger_metric: + return {"Datapoints": [{trigger_stat: 1}] * _THRESHOLD} + return {"Datapoints": [{stat: 0}] * _THRESHOLD} + + cw.get_metric_statistics.side_effect = _side + return cw + + def test_nlb_new_flow_count_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = TestTrafficSignals._nlb_traffic_cw("NewFlowCount", "Sum") + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + def test_nlb_processed_bytes_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = TestTrafficSignals._nlb_traffic_cw("ProcessedBytes", "Sum") + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + def test_nlb_active_flow_count_triggers_skip(self): + """ActiveFlowCount Maximum is the third NLB signal; > 0 must prevent emission.""" + elbv2, elb = MagicMock(), MagicMock() + cw = TestTrafficSignals._nlb_traffic_cw("ActiveFlowCount", "Maximum") + nlb = 
_elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw)) == [] + + # --- CLB --- + + def test_clb_request_count_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_metric_with_signal("RequestCount") + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + assert _run(_make_session(elbv2, elb, cw)) == [] + + def test_clb_estimated_processed_bytes_triggers_skip(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_metric_with_signal("EstimatedProcessedBytes") + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + assert _run(_make_session(elbv2, elb, cw)) == [] + + + # --------------------------------------------------------------------------- + # TestMustFailRule + # --------------------------------------------------------------------------- + + + class TestMustFailRule: + def test_elbv2_inventory_client_error_raises(self): + """ELBv2 DescribeLoadBalancers failure raises (FAIL RULE).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + exc = ClientError( + {"Error": {"Code": "InternalError", "Message": "x"}}, "DescribeLoadBalancers" + ) + lb_pag = MagicMock() + lb_pag.paginate.side_effect = exc + elbv2.get_paginator.return_value = lb_pag + _setup_clb(elb, []) + + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) + + def test_elbv2_inventory_botocore_error_raises(self): + """ELBv2 inventory BotoCoreError propagates (FAIL RULE).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb_pag = MagicMock() + lb_pag.paginate.side_effect = BotoCoreError() + elbv2.get_paginator.return_value = lb_pag + _setup_clb(elb, []) + + with pytest.raises(BotoCoreError): + _run(_make_session(elbv2, elb, cw)) + + def test_clb_inventory_client_error_raises(self): + """CLB DescribeLoadBalancers failure raises (FAIL RULE).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + exc = 
ClientError( + {"Error": {"Code": "InternalError", "Message": "x"}}, "DescribeLoadBalancers" + ) + pag = elb.get_paginator.return_value + pag.paginate.side_effect = exc + + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) + + def test_alb_cloudwatch_error_raises(self): + """CloudWatch error during ALB metric read raises (FAIL RULE, no LOW finding).""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_error() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) + + def test_clb_cloudwatch_error_raises(self): + """CloudWatch error during CLB metric read raises (FAIL RULE).""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_error() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) + + def test_cloudwatch_permission_error_raises(self): + """CloudWatch AccessDenied raises PermissionError (FAIL RULE).""" + elbv2, elb = MagicMock(), MagicMock() + cw = MagicMock() + cw.get_metric_statistics.side_effect = ClientError( + {"Error": {"Code": "AccessDenied", "Message": "x"}}, "GetMetricStatistics" + ) + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + with pytest.raises(PermissionError): + _run(_make_session(elbv2, elb, cw)) + + def test_nlb_missing_new_flow_count_raises(self): + """NLB with missing NewFlowCount datapoints raises RuntimeError (FAIL RULE).""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_missing_metric("NewFlowCount") + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + with pytest.raises(RuntimeError): + _run(_make_session(elbv2, elb, cw)) + + def test_nlb_missing_processed_bytes_raises(self): + """NLB with missing ProcessedBytes datapoints raises RuntimeError (FAIL RULE).""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_missing_metric("ProcessedBytes") + nlb = 
_elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + with pytest.raises(RuntimeError): + _run(_make_session(elbv2, elb, cw)) + + def test_nlb_missing_active_flow_count_raises(self): + """NLB with missing ActiveFlowCount datapoints raises RuntimeError (FAIL RULE).""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_missing_metric("ActiveFlowCount") + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + with pytest.raises(RuntimeError): + _run(_make_session(elbv2, elb, cw)) + + # --- Gap 2: NLB insufficient datapoints (partial coverage) also FAIL RULE --- + + def test_nlb_insufficient_new_flow_count_coverage_raises(self): + """NLB NewFlowCount with only 1 datapoint (far below window) raises RuntimeError.""" + elbv2, elb = MagicMock(), MagicMock() + # 1 datapoint for a 14-day window is incomplete coverage + cw = _cw_nlb_zero_traffic(num_datapoints=1) + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + with pytest.raises(RuntimeError, match="NewFlowCount"): + _run(_make_session(elbv2, elb, cw)) + + def test_nlb_one_below_expected_days_raises(self): + """Spec requires full-window coverage; expected_days - 1 datapoints is a gap → FAIL RULE.""" + elbv2, elb = MagicMock(), MagicMock() + # 13 datapoints for a 14-day window → 1-day gap → FAIL RULE (no tolerance) + cw = _cw_nlb_zero_traffic(num_datapoints=_THRESHOLD - 1) + nlb = _elbv2_lb(lb_type="network", age_days=20) + _setup_elbv2(elbv2, [nlb]) + _setup_clb(elb, []) + + with pytest.raises(RuntimeError): + _run(_make_session(elbv2, elb, cw)) + + def test_no_low_confidence_finding_on_metric_failure(self): + """Metric failure must never produce a LOW-confidence finding.""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_error() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + # Must raise, not emit + with pytest.raises(Exception): + findings = 
_run(_make_session(elbv2, elb, cw)) + # If somehow no raise, ensure no LOW finding + for f in findings: + assert f.confidence.value != "low", "LOW confidence finding must never be emitted" + + +# --------------------------------------------------------------------------- +# TestNormalization +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_alb_lb_family_assigned(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="alb1", lb_type="application", age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["lb_family"] == "alb" + + def test_nlb_lb_family_assigned(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="nlb1", lb_type="network", age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["lb_family"] == "nlb" + + def test_clb_lb_family_assigned(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(name="clb1", age_days=20)]) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["lb_family"] == "clb" + + def test_clb_uses_vpcid_key(self): + """CLB spec uses 'VPCId' (capital), not 'VpcId'.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(name="vpc-clb", age_days=20)]) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["vpc_id"] == "vpc-12345" + + def test_elbv2_state_code_captured(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="a", age_days=20, state="active")]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["state_code"] == "active" + + def test_age_days_exact_threshold_emits(self): + """age_days == idle_days_threshold exactly — must 
emit (>= check, not >).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="exact", age_days=_THRESHOLD)]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 1 + + def test_age_days_one_below_threshold_skips(self): + """age_days == threshold - 1 must be skipped.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="almost", age_days=_THRESHOLD - 1)]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [] + + def test_alb_active_impaired_passes_state_check(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="imp", age_days=20, state="active_impaired")]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 1 + + # --- Gap 1: naive CreatedTime must be SKIP, not coerced --- + + def test_elbv2_naive_created_time_skipped(self): + """ELBv2 with a naive (tz-unaware) CreatedTime must be skipped, not coerced to UTC.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb = { + "LoadBalancerArn": f"arn:aws:elasticloadbalancing:{_REGION}:{_ACCOUNT}:loadbalancer/app/naive/abc", + "LoadBalancerName": "naive-alb", + "Type": "application", + # Naive datetime — no tzinfo + "CreatedTime": datetime.now() - timedelta(days=30), + "State": {"Code": "active"}, + "DNSName": "naive.elb.amazonaws.com", + "VpcId": "vpc-1", + "Scheme": "internet-facing", + } + _setup_elbv2(elbv2, [lb]) + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [], "Naive ELBv2 CreatedTime must cause SKIP, not emit" + + def test_clb_naive_created_time_skipped(self): + """CLB with a naive (tz-unaware) CreatedTime must be skipped, not coerced to UTC.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + lb = { + "LoadBalancerName": 
"naive-clb", + # Naive datetime — no tzinfo + "CreatedTime": datetime.now() - timedelta(days=30), + "DNSName": "naive.elb.amazonaws.com", + "VPCId": "vpc-1", + "Scheme": "internet-facing", + "Instances": [], + } + _setup_clb(elb, [lb]) + + findings = _run(_make_session(elbv2, elb, cw)) + assert findings == [], "Naive CLB CreatedTime must cause SKIP, not emit" + + def test_clb_load_balancer_arn_always_null(self): + """CLB details must have load_balancer_arn = None (CLBs have no ARN).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["load_balancer_arn"] is None + + +# --------------------------------------------------------------------------- +# TestConfidenceModel +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def test_alb_no_targets_high(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw))[0].confidence.value == "high" + + def test_alb_with_targets_medium(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2( + elbv2, + [_elbv2_lb(age_days=20)], + tg_pages=[{"TargetGroups": [{"TargetGroupArn": "arn:tg:1"}]}], + target_health={"TargetHealthDescriptions": [{"Target": {"Id": "i-1"}}]}, + ) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw))[0].confidence.value == "medium" + + def test_alb_enrichment_failure_medium(self): + """When target-group enrichment fails, confidence degrades to MEDIUM (not HIGH).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb_pag = MagicMock() + lb_pag.paginate.return_value = [{"LoadBalancers": [_elbv2_lb(age_days=20)]}] + tg_pag = MagicMock() + tg_pag.paginate.side_effect = ClientError( + {"Error": {"Code": "InternalError", "Message": 
"x"}}, "DescribeTargetGroups" + ) + + def _pag(name): + if name == "describe_load_balancers": + return lb_pag + return tg_pag + + elbv2.get_paginator.side_effect = _pag + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 1 + assert findings[0].confidence.value == "medium" + + def test_clb_no_instances_high(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + assert _run(_make_session(elbv2, elb, cw))[0].confidence.value == "high" + + def test_clb_with_instances_medium(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20, instances=[{"InstanceId": "i-1"}])]) + assert _run(_make_session(elbv2, elb, cw))[0].confidence.value == "medium" + + def test_no_low_confidence_ever_emitted(self): + """Confidence must only be HIGH or MEDIUM — never LOW.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, [_clb(age_days=20)]) + + for f in _run(_make_session(elbv2, elb, cw)): + assert f.confidence.value != "low" + + +# --------------------------------------------------------------------------- +# TestCostModel +# --------------------------------------------------------------------------- + + +class TestCostModel: + def test_alb_estimated_cost_null(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw))[0].estimated_monthly_cost_usd is None + + def test_nlb_estimated_cost_null(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(lb_type="network", age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw))[0].estimated_monthly_cost_usd is None + + def test_clb_estimated_cost_null(self): + elbv2, 
elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + assert _run(_make_session(elbv2, elb, cw))[0].estimated_monthly_cost_usd is None + + def test_no_hardcoded_cost_string_in_details(self): + """Details must not contain any hardcoded cost string like '~$16-22/month'.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, [_clb(age_days=20)]) + + for f in _run(_make_session(elbv2, elb, cw)): + details_str = str(f.details) + assert "$" not in details_str, "Hardcoded cost string found in details" + + +# --------------------------------------------------------------------------- +# TestEvidenceContract +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + """Every emitted finding must include all required evidence/details fields.""" + + _ALB_REQUIRED = { + "evaluation_path", + "lb_family", + "resource_id", + "load_balancer_name", + "load_balancer_arn", + "scheme", + "dns_name", + "vpc_id", + "created_time", + "age_days", + "idle_days_threshold", + "traffic_window_days", + "traffic_signals_checked", + "traffic_detected", + "state_code", + "has_registered_targets", + "registered_target_count", + "target_group_count", } - cloudwatch.get_metric_statistics.return_value = {"Datapoints": []} - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": []}] + _CLB_REQUIRED = { + "evaluation_path", + "lb_family", + "resource_id", + "load_balancer_name", + "load_balancer_arn", + "scheme", + "dns_name", + "vpc_id", + "created_time", + "age_days", + "idle_days_threshold", + "traffic_window_days", + "traffic_signals_checked", + "traffic_detected", + "has_registered_instances", + "registered_instance_count", + } + + def test_alb_required_details_present(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + 
_setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + for key in self._ALB_REQUIRED: + assert key in f.details, f"Missing required details key: {key}" - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + def test_clb_required_details_present(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) - nlb_findings = [f for f in findings if f.rule_id == "aws.elbv2.nlb.idle"] - assert len(nlb_findings) == 1 - # Unhealthy but registered targets → has_targets=True → MEDIUM confidence - assert nlb_findings[0].confidence.value == "medium" - assert nlb_findings[0].details["has_targets"] is True + f = _run(_make_session(elbv2, elb, cw))[0] + for key in self._CLB_REQUIRED: + assert key in f.details, f"Missing required details key: {key}" + def test_evaluation_path_exact_value(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) -def test_idle_nlb_healthy_targets_medium_confidence(): - """Idle NLB with zero flows but healthy targets should be MEDIUM confidence.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["evaluation_path"] == "idle-load-balancer-review-candidate" - nlb = _make_elbv2_lb(name="idle-nlb", lb_type="network", age_days=20) - paginator = elbv2.get_paginator.return_value - paginator.paginate.return_value = [{"LoadBalancers": [nlb]}] + def test_traffic_detected_always_false(self): + """traffic_detected must always be False for emitted findings.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, [_clb(age_days=20)]) - elbv2.describe_target_groups.return_value = {"TargetGroups": [{"TargetGroupArn": "arn:tg"}]} - 
elbv2.describe_target_health.return_value = { - "TargetHealthDescriptions": [ - {"Target": {"Id": "i-123"}, "TargetHealth": {"State": "healthy"}} + for f in _run(_make_session(elbv2, elb, cw)): + assert f.details["traffic_detected"] is False + + def test_alb_traffic_signals_checked_contains_active_connection_count(self): + """ALB traffic_signals_checked must include ActiveConnectionCount:Sum.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert "ActiveConnectionCount:Sum" in f.details["traffic_signals_checked"] + + def test_nlb_traffic_signals_checked_contains_active_flow_count(self): + """NLB traffic_signals_checked must include ActiveFlowCount:Maximum.""" + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(lb_type="network", age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert "ActiveFlowCount:Maximum" in f.details["traffic_signals_checked"] + + def test_clb_traffic_signals_checked_contains_estimated_bytes(self): + """CLB traffic_signals_checked must include EstimatedProcessedBytes:Sum.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert "EstimatedProcessedBytes:Sum" in f.details["traffic_signals_checked"] + + def test_idle_days_threshold_in_details(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=30)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw), threshold=14)[0] + assert f.details["idle_days_threshold"] == 14 + assert f.details["traffic_window_days"] == 14 + + +# --------------------------------------------------------------------------- +# TestTitleAndReasonContract +# 
--------------------------------------------------------------------------- + + +class TestTitleAndReasonContract: + def test_alb_title(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.title == "Idle ALB review candidate" + + def test_nlb_title(self): + elbv2, elb = MagicMock(), MagicMock() + cw = _cw_nlb_zero_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(lb_type="network", age_days=20)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.title == "Idle NLB review candidate" + + def test_clb_title(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.title == "Idle CLB review candidate" + + def test_alb_reason_contains_threshold(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=30)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw), threshold=21)[0] + assert "21" in f.reason + assert "ALB" in f.reason + + def test_nlb_reason_contains_threshold(self): + elbv2, elb = MagicMock(), MagicMock() + # Provide 21 datapoints so completeness check passes for threshold=21 + cw = _cw_nlb_zero_traffic(num_datapoints=21) + _setup_elbv2(elbv2, [_elbv2_lb(lb_type="network", age_days=30)]) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw), threshold=21)[0] + assert "NLB" in f.reason + assert "21" in f.reason + + def test_clb_reason_contains_threshold(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=30)]) + + f = _run(_make_session(elbv2, elb, cw), threshold=21)[0] + assert "CLB" in f.reason + assert "21" in f.reason + + def test_title_does_not_claim_safe_to_delete(self): + elbv2, elb, cw = MagicMock(), 
MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, [_clb(age_days=20)]) + + for f in _run(_make_session(elbv2, elb, cw)): + assert "safe" not in f.title.lower() + assert "delete" not in f.title.lower() + + +# --------------------------------------------------------------------------- +# TestRiskModel +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_alb_risk_medium(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(age_days=20)]) + _setup_clb(elb, []) + assert _run(_make_session(elbv2, elb, cw))[0].risk.value == "medium" + + def test_clb_risk_medium(self): + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb(elb, [_clb(age_days=20)]) + assert _run(_make_session(elbv2, elb, cw))[0].risk.value == "medium" + + +# --------------------------------------------------------------------------- +# TestBackendEnrichment +# --------------------------------------------------------------------------- + + +class TestBackendEnrichment: + def test_target_enrichment_failure_does_not_fail_rule(self): + """Target-group enrichment failure must not raise — finding still emitted.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb_pag = MagicMock() + lb_pag.paginate.return_value = [{"LoadBalancers": [_elbv2_lb(age_days=20)]}] + tg_pag = MagicMock() + tg_pag.paginate.side_effect = ClientError( + {"Error": {"Code": "ServiceUnavailableException", "Message": "x"}}, + "DescribeTargetGroups", + ) + + def _pag(name): + return lb_pag if name == "describe_load_balancers" else tg_pag + + elbv2.get_paginator.side_effect = _pag + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + # Enrichment failure → finding still emitted, no exception + assert len(findings) == 1 + + def test_clb_instances_from_normalized_item(self): + """CLB backend context comes 
directly from the Instances field.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) + _setup_clb( + elb, [_clb(age_days=20, instances=[{"InstanceId": "i-1"}, {"InstanceId": "i-2"}])] + ) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["registered_instance_count"] == 2 + assert f.details["has_registered_instances"] is True + + def test_enrichment_failure_counts_are_none_not_zero(self): + """Gap 3: when enrichment fails, registered_target_count and target_group_count + must be None (unknown), not silently set to 0 (which would look like zero targets).""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + lb_pag = MagicMock() + lb_pag.paginate.return_value = [{"LoadBalancers": [_elbv2_lb(age_days=20)]}] + tg_pag = MagicMock() + tg_pag.paginate.side_effect = ClientError( + {"Error": {"Code": "ServiceUnavailableException", "Message": "x"}}, + "DescribeTargetGroups", + ) + + def _pag(name): + return lb_pag if name == "describe_load_balancers" else tg_pag + + elbv2.get_paginator.side_effect = _pag + _setup_clb(elb, []) + + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 1 + f = findings[0] + assert f.details["has_registered_targets"] is None + assert f.details["registered_target_count"] is None, "Must be None, not 0" + assert f.details["target_group_count"] is None, "Must be None, not 0" + + def test_unhealthy_targets_count_as_registered(self): + """Any non-empty TargetHealthDescriptions entry counts as a registered target.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2( + elbv2, + [_elbv2_lb(age_days=20)], + tg_pages=[{"TargetGroups": [{"TargetGroupArn": "arn:tg:1"}]}], + target_health={ + "TargetHealthDescriptions": [ + {"Target": {"Id": "i-1"}, "TargetHealth": {"State": "unhealthy"}} + ] + }, + ) + _setup_clb(elb, []) + + f = _run(_make_session(elbv2, elb, cw))[0] + assert f.details["has_registered_targets"] is True + assert 
f.confidence.value == "medium" + + +# --------------------------------------------------------------------------- +# TestPagination +# --------------------------------------------------------------------------- + + +class TestPagination: + def test_elbv2_multiple_pages_all_processed(self): + """ELBv2 paginator with two pages — both pages' LBs are evaluated.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + + lb_pag = MagicMock() + lb_pag.paginate.return_value = [ + {"LoadBalancers": [_elbv2_lb(name="alb1", age_days=20)]}, + {"LoadBalancers": [_elbv2_lb(name="alb2", age_days=25)]}, ] - } - cloudwatch.get_metric_statistics.return_value = {"Datapoints": []} + tg_pag = MagicMock() + tg_pag.paginate.return_value = [{"TargetGroups": []}] + elbv2.get_paginator.side_effect = lambda n: ( + lb_pag if n == "describe_load_balancers" else tg_pag + ) + _setup_clb(elb, []) - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": []}] + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 2 - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + def test_clb_multiple_pages_all_processed(self): + """CLB paginator with two pages — both pages' LBs are evaluated.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, []) - nlb_findings = [f for f in findings if f.rule_id == "aws.elbv2.nlb.idle"] - assert len(nlb_findings) == 1 - # Healthy targets but no traffic -> MEDIUM confidence - assert nlb_findings[0].confidence.value == "medium" - assert nlb_findings[0].details["has_targets"] is True + pag = elb.get_paginator.return_value + pag.paginate.return_value = [ + {"LoadBalancerDescriptions": [_clb(name="clb1", age_days=20)]}, + {"LoadBalancerDescriptions": [_clb(name="clb2", age_days=25)]}, + ] + + findings = _run(_make_session(elbv2, elb, cw)) + assert len(findings) == 2 + def 
test_both_branches_run(self): + """ALB and CLB findings are both collected in a single call.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + _setup_elbv2(elbv2, [_elbv2_lb(name="idle-alb", age_days=20)]) + _setup_clb(elb, [_clb(name="idle-clb", age_days=20)]) -def test_idle_clb_detected(): - """Idle CLB with zero requests and no instances should be flagged.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() + findings = _run(_make_session(elbv2, elb, cw)) + rule_ids = {f.rule_id for f in findings} + assert "aws.elbv2.alb.idle" in rule_ids + assert "aws.elb.clb.idle" in rule_ids - # elbv2 - empty - elbv2_paginator = elbv2.get_paginator.return_value - elbv2_paginator.paginate.return_value = [{"LoadBalancers": []}] - # CLB setup - clb = _make_clb(name="idle-clb", age_days=30, instances=[]) - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": [clb]}] +# --------------------------------------------------------------------------- +# TestBranchIsolation +# --------------------------------------------------------------------------- - cloudwatch.get_metric_statistics.return_value = {"Datapoints": []} - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") +class TestBranchIsolation: + """ELBv2 and CLB branches must run independently. - assert len(findings) == 1 - f = findings[0] - assert f.rule_id == "aws.elb.clb.idle" - assert f.resource_type == "aws.elb.load_balancer" - assert f.resource_id == "idle-clb" - assert f.confidence.value == "high" # No instances + no traffic - assert f.details["has_instances"] is False - assert f.estimated_monthly_cost_usd == 18.0 + A failure in one branch must not prevent the other branch from being evaluated. + Both branches are always attempted; the first exception is re-raised afterward. 
+ """ + def test_elbv2_failure_does_not_prevent_clb_evaluation(self): + """ELBv2 inventory failure → CLB paginator is still called.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() -def test_young_lb_skipped(): - """LB younger than threshold should NOT be flagged.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() + # Make ELBv2 inventory fail + exc = ClientError( + {"Error": {"Code": "InternalError", "Message": "x"}}, "DescribeLoadBalancers" + ) + lb_pag = MagicMock() + lb_pag.paginate.side_effect = exc + elbv2.get_paginator.return_value = lb_pag - # Young ALB (5 days old) - paginator = elbv2.get_paginator.return_value - paginator.paginate.return_value = [ - {"LoadBalancers": [_make_elbv2_lb(name="young-alb", age_days=5)]} - ] + # CLB has a valid idle LB + _setup_clb(elb, [_clb(name="surviving-clb", age_days=20)]) - # Young CLB (3 days old) - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [ - {"LoadBalancerDescriptions": [_make_clb(name="young-clb", age_days=3)]} - ] + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + # CLB paginator must have been called despite ELBv2 failure + elb.get_paginator.assert_called() - assert len(findings) == 0 + def test_clb_failure_does_not_prevent_elbv2_evaluation(self): + """CLB inventory failure → ELBv2 paginator was still called and evaluated.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + # ELBv2 has a valid idle ALB + _setup_elbv2(elbv2, [_elbv2_lb(name="surviving-alb", age_days=20)]) -def test_clb_with_instances_medium_confidence(): - """CLB with instances but no traffic should be MEDIUM confidence.""" - elbv2 = MagicMock() - elb = MagicMock() - cloudwatch = MagicMock() + # Make CLB inventory fail + exc = ClientError( + {"Error": {"Code": "InternalError", "Message": "x"}}, "DescribeLoadBalancers" + ) 
+ clb_pag = elb.get_paginator.return_value + clb_pag.paginate.side_effect = exc - elbv2_paginator = elbv2.get_paginator.return_value - elbv2_paginator.paginate.return_value = [{"LoadBalancers": []}] + with pytest.raises(ClientError): + _run(_make_session(elbv2, elb, cw)) - clb = _make_clb( - name="idle-with-instances", - age_days=30, - instances=[{"InstanceId": "i-123"}], - ) - elb_paginator = elb.get_paginator.return_value - elb_paginator.paginate.return_value = [{"LoadBalancerDescriptions": [clb]}] + # ELBv2 paginator must have been called (its branch completed) + elbv2.get_paginator.assert_called() + + def test_elbv2_and_clb_both_fail_raises_elbv2_exception(self): + """When both branches fail, the ELBv2 exception (first) is re-raised.""" + elbv2, elb, cw = MagicMock(), MagicMock(), _cw_no_traffic() + + elbv2_exc = ClientError( + {"Error": {"Code": "ELBv2Error", "Message": "x"}}, "DescribeLoadBalancers" + ) + clb_exc = ClientError( + {"Error": {"Code": "CLBError", "Message": "x"}}, "DescribeLoadBalancers" + ) + + lb_pag = MagicMock() + lb_pag.paginate.side_effect = elbv2_exc + elbv2.get_paginator.return_value = lb_pag - cloudwatch.get_metric_statistics.return_value = {"Datapoints": []} + clb_pag = elb.get_paginator.return_value + clb_pag.paginate.side_effect = clb_exc - session = _make_session(elbv2, elb, cloudwatch) - findings = find_idle_load_balancers(session, "us-east-1") + with pytest.raises(ClientError) as exc_info: + _run(_make_session(elbv2, elb, cw)) - assert len(findings) == 1 - assert findings[0].confidence.value == "medium" - assert findings[0].details["has_instances"] is True - assert findings[0].details["instance_count"] == 1 + # Must be the ELBv2 exception (first branch failure) + assert exc_info.value.response["Error"]["Code"] == "ELBv2Error" diff --git a/tests/cleancloud/providers/aws/test_aws_eni_detached.py b/tests/cleancloud/providers/aws/test_aws_eni_detached.py index 06a9537..90ff98a 100644 --- 
a/tests/cleancloud/providers/aws/test_aws_eni_detached.py +++ b/tests/cleancloud/providers/aws/test_aws_eni_detached.py @@ -1,289 +1,741 @@ -from datetime import datetime, timedelta, timezone - -from cleancloud.core.confidence import ConfidenceLevel -from cleancloud.providers.aws.rules.eni_detached import find_detached_enis +"""Tests for aws.ec2.eni.detached rule. +Covers all acceptance scenarios from docs/specs/aws/eni_detached.md §15 and +the normalization, evidence, confidence, cost, risk, title/reason, failure, and +pagination contracts from the same spec. +""" -def test_find_detached_enis(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - old_date = datetime.now(timezone.utc) - timedelta(days=90) # Older than 60-day threshold - recent_date = datetime.now(timezone.utc) - timedelta(days=30) # Younger than 60-day threshold - - # Mock paginator for describe_network_interfaces - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NetworkInterfaces": [ - { - "NetworkInterfaceId": "eni-1", - "Status": "available", # Detached - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "User-created ENI", - "InterfaceType": "interface", # Standard ENI - "TagSet": [{"Key": "Name", "Value": "test-eni"}], - }, - { - "NetworkInterfaceId": "eni-2", - "Status": "in-use", # Attached - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "Attached ENI", - "InterfaceType": "interface", - }, - { - "NetworkInterfaceId": "eni-3", - "Status": "available", # Detached but recent - "CreateTime": recent_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "Recently created", - "InterfaceType": "interface", - }, - { - "NetworkInterfaceId": "eni-4", - "Status": "available", # AWS infrastructure (Load Balancer) - "CreateTime": 
old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "ELB app/my-alb/1234567890", - "InterfaceType": "load_balancer", # AWS infrastructure - exclude - }, - { - "NetworkInterfaceId": "eni-5", - "Status": "available", # Detached Lambda ENI (USER resource - should flag!) - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "AWS Lambda VPC ENI-my-function", - "InterfaceType": "interface", # Standard ENI type - "RequesterManaged": True, # Created by AWS service, but YOUR resource - }, - { - "NetworkInterfaceId": "eni-6", - "Status": "available", # Detached, old, no tags - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "", - "InterfaceType": "interface", - "TagSet": [], - }, - ] - } - ] - - findings = find_detached_enis(mock_boto3_session, region) - eni_ids = {f.resource_id for f in findings} - findings_by_id = {f.resource_id: f for f in findings} - - # Positive: old (90 days) detached standard ENI with tags - assert "eni-1" in eni_ids - - # Positive: old (90 days) detached Lambda ENI (RequesterManaged but user resource!) 
- assert "eni-5" in eni_ids - - # Positive: old (90 days) detached ENI without tags - assert "eni-6" in eni_ids - - # Negative: attached ENI - assert "eni-2" not in eni_ids - - # Negative: detached but too young (30 days < 60 day threshold) - assert "eni-3" not in eni_ids - - # Negative: AWS infrastructure (Load Balancer) - assert "eni-4" not in eni_ids - - # Verify we got exactly 3 findings (including Lambda ENI) - assert len(findings) == 3 - - # Verify title includes "(Review Recommended)" - for f in findings: - assert f.title == "Detached Network Interface (Review Recommended)" - - # Verify confidence is MEDIUM for all findings - for f in findings: - assert f.confidence == ConfidenceLevel.MEDIUM - - # Verify standard ENI details - f1 = findings_by_id["eni-1"] - assert f1.details["interface_type"] == "interface" - assert f1.details["requester_managed"] is False - assert f1.details["age_days"] == 90 - assert "created" in f1.summary and "currently detached" in f1.summary - - # Verify Lambda ENI details and requester-managed signal - f5 = findings_by_id["eni-5"] - assert f5.details["interface_type"] == "interface" - assert f5.details["requester_managed"] is True - assert any("requester-managed" in s for s in f5.evidence.signals_used) - - # Verify untagged ENI has "no tags" signal - f6 = findings_by_id["eni-6"] - assert f6.details["requester_managed"] is False - assert any("no tags" in s for s in f6.evidence.signals_used) - - # Verify Hyperplane in signals_not_checked - for f in findings: - assert any("Hyperplane" in s for s in f.evidence.signals_not_checked) - - -def test_find_detached_enis_custom_threshold(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - date_45_days_ago = datetime.now(timezone.utc) - timedelta(days=45) - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NetworkInterfaces": [ - { - "NetworkInterfaceId": "eni-7", - "Status": "available", - "CreateTime": date_45_days_ago, - 
"VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "Test ENI", - "InterfaceType": "interface", - } - ] - } - ] - - # Test with custom 60-day threshold - findings = find_detached_enis(mock_boto3_session, region, max_age_days=60) - eni_ids = {f.resource_id for f in findings} - - # Should NOT be detected (45 days < 60 days threshold) - assert "eni-7" not in eni_ids - - # Test with custom 30-day threshold - findings = find_detached_enis(mock_boto3_session, region, max_age_days=30) - eni_ids = {f.resource_id for f in findings} - - # Should be detected (45 days >= 30 days threshold) - assert "eni-7" in eni_ids - - # Verify wording uses creation age, not detached duration - f = findings[0] - assert "created" in f.summary - assert "currently detached" in f.summary - assert f.evidence.time_window == "30 days since creation" - - -def test_find_detached_enis_empty(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [{"NetworkInterfaces": []}] - - findings = find_detached_enis(mock_boto3_session, region) - - assert len(findings) == 0 - - -def test_find_detached_enis_interface_types(mock_boto3_session): - """Test that InterfaceType correctly distinguishes AWS infrastructure from user resources.""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - old_date = datetime.now(timezone.utc) - timedelta(days=60) - - # Test various interface types - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NetworkInterfaces": [ - { - "NetworkInterfaceId": "eni-user-1", - "Status": "available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "User-created ENI", - "InterfaceType": "interface", # Standard - should be flagged - }, - { - "NetworkInterfaceId": "eni-lambda-1", - "Status": "available", - "CreateTime": 
old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "Lambda VPC ENI", - "InterfaceType": "interface", # Lambda = user resource - should be flagged! - "RequesterManaged": True, - }, - { - "NetworkInterfaceId": "eni-elb-1", - "Status": "available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "ELB app/my-alb/1234567890", - "InterfaceType": "load_balancer", # AWS infrastructure - exclude - }, - { - "NetworkInterfaceId": "eni-nat-1", - "Status": "available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "NAT Gateway", - "InterfaceType": "nat_gateway", # AWS infrastructure - exclude - }, - { - "NetworkInterfaceId": "eni-vpce-1", - "Status": "available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "AvailabilityZone": "us-east-1a", - "Description": "VPC Endpoint", - "InterfaceType": "vpc_endpoint", # AWS infrastructure - exclude - }, - ] - } - ] +from unittest.mock import MagicMock - findings = find_detached_enis(mock_boto3_session, region) - eni_ids = {f.resource_id for f in findings} - findings_by_id = {f.resource_id: f for f in findings} +import pytest +from botocore.exceptions import BotoCoreError, ClientError - # Should flag user resources (including Lambda!) - assert "eni-user-1" in eni_ids - assert "eni-lambda-1" in eni_ids # Lambda ENI is a user resource! 
- - # Should exclude AWS infrastructure - assert "eni-elb-1" not in eni_ids - assert "eni-nat-1" not in eni_ids - assert "eni-vpce-1" not in eni_ids - - assert len(findings) == 2 # Only user-1 and lambda-1 +from cleancloud.core.confidence import ConfidenceLevel +from cleancloud.core.risk import RiskLevel +from cleancloud.providers.aws.rules.eni_detached import find_detached_enis - # Verify interface_type and requester_managed in details - f_user = findings_by_id["eni-user-1"] - assert f_user.details["interface_type"] == "interface" - assert f_user.details["requester_managed"] is False +_REGION = "us-east-1" - f_lambda = findings_by_id["eni-lambda-1"] - assert f_lambda.details["interface_type"] == "interface" - assert f_lambda.details["requester_managed"] is True - assert any("requester-managed" in s for s in f_lambda.evidence.signals_used) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_session(ec2: MagicMock) -> MagicMock: + session = MagicMock() + session.client.return_value = ec2 + return session + + +def _setup_ec2(enis: list) -> MagicMock: + """Return an ec2 client mock whose paginator yields one page of ENIs.""" + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + paginator.paginate.return_value = [{"NetworkInterfaces": enis}] + return ec2 + + +def _eni( + eni_id: str = "eni-aabbccdd", + status: str = "available", + **extra, +) -> dict: + """Build a minimal ENI dict with defaults that pass all exclusion rules.""" + base = { + "NetworkInterfaceId": eni_id, + "Status": status, + } + base.update(extra) + return base + + +def _run(session: MagicMock) -> list: + return find_detached_enis(session, _REGION) + + +def _client_error(code: str = "SomeError") -> ClientError: + return ClientError({"Error": {"Code": code, "Message": "test"}}, "DescribeNetworkInterfaces") + + +# 
--------------------------------------------------------------------------- +# §15 Must Emit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_available_no_attachment_object(self): + """Scenario 1: ENI available, no Attachment object → EMIT HIGH.""" + ec2 = _setup_ec2([_eni("eni-1", "available")]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-1" + assert findings[0].confidence == ConfidenceLevel.HIGH + + def test_available_attachment_detached(self): + """Scenario 2: ENI available, Attachment.Status == 'detached' → EMIT HIGH.""" + eni = _eni( + "eni-2", "available", Attachment={"Status": "detached", "AttachmentId": "eni-attach-01"} + ) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-2" + assert findings[0].confidence == ConfidenceLevel.HIGH + + def test_requester_managed_available(self): + """Scenario 3: Requester-managed ENI available → EMIT (no exclusion).""" + eni = _eni("eni-3", "available", RequesterManaged=True) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-3" + + def test_operator_managed_available(self): + """Scenario 4: Operator-managed ENI available → EMIT (no exclusion).""" + eni = _eni("eni-4", "available", Operator={"Managed": True, "Principal": "some-service"}) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-4" + + def test_any_interface_type_available(self): + """Scenario 5: Any InterfaceType, Status available → EMIT (no type exclusion).""" + for itype in ("interface", "load_balancer", "nat_gateway", "vpc_endpoint", "efa", "branch"): + eni = _eni(f"eni-{itype}", "available", InterfaceType=itype) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert 
len(findings) == 1, f"Expected emit for InterfaceType={itype!r}" + assert findings[0].resource_id == f"eni-{itype}" + + +# --------------------------------------------------------------------------- +# §15 Must Skip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_in_use_skipped(self): + """Scenario 6: Status == 'in-use' → SKIP.""" + ec2 = _setup_ec2([_eni("eni-inuse", "in-use")]) + findings = _run(_make_session(ec2)) + assert findings == [] + + def test_attaching_skipped(self): + """Scenario 7a: Status == 'attaching' → SKIP.""" + ec2 = _setup_ec2([_eni("eni-attaching", "attaching")]) + assert _run(_make_session(ec2)) == [] + + def test_detaching_skipped(self): + """Scenario 7b: Status == 'detaching' → SKIP.""" + ec2 = _setup_ec2([_eni("eni-detaching", "detaching")]) + assert _run(_make_session(ec2)) == [] + + def test_associated_skipped(self): + """Scenario 7c: Status == 'associated' → SKIP.""" + ec2 = _setup_ec2([_eni("eni-associated", "associated")]) + assert _run(_make_session(ec2)) == [] + + def test_available_attachment_attached_skipped(self): + """Scenario 8: Status 'available' but Attachment.Status 'attached' → SKIP (inconsistency).""" + eni = _eni("eni-conflict", "available", Attachment={"Status": "attached"}) + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def test_available_attachment_attaching_skipped(self): + """Structural inconsistency: 'available' + Attachment.Status 'attaching' → SKIP.""" + eni = _eni("eni-conflict2", "available", Attachment={"Status": "attaching"}) + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def test_available_attachment_detaching_skipped(self): + """Structural inconsistency: 'available' + Attachment.Status 'detaching' → SKIP.""" + eni = _eni("eni-conflict3", "available", Attachment={"Status": "detaching"}) + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def 
test_missing_network_interface_id_skipped(self): + """Scenario 9: Missing NetworkInterfaceId → SKIP.""" + eni = {"Status": "available"} + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def test_missing_status_skipped(self): + """Scenario 10: Missing Status → SKIP.""" + eni = {"NetworkInterfaceId": "eni-nostatus"} + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + +# --------------------------------------------------------------------------- +# §15 Must Fail +# --------------------------------------------------------------------------- + + +class TestMustFailRule: + def test_client_error_raises(self): + """Scenario 11: DescribeNetworkInterfaces request failure → FAIL RULE (re-raise).""" + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = _client_error("AccessDenied") + with pytest.raises(ClientError): + _run(_make_session(ec2)) + + def test_unauthorized_operation_raises_permission_error(self): + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = _client_error("UnauthorizedOperation") + with pytest.raises(PermissionError): + _run(_make_session(ec2)) + + def test_botocore_error_raises(self): + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = BotoCoreError() + with pytest.raises(BotoCoreError): + _run(_make_session(ec2)) + + +# --------------------------------------------------------------------------- +# §15 Must NOT Happen +# --------------------------------------------------------------------------- + + +class TestMustNotHappen: + def test_no_temporal_threshold_applied(self): + """No temporal threshold — any available ENI regardless of creation age emits.""" + # Provide ENI with no CreateTime at all — must still emit. 
+ ec2 = _setup_ec2([_eni("eni-notime", "available")]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + + def test_create_time_not_in_details(self): + """CreateTime must not appear in details — no temporal claim from DescribeNetworkInterfaces.""" + ec2 = _setup_ec2([_eni("eni-ct", "available")]) + findings = _run(_make_session(ec2)) + assert "create_time" not in findings[0].details + assert "age_days" not in findings[0].details + + def test_interface_type_not_exclusion(self): + """No interface_type may be used as an exclusion gate.""" + excluded_types = [ + "load_balancer", + "nat_gateway", + "vpc_endpoint", + "gateway_load_balancer", + "gateway_load_balancer_endpoint", + ] + for itype in excluded_types: + eni = _eni(f"eni-{itype}", "available", InterfaceType=itype) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1, f"interface_type={itype!r} must not be excluded" + + def test_requester_managed_true_not_exclusion(self): + """requester_managed == True must not exclude the ENI.""" + eni = _eni("eni-rm", "available", RequesterManaged=True) + ec2 = _setup_ec2([eni]) + assert len(_run(_make_session(ec2))) == 1 + + def test_cost_estimate_is_none(self): + """estimated_monthly_cost_usd must always be None.""" + ec2 = _setup_ec2([_eni("eni-cost", "available")]) + findings = _run(_make_session(ec2)) + assert findings[0].estimated_monthly_cost_usd is None + + def test_confidence_never_medium_or_low(self): + """HIGH confidence only; MEDIUM and LOW must not appear.""" + ec2 = _setup_ec2([_eni("eni-conf", "available")]) + f = _run(_make_session(ec2))[0] + assert f.confidence not in (ConfidenceLevel.MEDIUM, ConfidenceLevel.LOW) + + +# --------------------------------------------------------------------------- +# Normalization contract +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_non_dict_eni_skipped(self): + """Non-dict item in NetworkInterfaces → 
SKIP (not FAIL RULE).""" + ec2 = _setup_ec2(["not-a-dict", None, 42]) + assert _run(_make_session(ec2)) == [] + + def test_empty_string_network_interface_id_skipped(self): + """Empty string NetworkInterfaceId treated as absent → SKIP.""" + eni = {"NetworkInterfaceId": "", "Status": "available"} + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def test_empty_string_status_skipped(self): + """Empty string Status treated as absent → SKIP.""" + eni = {"NetworkInterfaceId": "eni-x", "Status": ""} + ec2 = _setup_ec2([eni]) + assert _run(_make_session(ec2)) == [] + + def test_requester_managed_string_treated_as_null(self): + """RequesterManaged as string → not a bool → normalized to null (not excluded).""" + eni = _eni("eni-rmstr", "available", RequesterManaged="true") + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].details["requester_managed"] is None + + def test_requester_managed_false_stored_correctly(self): + eni = _eni("eni-rmf", "available", RequesterManaged=False) + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["requester_managed"] is False + + def test_requester_managed_true_stored_correctly(self): + eni = _eni("eni-rmt", "available", RequesterManaged=True) + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["requester_managed"] is True + + def test_operator_managed_non_bool_treated_as_null(self): + """Operator.Managed as string → null (not an exclusion).""" + eni = _eni("eni-opstr", "available", Operator={"Managed": "yes"}) + ec2 = _setup_ec2([eni]) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].details["operator_managed"] is None + + def test_operator_managed_true_stored_and_not_excluded(self): + eni = _eni("eni-opt", "available", Operator={"Managed": True, "Principal": "svc"}) + findings = _run(_make_session(_setup_ec2([eni]))) + assert len(findings) == 1 + assert 
findings[0].details["operator_managed"] is True + assert findings[0].details["operator_principal"] == "svc" + + def test_operator_non_dict_yields_null_fields(self): + eni = _eni("eni-opnd", "available", Operator="bad") + findings = _run(_make_session(_setup_ec2([eni]))) + assert len(findings) == 1 + assert findings[0].details["operator_managed"] is None + assert findings[0].details["operator_principal"] is None + + def test_tag_set_absent_yields_empty_list(self): + eni = _eni("eni-notag", "available") + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["tag_set"] == [] + + def test_tag_set_list_preserved(self): + tags = [{"Key": "Name", "Value": "my-eni"}] + eni = _eni("eni-tag", "available", TagSet=tags) + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["tag_set"] == tags + + def test_tag_set_non_list_yields_empty_list(self): + eni = _eni("eni-badtag", "available", TagSet="not-a-list") + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["tag_set"] == [] + + def test_public_ip_from_association(self): + eni = _eni("eni-pub", "available", Association={"PublicIp": "1.2.3.4"}) + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["public_ip"] == "1.2.3.4" + + def test_public_ip_absent_when_no_association(self): + eni = _eni("eni-nopub", "available") + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["public_ip"] is None + + def test_association_non_dict_yields_null_public_ip(self): + eni = _eni("eni-assocstr", "available", Association="bad") + findings = _run(_make_session(_setup_ec2([eni]))) + assert findings[0].details["public_ip"] is None + + def test_empty_string_contextual_fields_yield_null(self): + """Empty strings for contextual string fields must normalize to null.""" + eni = _eni( + "eni-emptyctx", + "available", + InterfaceType="", + AvailabilityZone="", + SubnetId="", + VpcId="", + PrivateIpAddress="", + 
Description="", + ) + findings = _run(_make_session(_setup_ec2([eni]))) + d = findings[0].details + assert d["interface_type"] is None + assert d["availability_zone"] is None + assert d["subnet_id"] is None + assert d["vpc_id"] is None + assert d["private_ip_address"] is None + assert d["description"] is None + + def test_attachment_absent_yields_null_attachment_fields(self): + eni = _eni("eni-noatt", "available") + findings = _run(_make_session(_setup_ec2([eni]))) + d = findings[0].details + assert d["attachment_status"] is None + assert d["attachment_id"] is None + assert d["attachment_instance_id"] is None + assert d["attachment_instance_owner_id"] is None + + def test_attachment_fields_populated_from_object(self): + att = { + "Status": "detached", + "AttachmentId": "eni-attach-01", + "InstanceId": "i-abc123", + "InstanceOwnerId": "123456789012", + } + eni = _eni("eni-att", "available", Attachment=att) + findings = _run(_make_session(_setup_ec2([eni]))) + d = findings[0].details + assert d["attachment_status"] == "detached" + assert d["attachment_id"] == "eni-attach-01" + assert d["attachment_instance_id"] == "i-abc123" + assert d["attachment_instance_owner_id"] == "123456789012" + + def test_malformed_attachment_non_dict_yields_null_fields(self): + eni = _eni("eni-badatt", "available", Attachment="not-a-dict") + findings = _run(_make_session(_setup_ec2([eni]))) + d = findings[0].details + assert d["attachment_status"] is None + assert d["attachment_id"] is None + + +# --------------------------------------------------------------------------- +# Attachment consistency (structural inconsistency rule) +# --------------------------------------------------------------------------- + + +class TestAttachmentConsistency: + def test_available_plus_detached_attachment_emits(self): + """available + Attachment.Status 'detached' → consistent → EMIT.""" + eni = _eni("eni-detatt", "available", Attachment={"Status": "detached"}) + findings = 
_run(_make_session(_setup_ec2([eni]))) + assert len(findings) == 1 + + def test_available_plus_null_attachment_status_emits(self): + """available + Attachment object missing Status → null attachment_status → EMIT.""" + eni = _eni("eni-noastatus", "available", Attachment={"AttachmentId": "eni-attach-01"}) + findings = _run(_make_session(_setup_ec2([eni]))) + assert len(findings) == 1 + + def test_available_plus_attached_skipped(self): + eni = _eni("eni-inconsis1", "available", Attachment={"Status": "attached"}) + assert _run(_make_session(_setup_ec2([eni]))) == [] + + def test_available_plus_attaching_skipped(self): + eni = _eni("eni-inconsis2", "available", Attachment={"Status": "attaching"}) + assert _run(_make_session(_setup_ec2([eni]))) == [] + + def test_available_plus_detaching_skipped(self): + eni = _eni("eni-inconsis3", "available", Attachment={"Status": "detaching"}) + assert _run(_make_session(_setup_ec2([eni]))) == [] + + def test_available_plus_unknown_attachment_status_skipped(self): + """Unknown/malformed attachment_status (e.g. 'foo') → SKIP; only null/'detached' emits.""" + eni = _eni("eni-unknown-att", "available", Attachment={"Status": "foo"}) + assert _run(_make_session(_setup_ec2([eni]))) == [] + + def test_available_plus_arbitrary_string_attachment_status_skipped(self): + """Any non-null, non-'detached' attachment_status string → SKIP.""" + for bad_status in ("pending", "error", "unknown", "AVAILABLE", ""): + eni_id = f"eni-bad-{bad_status or 'empty'}" + # Empty string normalizes to None via _str(), so it should emit. + # Non-empty unknown strings should skip. 
+ eni = _eni(eni_id, "available", Attachment={"Status": bad_status}) + findings = _run(_make_session(_setup_ec2([eni]))) + if bad_status == "": + # Empty string → attachment_status normalizes to None → emit + assert ( + len(findings) == 1 + ), "Empty attachment Status should emit (normalized to null)" + else: + assert findings == [], f"attachment_status={bad_status!r} should skip" + + def test_attachment_status_does_not_override_top_level_status(self): + """attachment_status is validation only; it must not independently produce eligibility.""" + eni = _eni("eni-auth", "in-use", Attachment={"Status": "detached"}) + assert _run(_make_session(_setup_ec2([eni]))) == [] + + +# --------------------------------------------------------------------------- +# Signals used (§11.3) +# --------------------------------------------------------------------------- + + +class TestSignalsUsed: + def test_top_level_status_signal_always_present(self): + ec2 = _setup_ec2([_eni("eni-sig1", "available")]) + signals = _run(_make_session(ec2))[0].evidence.signals_used + assert any("'available'" in s for s in signals) + + def test_requester_managed_true_adds_signal(self): + eni = _eni("eni-rm-sig", "available", RequesterManaged=True) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert any("requester-managed" in s.lower() for s in signals) + + def test_requester_managed_false_no_extra_signal(self): + eni = _eni("eni-rmf-sig", "available", RequesterManaged=False) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert not any("requester-managed" in s.lower() for s in signals) + + def test_requester_managed_null_no_extra_signal(self): + eni = _eni("eni-rmn-sig", "available") + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert not any("requester-managed" in s.lower() for s in signals) + + def test_operator_managed_true_adds_signal(self): + eni = _eni("eni-op-sig", "available", Operator={"Managed": 
True, "Principal": "svc-x"}) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert any("operator-managed" in s.lower() for s in signals) + assert any("svc-x" in s for s in signals) + + def test_operator_managed_true_no_principal_uses_unknown(self): + eni = _eni("eni-op-noprinc", "available", Operator={"Managed": True}) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert any("operator-managed" in s.lower() for s in signals) + assert any("unknown" in s for s in signals) + + def test_operator_managed_false_no_extra_signal(self): + eni = _eni("eni-opf-sig", "available", Operator={"Managed": False}) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert not any("operator-managed" in s.lower() for s in signals) + + def test_both_requester_and_operator_managed_both_signals_present(self): + eni = _eni( + "eni-both-sig", + "available", + RequesterManaged=True, + Operator={"Managed": True, "Principal": "svc-y"}, + ) + signals = _run(_make_session(_setup_ec2([eni])))[0].evidence.signals_used + assert any("requester-managed" in s.lower() for s in signals) + assert any("operator-managed" in s.lower() for s in signals) + + +# --------------------------------------------------------------------------- +# Evidence contract (§11) +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def test_required_details_fields_present(self): + """All required evidence/details fields must be present in every finding.""" + eni = _eni( + "eni-evid", + "available", + InterfaceType="interface", + RequesterManaged=True, + Operator={"Managed": False, "Principal": "svc"}, + AvailabilityZone="us-east-1a", + SubnetId="subnet-aaa", + VpcId="vpc-bbb", + PrivateIpAddress="10.0.0.5", + Association={"PublicIp": "52.1.2.3"}, + ) + findings = _run(_make_session(_setup_ec2([eni]))) + d = findings[0].details + + required_fields = [ + "evaluation_path", + 
"network_interface_id", + "normalized_status", + "attachment_status", + "interface_type", + "requester_managed", + "operator_managed", + "operator_principal", + "availability_zone", + "subnet_id", + "vpc_id", + "private_ip_address", + "public_ip", + ] + for field in required_fields: + assert field in d, f"Required field '{field}' missing from details" + + def test_evaluation_path_exact_value(self): + ec2 = _setup_ec2([_eni("eni-ep", "available")]) + findings = _run(_make_session(ec2)) + assert findings[0].details["evaluation_path"] == "detached-eni-review-candidate" + + def test_normalized_status_always_available_in_details(self): + ec2 = _setup_ec2([_eni("eni-ns", "available")]) + findings = _run(_make_session(ec2)) + assert findings[0].details["normalized_status"] == "available" + + def test_network_interface_id_in_details(self): + ec2 = _setup_ec2([_eni("eni-id-check", "available")]) + findings = _run(_make_session(ec2)) + assert findings[0].details["network_interface_id"] == "eni-id-check" + + +# --------------------------------------------------------------------------- +# Confidence model (§12) +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def test_high_confidence_for_available_no_conflict(self): + ec2 = _setup_ec2([_eni("eni-conf1", "available")]) + assert _run(_make_session(ec2))[0].confidence == ConfidenceLevel.HIGH + + def test_high_confidence_with_detached_attachment(self): + eni = _eni("eni-conf2", "available", Attachment={"Status": "detached"}) + assert _run(_make_session(_setup_ec2([eni])))[0].confidence == ConfidenceLevel.HIGH + + def test_high_confidence_requester_managed(self): + eni = _eni("eni-conf3", "available", RequesterManaged=True) + assert _run(_make_session(_setup_ec2([eni])))[0].confidence == ConfidenceLevel.HIGH + + def test_high_confidence_operator_managed(self): + eni = _eni("eni-conf4", "available", Operator={"Managed": True}) + assert 
_run(_make_session(_setup_ec2([eni])))[0].confidence == ConfidenceLevel.HIGH + + +# --------------------------------------------------------------------------- +# Cost model (§11.2) +# --------------------------------------------------------------------------- + + +class TestCostModel: + def test_estimated_monthly_cost_always_none(self): + ec2 = _setup_ec2([_eni("eni-cost1", "available")]) + assert _run(_make_session(ec2))[0].estimated_monthly_cost_usd is None + + +# --------------------------------------------------------------------------- +# Risk model (§14) +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_risk_is_low(self): + ec2 = _setup_ec2([_eni("eni-risk", "available")]) + assert _run(_make_session(ec2))[0].risk == RiskLevel.LOW + + +# --------------------------------------------------------------------------- +# Title and reason contract (§13) +# --------------------------------------------------------------------------- + + +class TestTitleAndReasonContract: + def test_title_exact(self): + ec2 = _setup_ec2([_eni("eni-title", "available")]) + assert _run(_make_session(ec2))[0].title == "ENI not currently attached review candidate" + + def test_reason_exact(self): + ec2 = _setup_ec2([_eni("eni-reason", "available")]) + reason = _run(_make_session(ec2))[0].reason + assert ( + reason + == "ENI Status is 'available' — not currently attached per DescribeNetworkInterfaces" + ) + + def test_title_does_not_claim_safe_to_delete(self): + ec2 = _setup_ec2([_eni("eni-safe", "available")]) + title = _run(_make_session(ec2))[0].title + assert "delete" not in title.lower() + assert "safe" not in title.lower() + + +# --------------------------------------------------------------------------- +# Pagination exhaustion +# --------------------------------------------------------------------------- + + +class TestPagination: + def test_multiple_pages_all_evaluated(self): + """Pagination must be fully exhausted 
— all pages contribute findings.""" + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + {"NetworkInterfaces": [_eni("eni-p1", "available")]}, + {"NetworkInterfaces": [_eni("eni-p2", "available")]}, + {"NetworkInterfaces": [_eni("eni-p3", "in-use")]}, + ] + findings = _run(_make_session(ec2)) + ids = {f.resource_id for f in findings} + assert "eni-p1" in ids + assert "eni-p2" in ids + assert "eni-p3" not in ids + assert len(findings) == 2 + + def test_empty_page_yields_no_findings(self): + ec2 = _setup_ec2([]) + assert _run(_make_session(ec2)) == [] + + def test_paginator_called_with_correct_operation(self): + ec2 = _setup_ec2([]) + _run(_make_session(ec2)) + ec2.get_paginator.assert_called_once_with("describe_network_interfaces") + + def test_mixed_valid_and_malformed_items(self): + """Malformed items in a page are silently skipped; valid items emit.""" + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + paginator.paginate.return_value = [ + { + "NetworkInterfaces": [ + "not-a-dict", + None, + {"Status": "available"}, # missing NetworkInterfaceId + _eni("eni-valid", "available"), + ] + } + ] + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-valid" + + +# --------------------------------------------------------------------------- +# Additional correctness checks +# --------------------------------------------------------------------------- + + +class TestCorrectness: + def test_resource_id_matches_network_interface_id(self): + ec2 = _setup_ec2([_eni("eni-rid", "available")]) + f = _run(_make_session(ec2))[0] + assert f.resource_id == "eni-rid" + assert f.details["network_interface_id"] == "eni-rid" + + def test_region_in_finding(self): + ec2 = _setup_ec2([_eni("eni-reg", "available")]) + session = MagicMock() + session.client.return_value = ec2 + findings = find_detached_enis(session, 
"eu-west-1") + assert findings[0].region == "eu-west-1" + + def test_rule_id_correct(self): + ec2 = _setup_ec2([_eni("eni-ruleid", "available")]) + assert _run(_make_session(ec2))[0].rule_id == "aws.ec2.eni.detached" + + def test_provider_is_aws(self): + ec2 = _setup_ec2([_eni("eni-prov", "available")]) + assert _run(_make_session(ec2))[0].provider == "aws" + + def test_multiple_available_enis_all_emit(self): + """All available ENIs in one page emit, regardless of other attributes.""" + enis = [ + _eni("eni-a1", "available"), + _eni("eni-a2", "available", RequesterManaged=True), + _eni("eni-a3", "available", InterfaceType="load_balancer"), + _eni("eni-a4", "available", Operator={"Managed": True}), + _eni("eni-a5", "available", Attachment={"Status": "detached"}), + ] + ec2 = _setup_ec2(enis) + findings = _run(_make_session(ec2)) + ids = {f.resource_id for f in findings} + assert ids == {"eni-a1", "eni-a2", "eni-a3", "eni-a4", "eni-a5"} + + def test_mixed_statuses_only_available_emits(self): + enis = [ + _eni("eni-av", "available"), + _eni("eni-iu", "in-use"), + _eni("eni-at", "attaching"), + _eni("eni-dt", "detaching"), + _eni("eni-as", "associated"), + ] + ec2 = _setup_ec2(enis) + findings = _run(_make_session(ec2)) + assert len(findings) == 1 + assert findings[0].resource_id == "eni-av" diff --git a/tests/cleancloud/providers/aws/test_aws_nat_gateway_idle.py b/tests/cleancloud/providers/aws/test_aws_nat_gateway_idle.py index 0fd4db3..8c7f34f 100644 --- a/tests/cleancloud/providers/aws/test_aws_nat_gateway_idle.py +++ b/tests/cleancloud/providers/aws/test_aws_nat_gateway_idle.py @@ -1,341 +1,723 @@ +"""Tests for aws.ec2.nat_gateway.idle rule. + +Covers all acceptance scenarios from docs/specs/aws/nat_gateway_idle.md §15 +and the normalization, evidence, confidence, cost, risk, title/reason, +failure, and pagination contracts. 
+""" + from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock -from botocore.exceptions import ClientError +import pytest +from botocore.exceptions import BotoCoreError, ClientError +from cleancloud.core.confidence import ConfidenceLevel +from cleancloud.core.risk import RiskLevel from cleancloud.providers.aws.rules.nat_gateway_idle import find_idle_nat_gateways +_REGION = "us-east-1" +_THRESHOLD = 14 -def test_find_idle_nat_gateways(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - recent_date = now - timedelta(days=5) - - # Mock paginator for describe_nat_gateways - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NatGateways": [ - # Idle NAT Gateway (30 days old, no traffic) - should be flagged - { - "NatGatewayId": "nat-idle123", - "State": "available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "NatGatewayAddresses": [ - { - "AllocationId": "eipalloc-123", - "PublicIp": "54.1.2.3", - "PrivateIp": "10.0.1.100", - } - ], - "Tags": [{"Key": "Name", "Value": "idle-nat-gateway"}], - }, - # Active NAT Gateway (has traffic) - should NOT be flagged - { - "NatGatewayId": "nat-active456", - "State": "available", - "CreateTime": old_date, - "VpcId": "vpc-456", - "SubnetId": "subnet-456", - "NatGatewayAddresses": [], - "Tags": [], - }, - # Young NAT Gateway (5 days old) - should NOT be flagged - { - "NatGatewayId": "nat-young789", - "State": "available", - "CreateTime": recent_date, - "VpcId": "vpc-789", - "SubnetId": "subnet-789", - "NatGatewayAddresses": [], - "Tags": [], - }, - # Pending NAT Gateway - should NOT be flagged - { - "NatGatewayId": "nat-pending000", - "State": "pending", - "CreateTime": old_date, - "VpcId": "vpc-000", - "SubnetId": "subnet-000", - "NatGatewayAddresses": [], - "Tags": [], - }, - ] - } - ] - - # Mock CloudWatch client - cloudwatch_mock 
= MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) - - # Mock CloudWatch metrics - idle for nat-idle123, active for nat-active456 - def mock_get_metric_statistics(**kwargs): - nat_id = kwargs["Dimensions"][0]["Value"] - if nat_id == "nat-idle123": - # No traffic - return {"Datapoints": []} - elif nat_id == "nat-active456": - # Has traffic - return {"Datapoints": [{"Sum": 1000000}]} - else: - return {"Datapoints": []} - - cloudwatch_mock.get_metric_statistics.side_effect = mock_get_metric_statistics - - findings = find_idle_nat_gateways(mock_boto3_session, region) - nat_ids = {f.resource_id for f in findings} - - # Should flag idle NAT Gateway - assert "nat-idle123" in nat_ids - - # Should NOT flag active NAT Gateway (has traffic) - assert "nat-active456" not in nat_ids - - # Should NOT flag young NAT Gateway - assert "nat-young789" not in nat_ids - - # Should NOT flag pending NAT Gateway - assert "nat-pending000" not in nat_ids - - # Verify finding details - assert len(findings) == 1 - finding = findings[0] - assert finding.provider == "aws" - assert finding.rule_id == "aws.ec2.nat_gateway.idle" - # Zero traffic + no route table references → HIGH confidence and risk - assert finding.confidence.value == "high" - assert finding.risk.value == "high" - assert finding.details["name"] == "idle-nat-gateway" - assert finding.details["vpc_id"] == "vpc-123" - assert "~$32.85/month" in finding.details["estimated_monthly_cost"] - assert finding.estimated_monthly_cost_usd == 32.85 - - -def test_find_idle_nat_gateways_empty_account(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [{"NatGateways": []}] - - # Mock CloudWatch client (needed even for empty results) - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" 
else cloudwatch_mock - ) - - findings = find_idle_nat_gateways(mock_boto3_session, region) - assert findings == [] - - -def test_find_idle_nat_gateways_custom_threshold(mock_boto3_session): - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - now = datetime.now(timezone.utc) - # NAT Gateway is 20 days old - creation_date = now - timedelta(days=20) - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NatGateways": [ - { - "NatGatewayId": "nat-test", - "State": "available", - "CreateTime": creation_date, - "VpcId": "vpc-test", - "SubnetId": "subnet-test", - "NatGatewayAddresses": [], - "Tags": [], - }, - ] - } - ] - - # Mock CloudWatch - no traffic - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.return_value = {"Datapoints": []} - - # With 30-day threshold, should NOT be flagged (only 20 days old) - findings_30 = find_idle_nat_gateways(mock_boto3_session, region, idle_days=30) - assert len(findings_30) == 0 - # With 14-day threshold, should be flagged (20 > 14) - findings_14 = find_idle_nat_gateways(mock_boto3_session, region, idle_days=14) - assert len(findings_14) == 1 - assert findings_14[0].resource_id == "nat-test" +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- -def test_find_idle_nat_gateways_with_traffic(mock_boto3_session): - """NAT Gateway with traffic should not be flagged.""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 +def _make_session(ec2: MagicMock, cw: MagicMock) -> MagicMock: + session = MagicMock() - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NatGateways": [ - { - "NatGatewayId": "nat-active", - "State": 
"available", - "CreateTime": old_date, - "VpcId": "vpc-123", - "SubnetId": "subnet-123", - "NatGatewayAddresses": [], - "Tags": [], - }, - ] - } - ] + def _client(service, **kwargs): + if service == "ec2": + return ec2 + if service == "cloudwatch": + return cw + raise ValueError(f"Unexpected service: {service}") - # Mock CloudWatch - has traffic - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.return_value = { - "Datapoints": [ - {"Sum": 50000000}, # 50 MB of traffic - {"Sum": 100000000}, # 100 MB of traffic - ] - } - - findings = find_idle_nat_gateways(mock_boto3_session, region) - assert findings == [] - - -def test_find_idle_nat_gateways_title_includes_threshold(mock_boto3_session): - """Verify title includes the days_idle threshold.""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 - - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [ - { - "NatGateways": [ - { - "NatGatewayId": "nat-test", - "State": "available", - "CreateTime": old_date, - "VpcId": "vpc-test", - "SubnetId": "subnet-test", - "NatGatewayAddresses": [], - "Tags": [], - }, - ] - } - ] + session.client.side_effect = _client + return session - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.return_value = {"Datapoints": []} - # Test with custom threshold - findings = find_idle_nat_gateways(mock_boto3_session, region, idle_days=7) - assert len(findings) == 1 - assert "7+ Days" in findings[0].title +def _setup_ec2(nat_gws: list) -> MagicMock: + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + paginator.paginate.return_value = [{"NatGateways": nat_gws}] + 
ec2.describe_route_tables.return_value = {"RouteTables": []} + return ec2 -def _make_nat_gw(nat_id, age_days=30): +def _nat_gw( + gw_id: str = "nat-aabbccdd", + state: str = "available", + age_days: int = 20, + **extra, +) -> dict: now = datetime.now(timezone.utc) - return { - "NatGatewayId": nat_id, - "State": "available", + base = { + "NatGatewayId": gw_id, + "State": state, "CreateTime": now - timedelta(days=age_days), "VpcId": "vpc-test", "SubnetId": "subnet-test", - "NatGatewayAddresses": [], - "Tags": [], + "ConnectivityType": "public", } + base.update(extra) + return base -def test_metric_fetch_failure_produces_low_confidence_finding(mock_boto3_session): - """When CloudWatch metrics fail with a transient error, a LOW-confidence finding - is created with an 'unverified' title instead of being silently suppressed.""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 +def _cw_zero_traffic(num_datapoints: int = 1) -> MagicMock: + """CloudWatch mock that returns `num_datapoints` zero-valued datapoints for every metric.""" + cw = MagicMock() - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [{"NatGateways": [_make_nat_gw("nat-fetch-fail")]}] + def _get_stats(**kwargs): + stat = kwargs["Statistics"][0] + return {"Datapoints": [{stat: 0.0} for _ in range(num_datapoints)]} - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) + cw.get_metric_statistics.side_effect = _get_stats + return cw - error_response = {"Error": {"Code": "Throttling", "Message": "Rate exceeded"}} - cloudwatch_mock.get_metric_statistics.side_effect = ClientError( - error_response, "GetMetricStatistics" - ) - findings = find_idle_nat_gateways(mock_boto3_session, region) - assert len(findings) == 1 - f = findings[0] - assert f.confidence.value == "low" - assert "Requires Traffic Verification" in f.title - assert "unverified" in f.reason.lower() or "could not be 
fetched" in f.reason.lower() - signals_not_checked = [s.lower() for s in f.evidence.signals_not_checked] - assert any("fetch failed" in s for s in signals_not_checked) +def _cw_no_datapoints() -> MagicMock: + """CloudWatch mock that returns empty datapoints for every metric.""" + cw = MagicMock() + cw.get_metric_statistics.return_value = {"Datapoints": []} + return cw -def test_not_in_route_tables_signal(mock_boto3_session): - """When no route table references the NAT GW, the finding includes a signal noting it.""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 +def _cw_with_traffic(trigger_metric: str, trigger_stat: str, value: float) -> MagicMock: + """CloudWatch mock that returns traffic on one specific metric.""" + cw = MagicMock() - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [{"NatGateways": [_make_nat_gw("nat-no-routes")]}] + def _get_stats(**kwargs): + metric = kwargs["MetricName"] + stat = kwargs["Statistics"][0] + if metric == trigger_metric: + return {"Datapoints": [{stat: value}]} + return { + "Datapoints": [ + {"Sum": 0.0, "Maximum": 0.0}.get(stat, 0.0) and {stat: 0.0} or {stat: 0.0} + ] + } + + cw.get_metric_statistics.side_effect = _get_stats + return cw - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.return_value = {"Datapoints": []} - ec2.describe_route_tables.return_value = {"RouteTables": []} - findings = find_idle_nat_gateways(mock_boto3_session, region) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert any("not referenced by any vpc route table" in s.lower() for s in signals) - assert findings[0].details["in_route_tables"] is False +def _cw_active_connection(value: float = 5.0) -> MagicMock: + """CloudWatch mock where ActiveConnectionCount Maximum > 0.""" + cw = MagicMock() + def _get_stats(**kwargs): + metric = 
kwargs["MetricName"] + if metric == "ActiveConnectionCount": + return {"Datapoints": [{"Maximum": value}]} + return {"Datapoints": [{"Sum": 0.0}]} -def test_in_route_tables_signal(mock_boto3_session): - """When a route table references the NAT GW, the finding notes it (still idle by traffic).""" - region = "us-east-1" - ec2 = mock_boto3_session._ec2 + cw.get_metric_statistics.side_effect = _get_stats + return cw - paginator = ec2.get_paginator.return_value - paginator.paginate.return_value = [{"NatGateways": [_make_nat_gw("nat-has-routes")]}] - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - ec2 if service == "ec2" else cloudwatch_mock +def _cw_error(code: str = "Throttling") -> MagicMock: + cw = MagicMock() + cw.get_metric_statistics.side_effect = ClientError( + {"Error": {"Code": code, "Message": "test"}}, "GetMetricStatistics" ) - cloudwatch_mock.get_metric_statistics.return_value = {"Datapoints": []} - ec2.describe_route_tables.return_value = {"RouteTables": [{"RouteTableId": "rtb-abc"}]} - - findings = find_idle_nat_gateways(mock_boto3_session, region) - assert len(findings) == 1 - signals = findings[0].evidence.signals_used - assert any("referenced by at least one vpc route table" in s.lower() for s in signals) - assert findings[0].details["in_route_tables"] is True + return cw + + +def _run(session: MagicMock, threshold: int = _THRESHOLD) -> list: + return find_idle_nat_gateways(session, _REGION, idle_days_threshold=threshold) + + +def _client_error(code: str = "SomeError") -> ClientError: + return ClientError({"Error": {"Code": code, "Message": "test"}}, "DescribeNatGateways") + + +# --------------------------------------------------------------------------- +# §15 Must Emit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_available_old_enough_zero_traffic_no_route_ref_high(self): + """Scenario 1: Available, old, zero traffic, no route ref 
→ EMIT HIGH.""" + ec2 = _setup_ec2([_nat_gw("nat-1", age_days=20)]) + ec2.describe_route_tables.return_value = {"RouteTables": []} + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + assert findings[0].resource_id == "nat-1" + assert findings[0].confidence == ConfidenceLevel.HIGH + + def test_available_old_enough_zero_traffic_route_ref_medium(self): + """Scenario 2: Available, old, zero traffic, route table still references → EMIT MEDIUM.""" + ec2 = _setup_ec2([_nat_gw("nat-2", age_days=20)]) + ec2.describe_route_tables.return_value = {"RouteTables": [{"RouteTableId": "rtb-abc"}]} + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + assert findings[0].confidence == ConfidenceLevel.MEDIUM + + def test_available_old_enough_zero_traffic_rt_lookup_failed_medium(self): + """Scenario 3a: DescribeRouteTables ClientError → EMIT MEDIUM, context unavailable.""" + ec2 = _setup_ec2([_nat_gw("nat-3", age_days=20)]) + ec2.describe_route_tables.side_effect = _client_error("AccessDenied") + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + assert findings[0].confidence == ConfidenceLevel.MEDIUM + + def test_available_old_enough_zero_traffic_rt_botocore_error_medium(self): + """Scenario 3b: DescribeRouteTables BotoCoreError → EMIT MEDIUM, context unavailable.""" + ec2 = _setup_ec2([_nat_gw("nat-3b", age_days=20)]) + ec2.describe_route_tables.side_effect = BotoCoreError() + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + assert findings[0].confidence == ConfidenceLevel.MEDIUM + assert findings[0].details["route_table_referenced"] is None + + def test_rt_any_exception_degrades_gracefully(self): + """Any exception from DescribeRouteTables degrades context — scan never blows up.""" + ec2 = _setup_ec2([_nat_gw("nat-rtexc", age_days=20)]) + ec2.describe_route_tables.side_effect = RuntimeError("unexpected") 
+ cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + assert findings[0].confidence == ConfidenceLevel.MEDIUM + assert findings[0].details["route_table_referenced"] is None + + +# --------------------------------------------------------------------------- +# §15 Must Skip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_state_pending_skipped(self): + """Scenario 4a: State pending → SKIP.""" + ec2 = _setup_ec2([_nat_gw("nat-pend", state="pending")]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_state_failed_skipped(self): + ec2 = _setup_ec2([_nat_gw("nat-fail", state="failed")]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_state_deleting_skipped(self): + ec2 = _setup_ec2([_nat_gw("nat-del", state="deleting")]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_state_deleted_skipped(self): + ec2 = _setup_ec2([_nat_gw("nat-deld", state="deleted")]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_too_young_skipped(self): + """Scenario 5: Available but younger than threshold → SKIP.""" + ec2 = _setup_ec2([_nat_gw("nat-young", age_days=5)]) + assert _run(_make_session(ec2, _cw_zero_traffic()), threshold=14) == [] + + def test_bytes_out_to_destination_nonzero_skipped(self): + """Scenario 6: BytesOutToDestination Sum > 0 → SKIP.""" + ec2 = _setup_ec2([_nat_gw("nat-bytes")]) + cw = MagicMock() + + def _get_stats(**kwargs): + metric = kwargs["MetricName"] + stat = kwargs["Statistics"][0] + if metric == "BytesOutToDestination": + return {"Datapoints": [{"Sum": 100.0}]} + return {"Datapoints": [{stat: 0.0}]} + + cw.get_metric_statistics.side_effect = _get_stats + assert _run(_make_session(ec2, cw)) == [] + + def test_bytes_in_from_source_nonzero_skipped(self): + ec2 = _setup_ec2([_nat_gw("nat-bifs")]) + cw = MagicMock() + + def _get_stats(**kwargs): + metric = 
kwargs["MetricName"] + stat = kwargs["Statistics"][0] + if metric == "BytesInFromSource": + return {"Datapoints": [{"Sum": 1.0}]} + return {"Datapoints": [{stat: 0.0}]} + + cw.get_metric_statistics.side_effect = _get_stats + assert _run(_make_session(ec2, cw)) == [] + + def test_active_connection_count_nonzero_skipped(self): + """Scenario 7: ActiveConnectionCount Maximum > 0 → SKIP.""" + ec2 = _setup_ec2([_nat_gw("nat-acc")]) + cw = _cw_active_connection(value=3.0) + assert _run(_make_session(ec2, cw)) == [] + + def test_missing_create_time_skipped(self): + """Scenario 8a: Missing CreateTime → SKIP.""" + gw = {"NatGatewayId": "nat-noct", "State": "available"} + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_naive_create_time_skipped(self): + """Scenario 8b: Naive (timezone-unaware) CreateTime → SKIP.""" + gw = _nat_gw("nat-naive") + gw["CreateTime"] = datetime.now() # naive, no tzinfo + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_future_create_time_skipped(self): + """Scenario 8c: Future CreateTime → SKIP.""" + gw = _nat_gw("nat-future") + gw["CreateTime"] = datetime.now(timezone.utc) + timedelta(days=10) + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_no_datapoints_any_metric_skipped(self): + """Scenario 9: Any required metric returns no datapoints → SKIP ITEM.""" + ec2 = _setup_ec2([_nat_gw("nat-nodata")]) + cw = _cw_no_datapoints() + assert _run(_make_session(ec2, cw)) == [] + + def test_partial_datapoints_missing_one_metric_skipped(self): + """If one metric has no datapoints but others do → SKIP ITEM.""" + ec2 = _setup_ec2([_nat_gw("nat-partial")]) + cw = MagicMock() + call_count = [0] + + def _get_stats(**kwargs): + call_count[0] += 1 + # First metric returns data; second metric returns nothing + if call_count[0] == 1: + return {"Datapoints": [{"Sum": 0.0}]} + return {"Datapoints": []} + + 
cw.get_metric_statistics.side_effect = _get_stats + assert _run(_make_session(ec2, cw)) == [] + + +# --------------------------------------------------------------------------- +# §15 Must Fail +# --------------------------------------------------------------------------- + + +class TestMustFailRule: + def test_describe_nat_gateways_client_error_raises(self): + """Scenario 10: DescribeNatGateways ClientError → FAIL RULE.""" + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = _client_error("InternalServerError") + with pytest.raises(ClientError): + _run(_make_session(ec2, _cw_zero_traffic())) + + def test_describe_nat_gateways_unauthorized_raises_permission_error(self): + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = _client_error("UnauthorizedOperation") + with pytest.raises(PermissionError): + _run(_make_session(ec2, _cw_zero_traffic())) + + def test_describe_nat_gateways_botocore_error_raises(self): + ec2 = MagicMock() + ec2.get_paginator.return_value.paginate.side_effect = BotoCoreError() + with pytest.raises(BotoCoreError): + _run(_make_session(ec2, _cw_zero_traffic())) + + def test_cloudwatch_client_error_raises(self): + """Scenario 11: CloudWatch metric fetch ClientError → FAIL RULE.""" + ec2 = _setup_ec2([_nat_gw("nat-cwerr")]) + cw = _cw_error("InternalServerError") + with pytest.raises(ClientError): + _run(_make_session(ec2, cw)) + + def test_cloudwatch_botocore_error_raises(self): + ec2 = _setup_ec2([_nat_gw("nat-cwboto")]) + cw = MagicMock() + cw.get_metric_statistics.side_effect = BotoCoreError() + with pytest.raises(BotoCoreError): + _run(_make_session(ec2, cw)) + + def test_cloudwatch_unauthorized_raises_permission_error(self): + ec2 = _setup_ec2([_nat_gw("nat-cwunauth")]) + cw = _cw_error("UnauthorizedOperation") + with pytest.raises(PermissionError): + _run(_make_session(ec2, cw)) + + def test_cloudwatch_throttle_raises_not_low_confidence(self): + """Throttling error must raise (FAIL RULE), NOT 
produce a LOW-confidence finding.""" + ec2 = _setup_ec2([_nat_gw("nat-throttle")]) + cw = _cw_error("Throttling") + with pytest.raises(ClientError): + _run(_make_session(ec2, cw)) + + +# --------------------------------------------------------------------------- +# §15 Must NOT Happen +# --------------------------------------------------------------------------- + + +class TestMustNotHappen: + def test_low_confidence_never_emitted(self): + """LOW confidence finding must never be emitted.""" + ec2 = _setup_ec2([_nat_gw("nat-nolow")]) + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + for f in findings: + assert f.confidence != ConfidenceLevel.LOW + + def test_missing_datapoints_not_treated_as_zero(self): + """Missing datapoints → SKIP ITEM, not zero traffic → no finding.""" + ec2 = _setup_ec2([_nat_gw("nat-nodata2")]) + cw = _cw_no_datapoints() + assert _run(_make_session(ec2, cw)) == [] + + def test_cost_is_none(self): + """estimated_monthly_cost_usd must always be None.""" + ec2 = _setup_ec2([_nat_gw("nat-cost")]) + cw = _cw_zero_traffic() + f = _run(_make_session(ec2, cw))[0] + assert f.estimated_monthly_cost_usd is None + + def test_route_table_absence_not_substitute_for_cloudwatch(self): + """Route-table absence must not compensate for missing CloudWatch evidence.""" + ec2 = _setup_ec2([_nat_gw("nat-rt-subst")]) + ec2.describe_route_tables.return_value = {"RouteTables": []} + cw = _cw_no_datapoints() # No CW data → must skip + assert _run(_make_session(ec2, cw)) == [] + + +# --------------------------------------------------------------------------- +# Normalization contract +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_non_dict_item_skipped(self): + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + paginator.paginate.return_value = [{"NatGateways": ["not-a-dict", None, 42]}] + assert _run(_make_session(ec2, _cw_zero_traffic())) 
== [] + + def test_missing_nat_gateway_id_skipped(self): + gw = {"State": "available", "CreateTime": datetime.now(timezone.utc) - timedelta(days=20)} + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_empty_string_nat_gateway_id_skipped(self): + gw = _nat_gw("nat-x") + gw["NatGatewayId"] = "" + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_missing_state_skipped(self): + gw = { + "NatGatewayId": "nat-nostate", + "CreateTime": datetime.now(timezone.utc) - timedelta(days=20), + } + ec2 = _setup_ec2([gw]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_age_exactly_at_threshold_emits(self): + """age_days == threshold → eligible (>= check).""" + ec2 = _setup_ec2([_nat_gw("nat-exact", age_days=_THRESHOLD)]) + cw = _cw_zero_traffic() + findings = _run(_make_session(ec2, cw)) + assert len(findings) == 1 + + def test_age_one_below_threshold_skipped(self): + ec2 = _setup_ec2([_nat_gw("nat-below", age_days=_THRESHOLD - 1)]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_tags_absent_yields_empty_list(self): + gw = _nat_gw("nat-notag") + gw.pop("Tags", None) + ec2 = _setup_ec2([gw]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["tag_set"] == [] + + def test_tags_list_preserved(self): + tags = [{"Key": "Name", "Value": "my-nat"}] + ec2 = _setup_ec2([_nat_gw("nat-tag", Tags=tags)]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["tag_set"] == tags + + def test_nat_gateway_addresses_absent_yields_empty_list(self): + gw = _nat_gw("nat-noaddr") + gw.pop("NatGatewayAddresses", None) + ec2 = _setup_ec2([gw]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["nat_gateway_addresses"] == [] + + def test_connectivity_type_null_when_absent(self): + gw = _nat_gw("nat-notype") + gw.pop("ConnectivityType", None) + ec2 = _setup_ec2([gw]) + f = _run(_make_session(ec2, 
_cw_zero_traffic()))[0] + assert f.details["connectivity_type"] is None + + +# --------------------------------------------------------------------------- +# Evidence contract (§11) +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def test_required_details_fields_present(self): + ec2 = _setup_ec2([_nat_gw("nat-evid")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + d = f.details + required = [ + "evaluation_path", + "nat_gateway_id", + "normalized_state", + "create_time", + "age_days", + "idle_days_threshold", + "connectivity_type", + "availability_mode", + "vpc_id", + "subnet_id", + "bytes_out_to_destination", + "bytes_in_from_source", + "bytes_in_from_destination", + "bytes_out_to_source", + "active_connection_count_max", + ] + for field in required: + assert field in d, f"Required field '{field}' missing" + + def test_evaluation_path_exact(self): + ec2 = _setup_ec2([_nat_gw("nat-ep")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["evaluation_path"] == "idle-nat-gateway-review-candidate" + + def test_normalized_state_is_available(self): + ec2 = _setup_ec2([_nat_gw("nat-ns")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["normalized_state"] == "available" + + def test_all_metric_values_zero_in_details(self): + ec2 = _setup_ec2([_nat_gw("nat-mv")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + d = f.details + assert d["bytes_out_to_destination"] == 0.0 + assert d["bytes_in_from_source"] == 0.0 + assert d["bytes_in_from_destination"] == 0.0 + assert d["bytes_out_to_source"] == 0.0 + assert d["active_connection_count_max"] == 0.0 + + def test_route_table_referenced_false_in_details(self): + ec2 = _setup_ec2([_nat_gw("nat-rtf")]) + ec2.describe_route_tables.return_value = {"RouteTables": []} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["route_table_referenced"] is False + + def 
test_route_table_referenced_true_in_details(self): + ec2 = _setup_ec2([_nat_gw("nat-rtt")]) + ec2.describe_route_tables.return_value = {"RouteTables": [{"RouteTableId": "rtb-x"}]} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["route_table_referenced"] is True + + def test_route_table_referenced_none_when_check_fails(self): + ec2 = _setup_ec2([_nat_gw("nat-rtn")]) + ec2.describe_route_tables.side_effect = _client_error("AccessDenied") + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.details["route_table_referenced"] is None + + def test_idle_days_threshold_in_details(self): + ec2 = _setup_ec2([_nat_gw("nat-thresh", age_days=30)]) + f = _run(_make_session(ec2, _cw_zero_traffic()), threshold=21)[0] + assert f.details["idle_days_threshold"] == 21 + + def test_active_connection_count_max_in_details(self): + ec2 = _setup_ec2([_nat_gw("nat-acc")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert "active_connection_count_max" in f.details + + +# --------------------------------------------------------------------------- +# Confidence model (§12) +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def test_high_when_no_route_ref(self): + ec2 = _setup_ec2([_nat_gw("nat-ch")]) + ec2.describe_route_tables.return_value = {"RouteTables": []} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.confidence == ConfidenceLevel.HIGH + + def test_medium_when_route_table_referenced(self): + ec2 = _setup_ec2([_nat_gw("nat-cm-rt")]) + ec2.describe_route_tables.return_value = {"RouteTables": [{"RouteTableId": "rtb-x"}]} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.confidence == ConfidenceLevel.MEDIUM + + def test_medium_when_route_table_check_fails(self): + ec2 = _setup_ec2([_nat_gw("nat-cm-fail")]) + ec2.describe_route_tables.side_effect = _client_error("AccessDenied") + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert 
f.confidence == ConfidenceLevel.MEDIUM + + def test_low_confidence_never_emitted(self): + ec2 = _setup_ec2([_nat_gw("nat-nolow")]) + cw = _cw_zero_traffic() + for f in _run(_make_session(ec2, cw)): + assert f.confidence != ConfidenceLevel.LOW + + +# --------------------------------------------------------------------------- +# Cost model (§11.2) +# --------------------------------------------------------------------------- + + +class TestCostModel: + def test_estimated_monthly_cost_always_none(self): + ec2 = _setup_ec2([_nat_gw("nat-cost")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.estimated_monthly_cost_usd is None + + def test_no_hardcoded_cost_in_details(self): + """No dollar-amount cost estimate should appear in details.""" + ec2 = _setup_ec2([_nat_gw("nat-nodetcost")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + detail_str = str(f.details).lower() + assert "$32" not in detail_str + assert "estimated_monthly_cost" not in f.details + + +# --------------------------------------------------------------------------- +# Risk model (§14) +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_risk_is_medium_no_route_ref(self): + ec2 = _setup_ec2([_nat_gw("nat-risk1")]) + ec2.describe_route_tables.return_value = {"RouteTables": []} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.risk == RiskLevel.MEDIUM + + def test_risk_is_medium_with_route_ref(self): + ec2 = _setup_ec2([_nat_gw("nat-risk2")]) + ec2.describe_route_tables.return_value = {"RouteTables": [{"RouteTableId": "rtb-x"}]} + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.risk == RiskLevel.MEDIUM + + def test_risk_never_high(self): + ec2 = _setup_ec2([_nat_gw("nat-nohigh")]) + for f in _run(_make_session(ec2, _cw_zero_traffic())): + assert f.risk != RiskLevel.HIGH + + +# --------------------------------------------------------------------------- +# Title and reason contract (§13) +# 
--------------------------------------------------------------------------- + + +class TestTitleAndReasonContract: + def test_title_exact(self): + ec2 = _setup_ec2([_nat_gw("nat-title")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.title == "Idle NAT Gateway review candidate" + + def test_reason_contains_threshold(self): + ec2 = _setup_ec2([_nat_gw("nat-reason", age_days=30)]) + f = _run(_make_session(ec2, _cw_zero_traffic()), threshold=21)[0] + assert "21" in f.reason + + def test_title_does_not_claim_safe_to_delete(self): + ec2 = _setup_ec2([_nat_gw("nat-safe")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert "delete" not in f.title.lower() + assert "safe" not in f.title.lower() + + +# --------------------------------------------------------------------------- +# Pagination +# --------------------------------------------------------------------------- + + +class TestPagination: + def test_multiple_pages_all_evaluated(self): + ec2 = MagicMock() + paginator = MagicMock() + ec2.get_paginator.return_value = paginator + ec2.describe_route_tables.return_value = {"RouteTables": []} + paginator.paginate.return_value = [ + {"NatGateways": [_nat_gw("nat-p1")]}, + {"NatGateways": [_nat_gw("nat-p2")]}, + {"NatGateways": [_nat_gw("nat-p3", state="deleted")]}, + ] + findings = _run(_make_session(ec2, _cw_zero_traffic())) + ids = {f.resource_id for f in findings} + assert "nat-p1" in ids + assert "nat-p2" in ids + assert "nat-p3" not in ids + + def test_empty_page_yields_no_findings(self): + ec2 = _setup_ec2([]) + assert _run(_make_session(ec2, _cw_zero_traffic())) == [] + + def test_paginator_called_with_correct_operation(self): + ec2 = _setup_ec2([]) + _run(_make_session(ec2, _cw_zero_traffic())) + ec2.get_paginator.assert_called_once_with("describe_nat_gateways") + + +# --------------------------------------------------------------------------- +# Additional correctness +# 
--------------------------------------------------------------------------- + + +class TestCorrectness: + def test_resource_id_matches_nat_gateway_id(self): + ec2 = _setup_ec2([_nat_gw("nat-rid")]) + f = _run(_make_session(ec2, _cw_zero_traffic()))[0] + assert f.resource_id == "nat-rid" + assert f.details["nat_gateway_id"] == "nat-rid" + + def test_rule_id_correct(self): + ec2 = _setup_ec2([_nat_gw("nat-ruleid")]) + assert _run(_make_session(ec2, _cw_zero_traffic()))[0].rule_id == "aws.ec2.nat_gateway.idle" + + def test_provider_is_aws(self): + ec2 = _setup_ec2([_nat_gw("nat-prov")]) + assert _run(_make_session(ec2, _cw_zero_traffic()))[0].provider == "aws" + + def test_active_connection_count_metric_is_checked(self): + """ActiveConnectionCount must be in the required metrics; missing data → SKIP.""" + ec2 = _setup_ec2([_nat_gw("nat-acccheck")]) + cw = MagicMock() + metrics_queried = [] + + def _get_stats(**kwargs): + metrics_queried.append(kwargs["MetricName"]) + return { + "Datapoints": [ + {"Sum": 0.0, "Maximum": 0.0}.get(kwargs["Statistics"][0], 0.0) + and {"Sum": 0.0} + or {kwargs["Statistics"][0]: 0.0} + ] + } + + cw.get_metric_statistics.side_effect = _get_stats + _run(_make_session(ec2, cw)) + assert "ActiveConnectionCount" in metrics_queried + + def test_five_metrics_queried_per_nat_gateway(self): + """Exactly 5 CloudWatch metric calls must be made per evaluated NAT Gateway.""" + ec2 = _setup_ec2([_nat_gw("nat-5m")]) + cw = _cw_zero_traffic() + _run(_make_session(ec2, cw)) + assert cw.get_metric_statistics.call_count == 5 + + def test_connectivity_type_private_emits(self): + ec2 = _setup_ec2([_nat_gw("nat-priv", ConnectivityType="private")]) + findings = _run(_make_session(ec2, _cw_zero_traffic())) + assert len(findings) == 1 + assert findings[0].details["connectivity_type"] == "private" + + def test_multiple_available_old_zero_traffic_nat_gws_all_emit(self): + nat_gws = [_nat_gw(f"nat-multi-{i}", age_days=20) for i in range(3)] + ec2 = _setup_ec2(nat_gws) 
+ findings = _run(_make_session(ec2, _cw_zero_traffic())) + assert len(findings) == 3 diff --git a/tests/cleancloud/providers/aws/test_aws_rds_idle.py b/tests/cleancloud/providers/aws/test_aws_rds_idle.py index 3125316..f4a064a 100644 --- a/tests/cleancloud/providers/aws/test_aws_rds_idle.py +++ b/tests/cleancloud/providers/aws/test_aws_rds_idle.py @@ -1,345 +1,954 @@ +""" +Tests for aws.rds.instance.idle rule. + +Test class overview: + TestMustEmit — canonical detection path + TestMustSkip — all exclusion rules + TestMustFailRule — required API failure behaviour + TestNormalization — _normalize_db_instance field extraction + TestCloudWatchContract — metric name, statistic, period, dimension + TestEvidenceContract — signals_used, signals_not_checked, evaluation_path + TestConfidenceModel — always MEDIUM + TestCostModel — estimated_monthly_cost_usd always None + TestRiskModel — always MEDIUM + TestTitleAndReasonContract — exact title and evaluation_path + TestPagination — multi-page exhaustion + TestStandaloneScope — all three scope exclusion fields +""" + from datetime import datetime, timedelta, timezone from unittest.mock import MagicMock -from cleancloud.providers.aws.rules.rds_idle import find_idle_rds_instances +import pytest +from botocore.exceptions import BotoCoreError, ClientError +from cleancloud.providers.aws.rules.rds_idle import ( + _normalize_db_instance, + find_idle_rds_instances, +) -def _make_rds_paginator(mock_boto3_session, instances): - rds = mock_boto3_session._rds - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [{"DBInstances": instances}] - return rds +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- +_REGION = "us-east-1" -def _make_cw_side_effect(responses_by_db_and_metric): - """Build a side_effect that routes by (db_id, metric_name).""" - def side_effect(**kwargs): - db_id = 
kwargs["Dimensions"][0]["Value"] - metric = kwargs["MetricName"] - return responses_by_db_and_metric.get( - (db_id, metric), - responses_by_db_and_metric.get(db_id, {"Datapoints": []}), - ) +def _now() -> datetime: + return datetime.now(timezone.utc) - return side_effect +def _old() -> datetime: + """30 days ago — always older than the default 14-day threshold.""" + return datetime.now(timezone.utc) - timedelta(days=30) -def test_find_idle_rds_instances(mock_boto3_session): - region = "us-east-1" - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - recent_date = now - timedelta(days=5) - rds = _make_rds_paginator( - mock_boto3_session, - [ - # Idle instance (30 days old, no connections) — should be flagged - { - "DBInstanceIdentifier": "idle-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.t3.medium", - "Engine": "mysql", - "EngineVersion": "8.0.35", - "MultiAZ": False, - "AllocatedStorage": 100, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [{"Key": "env", "Value": "dev"}], - }, - # Active instance (has connections) — should NOT be flagged - { - "DBInstanceIdentifier": "active-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.r5.large", - "Engine": "postgres", - "EngineVersion": "15.4", - "MultiAZ": True, - "AllocatedStorage": 200, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, - # Young instance (5 days old) — should NOT be flagged - { - "DBInstanceIdentifier": "young-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": recent_date, - "DBInstanceClass": "db.t3.micro", - "Engine": "mysql", - "EngineVersion": "8.0.35", - "MultiAZ": False, - "AllocatedStorage": 20, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, - # Read replica — should NOT be flagged - { - "DBInstanceIdentifier": 
"replica-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.t3.medium", - "Engine": "mysql", - "EngineVersion": "8.0.35", - "MultiAZ": False, - "AllocatedStorage": 100, - "ReadReplicaSourceDBInstanceIdentifier": "source-db", - "DBClusterIdentifier": None, - "TagList": [], - }, - # Aurora cluster member — should NOT be flagged - { - "DBInstanceIdentifier": "aurora-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.r5.large", - "Engine": "aurora-mysql", - "EngineVersion": "8.0.mysql_aurora.3.04.0", - "MultiAZ": False, - "AllocatedStorage": 0, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": "my-aurora-cluster", - "TagList": [], - }, - ], - ) - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.side_effect = _make_cw_side_effect( - { - # idle-db: zero connections + low CPU + zero IO — all three signals agree - ("idle-db", "DatabaseConnections"): {"Datapoints": [{"Sum": 0}]}, - ("idle-db", "CPUUtilization"): {"Datapoints": [{"Maximum": 2.0}]}, - ("idle-db", "ReadIOPS"): {"Datapoints": [{"Sum": 0}]}, - ("idle-db", "WriteIOPS"): {"Datapoints": [{"Sum": 0}]}, - # active-db: has connections - ("active-db", "DatabaseConnections"): {"Datapoints": [{"Sum": 500}]}, - } - ) - - findings = find_idle_rds_instances(mock_boto3_session, region) - db_ids = {f.resource_id for f in findings} - - assert "idle-db" in db_ids - assert "active-db" not in db_ids - assert "young-db" not in db_ids - assert "replica-db" not in db_ids - assert "aurora-db" not in db_ids - - assert len(findings) == 1 - finding = findings[0] - assert finding.provider == "aws" - assert finding.rule_id == "aws.rds.instance.idle" - assert finding.resource_type == "aws.rds.instance" - assert finding.confidence.value == "medium" # three-signal: connections + CPU 
+ IO - assert finding.risk.value == "high" - assert finding.details["engine"] == "mysql 8.0.35" - assert finding.details["instance_class"] == "db.t3.medium" - assert finding.details["connections_14d"] == 0 - assert finding.details["allocated_storage_gb"] == 100 - assert "~$49/month" in finding.details["estimated_compute_cost"] - assert finding.estimated_monthly_cost_usd is not None - assert finding.estimated_monthly_cost_usd > 0 - assert finding.details["tags"] == {"env": "dev"} - assert "cluster_id" not in finding.details - assert "peak_cpu_pct" in finding.details - - -def test_find_idle_rds_instances_empty(mock_boto3_session): - region = "us-east-1" - rds = mock_boto3_session._rds - - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [{"DBInstances": []}] - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - - findings = find_idle_rds_instances(mock_boto3_session, region) - assert findings == [] - - -def test_find_idle_rds_instances_custom_threshold(mock_boto3_session): - region = "us-east-1" - rds = mock_boto3_session._rds - now = datetime.now(timezone.utc) - creation_date = now - timedelta(days=20) - - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [ - { - "DBInstances": [ - { - "DBInstanceIdentifier": "test-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": creation_date, - "DBInstanceClass": "db.t3.small", - "Engine": "postgres", - "EngineVersion": "15.4", - "MultiAZ": False, - "AllocatedStorage": 50, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, - ] - } - ] - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.side_effect = _make_cw_side_effect( - { - ("test-db", "DatabaseConnections"): 
{"Datapoints": [{"Sum": 0}]}, - ("test-db", "CPUUtilization"): {"Datapoints": [{"Maximum": 1.0}]}, - } - ) - - # With 30-day threshold, should NOT be flagged (only 20 days old) - findings_30 = find_idle_rds_instances(mock_boto3_session, region, idle_days=30) - assert len(findings_30) == 0 - - # With 14-day threshold, should be flagged (20 > 14) - findings_14 = find_idle_rds_instances(mock_boto3_session, region, idle_days=14) - assert len(findings_14) == 1 - assert findings_14[0].resource_id == "test-db" - - -def test_find_idle_rds_instances_with_connections(mock_boto3_session): - """RDS instance with connections should not be flagged.""" - region = "us-east-1" - rds = mock_boto3_session._rds - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [ - { - "DBInstances": [ - { - "DBInstanceIdentifier": "active-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.r5.large", - "Engine": "postgres", - "EngineVersion": "15.4", - "MultiAZ": True, - "AllocatedStorage": 200, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, - ] - } - ] - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.return_value = { - "Datapoints": [{"Sum": 150}, {"Sum": 200}] +def _young() -> datetime: + """5 days ago — always younger than the default 14-day threshold.""" + return datetime.now(timezone.utc) - timedelta(days=5) + + +def _client_error(code: str) -> ClientError: + return ClientError({"Error": {"Code": code, "Message": code}}, "op") + + +def _botocore_error() -> BotoCoreError: + return BotoCoreError() + + +def _make_instance(**overrides) -> dict: + """Return a minimal valid DescribeDBInstances item.""" + base = { + "DBInstanceIdentifier": "test-db", + 
"DBInstanceStatus": "available", + "InstanceCreateTime": _old(), + "Engine": "mysql", + "EngineVersion": "8.0.35", + "DBInstanceClass": "db.t3.medium", + "MultiAZ": False, + "AllocatedStorage": 100, + "StorageType": "gp2", + "DBClusterIdentifier": None, + "ReadReplicaSourceDBInstanceIdentifier": None, + "ReadReplicaSourceDBClusterIdentifier": None, + "TagList": [], } + base.update(overrides) + return base - findings = find_idle_rds_instances(mock_boto3_session, region) - assert findings == [] - - -def test_find_idle_rds_no_datapoints_skipped(mock_boto3_session): - """Instance where CW returns zero datapoints should be skipped (no metric visibility).""" - region = "us-east-1" - rds = mock_boto3_session._rds - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [ - { - "DBInstances": [ - { - "DBInstanceIdentifier": "no-data-db", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.t3.medium", - "Engine": "mysql", - "EngineVersion": "8.0.35", - "MultiAZ": False, - "AllocatedStorage": 100, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, + +def _zero_connections_response() -> dict: + """CloudWatch response: datapoints present, all Maximum == 0.""" + return {"Datapoints": [{"Maximum": 0.0}]} + + +def _nonzero_connections_response(val: float = 5.0) -> dict: + return {"Datapoints": [{"Maximum": val}]} + + +def _no_datapoints_response() -> dict: + return {"Datapoints": []} + + +def _setup( + mock_boto3_session, + instances: list, + cw_response=None, + cw_side_effect=None, +): + """Wire up RDS paginator and CloudWatch mock, return (rds, cloudwatch).""" + rds = MagicMock() + paginator = MagicMock() + paginator.paginate.return_value = [{"DBInstances": instances}] + rds.get_paginator.return_value = paginator + + cloudwatch = MagicMock() + if cw_side_effect is not None: + 
cloudwatch.get_metric_statistics.side_effect = cw_side_effect + elif cw_response is not None: + cloudwatch.get_metric_statistics.return_value = cw_response + + def client_side_effect(service, **kwargs): + if service == "rds": + return rds + if service == "cloudwatch": + return cloudwatch + raise ValueError(f"Unexpected service: {service}") + + mock_boto3_session.client.side_effect = client_side_effect + return rds, cloudwatch + + +# --------------------------------------------------------------------------- +# TestMustEmit +# --------------------------------------------------------------------------- + + +class TestMustEmit: + def test_standalone_available_old_zero_connections_emits(self, mock_boto3_session): + """Canonical path: standalone, available, old enough, zero DatabaseConnections.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + f = findings[0] + assert f.provider == "aws" + assert f.rule_id == "aws.rds.instance.idle" + assert f.resource_type == "aws.rds.instance" + assert f.resource_id == "test-db" + assert f.region == _REGION + + def test_multiple_datapoints_all_zero_emits(self, mock_boto3_session): + """Multiple datapoints all Maximum == 0 → EMIT.""" + resp = { + "Datapoints": [ + {"Maximum": 0.0}, + {"Maximum": 0.0}, + {"Maximum": 0.0}, ] } - ] - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - # No datapoints at all — metric has no visibility - cloudwatch_mock.get_metric_statistics.return_value = {"Datapoints": []} - - findings = find_idle_rds_instances(mock_boto3_session, region) - # Zero datapoints → LOW-confidence "requires verification" finding (not silently skipped) - assert len(findings) == 1 - assert findings[0].confidence.value == "low" - assert findings[0].risk.value == "medium" - assert 
"Requires" in findings[0].title or "Verification" in findings[0].title - assert findings[0].details.get("connections_datapoints") == 0 - - -def test_find_idle_rds_low_confidence_without_cpu(mock_boto3_session): - """Instance with zero connections but no CPU data should be LOW confidence.""" - region = "us-east-1" - rds = mock_boto3_session._rds - now = datetime.now(timezone.utc) - old_date = now - timedelta(days=30) - - paginator = rds.get_paginator.return_value - paginator.paginate.return_value = [ - { - "DBInstances": [ - { - "DBInstanceIdentifier": "idle-no-cpu", - "DBInstanceStatus": "available", - "InstanceCreateTime": old_date, - "DBInstanceClass": "db.t3.medium", - "Engine": "mysql", - "EngineVersion": "8.0.35", - "MultiAZ": False, - "AllocatedStorage": 100, - "ReadReplicaSourceDBInstanceIdentifier": None, - "DBClusterIdentifier": None, - "TagList": [], - }, + _setup(mock_boto3_session, [_make_instance()], cw_response=resp) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + + def test_details_database_connections_max_zero(self, mock_boto3_session): + """Emitted finding must include database_connections_max == 0.0.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].details["database_connections_max"] == 0.0 + + def test_details_required_fields_present(self, mock_boto3_session): + """All required details fields from spec §11.1 must be present.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + d = findings[0].details + for key in ( + "evaluation_path", + "db_instance_id", + "normalized_status", + "instance_create_time", + "age_days", + "idle_days_threshold", + "engine", + "engine_version", + "db_instance_class", + "database_connections_max", + ): + assert 
key in d, f"Missing required detail key: {key}" + + def test_optional_context_fields_present(self, mock_boto3_session): + """Optional context fields from spec §11.1 must be present in details.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + d = findings[0].details + for key in ( + "db_cluster_identifier", + "read_replica_source_db_instance_identifier", + "read_replica_source_db_cluster_identifier", + "multi_az", + "allocated_storage_gib", + "storage_type", + "tag_set", + ): + assert key in d, f"Missing optional context detail: {key}" + + +# --------------------------------------------------------------------------- +# TestMustSkip +# --------------------------------------------------------------------------- + + +class TestMustSkip: + def test_non_available_status_skipped(self, mock_boto3_session): + for status in ("stopped", "stopping", "creating", "modifying", "backing-up"): + _setup( + mock_boto3_session, + [_make_instance(DBInstanceStatus=status)], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [], f"Expected skip for status={status}" + + def test_cluster_member_skipped(self, mock_boto3_session): + """DB cluster member (DBClusterIdentifier present) → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance(DBClusterIdentifier="my-cluster")], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_read_replica_db_instance_source_skipped(self, mock_boto3_session): + """ReadReplicaSourceDBInstanceIdentifier present → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance(ReadReplicaSourceDBInstanceIdentifier="source-db")], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert 
findings == [] + + def test_read_replica_db_cluster_source_skipped(self, mock_boto3_session): + """ReadReplicaSourceDBClusterIdentifier present → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance(ReadReplicaSourceDBClusterIdentifier="source-cluster")], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_too_young_skipped(self, mock_boto3_session): + """Instance younger than idle_days_threshold → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance(InstanceCreateTime=_young())], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_exactly_at_threshold_emits(self, mock_boto3_session): + """age_days == idle_days_threshold satisfies >= check → EMIT.""" + # Use 14 days + 1 hour to ensure floor(total_seconds / 86400) == 14 + at_threshold = datetime.now(timezone.utc) - timedelta(days=14, hours=1) + _setup( + mock_boto3_session, + [_make_instance(InstanceCreateTime=at_threshold)], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + + def test_no_datapoints_skipped(self, mock_boto3_session): + """DatabaseConnections returns no datapoints → SKIP ITEM (not LOW finding).""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_no_datapoints_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_nonzero_connections_skipped(self, mock_boto3_session): + """DatabaseConnections Maximum > 0 → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_nonzero_connections_response(val=1.0), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_any_nonzero_datapoint_skipped(self, mock_boto3_session): + """Multiple datapoints 
— one is nonzero → SKIP ITEM.""" + resp = { + "Datapoints": [ + {"Maximum": 0.0}, + {"Maximum": 3.0}, ] } - ] - - cloudwatch_mock = MagicMock() - mock_boto3_session.client.side_effect = lambda service, **kwargs: ( - rds if service == "rds" else cloudwatch_mock - ) - cloudwatch_mock.get_metric_statistics.side_effect = _make_cw_side_effect( - { - # Connections: zero (has datapoints) - ("idle-no-cpu", "DatabaseConnections"): {"Datapoints": [{"Sum": 0}]}, - # CPU: no data available - ("idle-no-cpu", "CPUUtilization"): {"Datapoints": []}, - } - ) + _setup(mock_boto3_session, [_make_instance()], cw_response=resp) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_missing_db_instance_identifier_skipped(self, mock_boto3_session): + """Missing DBInstanceIdentifier → SKIP ITEM.""" + item = _make_instance() + del item["DBInstanceIdentifier"] + _setup(mock_boto3_session, [item], cw_response=_zero_connections_response()) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_missing_status_skipped(self, mock_boto3_session): + """Missing DBInstanceStatus → SKIP ITEM.""" + item = _make_instance() + del item["DBInstanceStatus"] + _setup(mock_boto3_session, [item], cw_response=_zero_connections_response()) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_missing_create_time_skipped(self, mock_boto3_session): + """Missing InstanceCreateTime → SKIP ITEM.""" + item = _make_instance() + del item["InstanceCreateTime"] + _setup(mock_boto3_session, [item], cw_response=_zero_connections_response()) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_naive_create_time_skipped(self, mock_boto3_session): + """Naive InstanceCreateTime (no tzinfo) → SKIP ITEM.""" + naive = _old().replace(tzinfo=None) + _setup( + mock_boto3_session, + [_make_instance(InstanceCreateTime=naive)], + 
cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_future_create_time_skipped(self, mock_boto3_session): + """InstanceCreateTime in the future → SKIP ITEM.""" + future = datetime.now(timezone.utc) + timedelta(days=1) + _setup( + mock_boto3_session, + [_make_instance(InstanceCreateTime=future)], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_non_dict_item_skipped(self, mock_boto3_session): + """Non-dict items in DBInstances list → silently skipped.""" + _setup( + mock_boto3_session, + [None, "string", 42, _make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + + def test_empty_db_instances_no_findings(self, mock_boto3_session): + _setup(mock_boto3_session, [], cw_response=_zero_connections_response()) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + +# --------------------------------------------------------------------------- +# TestMustFailRule +# --------------------------------------------------------------------------- + + +class TestMustFailRule: + def test_describe_db_instances_access_denied_raises_permission_error(self, mock_boto3_session): + rds = MagicMock() + rds.get_paginator.side_effect = _client_error("AccessDenied") + cloudwatch = MagicMock() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + with pytest.raises(PermissionError, match="rds:DescribeDBInstances"): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_describe_db_instances_unauthorized_raises_permission_error(self, mock_boto3_session): + rds = MagicMock() + rds.get_paginator.side_effect = 
_client_error("UnauthorizedOperation") + cloudwatch = MagicMock() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + with pytest.raises(PermissionError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_describe_db_instances_other_client_error_propagates(self, mock_boto3_session): + rds = MagicMock() + rds.get_paginator.side_effect = _client_error("InternalServerError") + cloudwatch = MagicMock() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + with pytest.raises(ClientError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_describe_db_instances_botocore_error_propagates(self, mock_boto3_session): + rds = MagicMock() + rds.get_paginator.side_effect = _botocore_error() + cloudwatch = MagicMock() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + with pytest.raises(BotoCoreError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_cloudwatch_access_denied_raises_permission_error(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_side_effect=_client_error("AccessDenied"), + ) + with pytest.raises(PermissionError, match="cloudwatch:GetMetricStatistics"): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_cloudwatch_unauthorized_raises_permission_error(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_side_effect=_client_error("UnauthorizedOperation"), + ) + with pytest.raises(PermissionError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_cloudwatch_throttle_raises_not_skipped(self, mock_boto3_session): + """Throttling is a required-call failure → FAIL RULE, not SKIP or LOW 
confidence.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_side_effect=_client_error("ThrottlingException"), + ) + with pytest.raises(ClientError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_cloudwatch_internal_error_raises(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_side_effect=_client_error("InternalServerError"), + ) + with pytest.raises(ClientError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + def test_cloudwatch_botocore_error_raises(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_side_effect=_botocore_error(), + ) + with pytest.raises(BotoCoreError): + find_idle_rds_instances(mock_boto3_session, _REGION) + + +# --------------------------------------------------------------------------- +# TestNormalization +# --------------------------------------------------------------------------- + + +class TestNormalization: + def test_string_fields_extracted(self): + now = _now() + n = _normalize_db_instance(_make_instance(), now) + assert n is not None + assert n["db_instance_id"] == "test-db" + assert n["normalized_status"] == "available" + assert n["engine"] == "mysql" + assert n["engine_version"] == "8.0.35" + assert n["db_instance_class"] == "db.t3.medium" + assert n["storage_type"] == "gp2" + + def test_age_days_computed(self): + now = _now() + create_time = now - timedelta(days=20) + n = _normalize_db_instance(_make_instance(InstanceCreateTime=create_time), now) + assert n is not None + assert n["age_days"] == 20 + + def test_multi_az_bool_preserved(self): + now = _now() + n = _normalize_db_instance(_make_instance(MultiAZ=True), now) + assert n["multi_az"] is True + + def test_multi_az_non_bool_nulled(self): + now = _now() + n = _normalize_db_instance(_make_instance(MultiAZ="yes"), now) + assert n["multi_az"] is None + + def test_allocated_storage_int_preserved(self): + now = _now() + n = 
_normalize_db_instance(_make_instance(AllocatedStorage=200), now) + assert n["allocated_storage_gib"] == 200 + + def test_allocated_storage_non_int_nulled(self): + now = _now() + n = _normalize_db_instance(_make_instance(AllocatedStorage="200"), now) + assert n["allocated_storage_gib"] is None + + def test_tag_list_preserved(self): + now = _now() + tags = [{"Key": "env", "Value": "dev"}] + n = _normalize_db_instance(_make_instance(TagList=tags), now) + assert n["tag_set"] == tags + + def test_tag_list_absent_defaults_to_empty_list(self): + now = _now() + item = _make_instance() + del item["TagList"] + n = _normalize_db_instance(item, now) + assert n["tag_set"] == [] + + def test_scope_fields_null_when_absent(self): + now = _now() + n = _normalize_db_instance(_make_instance(), now) + assert n["db_cluster_identifier"] is None + assert n["read_replica_source_db_instance_identifier"] is None + assert n["read_replica_source_db_cluster_identifier"] is None + + def test_scope_fields_extracted_when_present(self): + now = _now() + n = _normalize_db_instance( + _make_instance( + DBClusterIdentifier="my-cluster", + ReadReplicaSourceDBInstanceIdentifier="source-db", + ReadReplicaSourceDBClusterIdentifier="source-cluster", + ), + now, + ) + assert n["db_cluster_identifier"] == "my-cluster" + assert n["read_replica_source_db_instance_identifier"] == "source-db" + assert n["read_replica_source_db_cluster_identifier"] == "source-cluster" + + def test_empty_string_db_instance_id_returns_none(self): + now = _now() + result = _normalize_db_instance(_make_instance(DBInstanceIdentifier=""), now) + assert result is None + + def test_non_dict_returns_none(self): + now = _now() + assert _normalize_db_instance(None, now) is None + assert _normalize_db_instance("string", now) is None + assert _normalize_db_instance(42, now) is None + + def test_naive_create_time_returns_none(self): + now = _now() + naive = _old().replace(tzinfo=None) + result = 
_normalize_db_instance(_make_instance(InstanceCreateTime=naive), now) + assert result is None + + def test_future_create_time_returns_none(self): + now = _now() + future = now + timedelta(days=1) + result = _normalize_db_instance(_make_instance(InstanceCreateTime=future), now) + assert result is None + + def test_non_datetime_create_time_returns_none(self): + now = _now() + result = _normalize_db_instance( + _make_instance(InstanceCreateTime="2025-01-01T00:00:00Z"), now + ) + assert result is None + + def test_resource_id_equals_db_instance_id(self): + n = _normalize_db_instance(_make_instance(), _now()) + assert n["resource_id"] == n["db_instance_id"] == "test-db" + + +# --------------------------------------------------------------------------- +# TestCloudWatchContract +# --------------------------------------------------------------------------- + + +class TestCloudWatchContract: + def test_database_connections_maximum_statistic_used(self, mock_boto3_session): + """Spec requires DatabaseConnections with Maximum statistic — not Sum.""" + _, cw = _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + find_idle_rds_instances(mock_boto3_session, _REGION) + call_kwargs = cw.get_metric_statistics.call_args[1] + assert call_kwargs["MetricName"] == "DatabaseConnections" + assert call_kwargs["Statistics"] == ["Maximum"] + + def test_correct_namespace(self, mock_boto3_session): + _, cw = _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + find_idle_rds_instances(mock_boto3_session, _REGION) + assert cw.get_metric_statistics.call_args[1]["Namespace"] == "AWS/RDS" + + def test_correct_dimension(self, mock_boto3_session): + _, cw = _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + find_idle_rds_instances(mock_boto3_session, _REGION) + dims = cw.get_metric_statistics.call_args[1]["Dimensions"] + assert dims == [{"Name": 
"DBInstanceIdentifier", "Value": "test-db"}] + + def test_period_is_idle_days_times_86400(self, mock_boto3_session): + """Period = idle_days_threshold * 86400 satisfies all CW retention constraints.""" + _, cw = _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + find_idle_rds_instances(mock_boto3_session, _REGION, idle_days_threshold=14) + assert cw.get_metric_statistics.call_args[1]["Period"] == 14 * 86400 + + def test_exactly_one_metric_queried(self, mock_boto3_session): + """Only DatabaseConnections — no CPU or I/O metrics.""" + _, cw = _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + find_idle_rds_instances(mock_boto3_session, _REGION) + assert cw.get_metric_statistics.call_count == 1 + + def test_missing_datapoints_not_treated_as_zero(self, mock_boto3_session): + """Empty datapoints list must not be interpreted as zero connections.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_no_datapoints_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_no_cpu_or_io_gates(self, mock_boto3_session): + """CPU and storage I/O are not eligibility gates — zero connections alone emits.""" + call_metrics = [] + + def cw_side_effect(**kwargs): + call_metrics.append(kwargs["MetricName"]) + return _zero_connections_response() + + _setup(mock_boto3_session, [_make_instance()], cw_side_effect=cw_side_effect) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + assert "CPUUtilization" not in call_metrics + assert "ReadIOPS" not in call_metrics + assert "WriteIOPS" not in call_metrics + + +# --------------------------------------------------------------------------- +# TestEvidenceContract +# --------------------------------------------------------------------------- + + +class TestEvidenceContract: + def _get_finding(self, mock_boto3_session): + 
_setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + return findings[0] + + def test_evaluation_path(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + assert f.details["evaluation_path"] == "idle-rds-instance-review-candidate" + + def test_signals_used_not_empty(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + assert len(f.evidence.signals_used) >= 1 + + def test_signals_used_mentions_available_status(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + combined = " ".join(f.evidence.signals_used) + assert "available" in combined + + def test_signals_used_mentions_standalone(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + combined = " ".join(f.evidence.signals_used) + assert "standalone" in combined.lower() or "read replica" in combined.lower() + + def test_signals_used_mentions_database_connections(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + combined = " ".join(f.evidence.signals_used) + assert "DatabaseConnections" in combined + + def test_signals_not_checked_mentions_proxy_layers(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + combined = " ".join(f.evidence.signals_not_checked) + assert any(term in combined for term in ("RDS Proxy", "PgBouncer", "connection pool")) + + def test_time_window_matches_threshold(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION, idle_days_threshold=30) + assert "30" in findings[0].evidence.time_window + + def test_normalized_status_in_details(self, mock_boto3_session): + f = self._get_finding(mock_boto3_session) + assert f.details["normalized_status"] == "available" + + +# 
--------------------------------------------------------------------------- +# TestConfidenceModel +# --------------------------------------------------------------------------- + + +class TestConfidenceModel: + def test_always_medium_confidence(self, mock_boto3_session): + """Spec §12: MEDIUM confidence when datapoints present and all zero.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].confidence.value == "medium" + + def test_no_low_confidence_path(self, mock_boto3_session): + """There is no LOW-confidence finding path — missing datapoints → SKIP ITEM.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_no_datapoints_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + # Must be empty — not a LOW-confidence finding + assert findings == [] + + +# --------------------------------------------------------------------------- +# TestCostModel +# --------------------------------------------------------------------------- + + +class TestCostModel: + def test_estimated_monthly_cost_usd_is_none(self, mock_boto3_session): + """Spec §7: no hardcoded cost estimates → estimated_monthly_cost_usd = None.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].estimated_monthly_cost_usd is None + + def test_no_cost_fields_in_details(self, mock_boto3_session): + """No compute/storage cost fields should appear in details.""" + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + d = findings[0].details + for key in d: + assert "cost" not in key.lower(), f"Unexpected cost field: {key}" + + +# 
--------------------------------------------------------------------------- +# TestRiskModel +# --------------------------------------------------------------------------- + + +class TestRiskModel: + def test_always_medium_risk(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].risk.value == "medium" + + +# --------------------------------------------------------------------------- +# TestTitleAndReasonContract +# --------------------------------------------------------------------------- + + +class TestTitleAndReasonContract: + def test_title_exact(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].title == "Idle RDS instance review candidate" + + def test_rule_id_exact(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].rule_id == "aws.rds.instance.idle" + + def test_resource_type_exact(self, mock_boto3_session): + _setup( + mock_boto3_session, + [_make_instance()], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings[0].resource_type == "aws.rds.instance" + + +# --------------------------------------------------------------------------- +# TestPagination +# --------------------------------------------------------------------------- + + +class TestPagination: + def test_multiple_pages_all_processed(self, mock_boto3_session): + """All pages must be exhausted — findings from both pages are emitted.""" + rds = MagicMock() + paginator = MagicMock() + paginator.paginate.return_value = [ + {"DBInstances": 
[_make_instance(DBInstanceIdentifier="db-1")]}, + {"DBInstances": [_make_instance(DBInstanceIdentifier="db-2")]}, + {"DBInstances": [_make_instance(DBInstanceIdentifier="db-3")]}, + ] + rds.get_paginator.return_value = paginator + + cloudwatch = MagicMock() + cloudwatch.get_metric_statistics.return_value = _zero_connections_response() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + ids = {f.resource_id for f in findings} + assert ids == {"db-1", "db-2", "db-3"} + + def test_mixed_instances_across_pages(self, mock_boto3_session): + """Active and idle instances can be mixed across pages.""" + rds = MagicMock() + paginator = MagicMock() + paginator.paginate.return_value = [ + {"DBInstances": [_make_instance(DBInstanceIdentifier="idle-db")]}, + { + "DBInstances": [ + _make_instance( + DBInstanceIdentifier="active-db", + DBInstanceStatus="stopped", + ) + ] + }, + ] + rds.get_paginator.return_value = paginator + + cloudwatch = MagicMock() + cloudwatch.get_metric_statistics.return_value = _zero_connections_response() + + def client_side_effect(service, **kwargs): + return rds if service == "rds" else cloudwatch + + mock_boto3_session.client.side_effect = client_side_effect + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + assert findings[0].resource_id == "idle-db" + + +# --------------------------------------------------------------------------- +# TestStandaloneScope +# --------------------------------------------------------------------------- + + +class TestStandaloneScope: + def test_all_three_scope_fields_independently_exclude(self, mock_boto3_session): + """Each standalone-scope exclusion field independently causes SKIP ITEM.""" + scope_cases = [ + {"DBClusterIdentifier": "cluster-a"}, + {"ReadReplicaSourceDBInstanceIdentifier": 
"source-instance"}, + {"ReadReplicaSourceDBClusterIdentifier": "source-cluster"}, + ] + for overrides in scope_cases: + _setup( + mock_boto3_session, + [_make_instance(**overrides)], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [], f"Expected skip for scope overrides={overrides}" + + def test_standalone_with_none_scope_fields_emits(self, mock_boto3_session): + """Explicit None values for all scope fields → standalone → EMIT.""" + _setup( + mock_boto3_session, + [ + _make_instance( + DBClusterIdentifier=None, + ReadReplicaSourceDBInstanceIdentifier=None, + ReadReplicaSourceDBClusterIdentifier=None, + ) + ], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert len(findings) == 1 + + def test_read_replica_source_cluster_identifier_field_checked(self, mock_boto3_session): + """ReadReplicaSourceDBClusterIdentifier is the third scope field — must be checked.""" + _setup( + mock_boto3_session, + [_make_instance(ReadReplicaSourceDBClusterIdentifier="my-source-cluster")], + cw_response=_zero_connections_response(), + ) + findings = find_idle_rds_instances(mock_boto3_session, _REGION) + assert findings == [] + + def test_custom_threshold(self, mock_boto3_session): + """idle_days_threshold parameter controls age gate correctly.""" + create_time = datetime.now(timezone.utc) - timedelta(days=20) + _setup( + mock_boto3_session, + [_make_instance(InstanceCreateTime=create_time)], + cw_response=_zero_connections_response(), + ) + # 30-day threshold: 20 days < 30 → SKIP + findings_30 = find_idle_rds_instances(mock_boto3_session, _REGION, idle_days_threshold=30) + assert findings_30 == [] - findings = find_idle_rds_instances(mock_boto3_session, region) - assert len(findings) == 1 - assert findings[0].confidence.value == "low" + # 14-day threshold: 20 days >= 14 → EMIT + findings_14 = 
find_idle_rds_instances(mock_boto3_session, _REGION, idle_days_threshold=14) + assert len(findings_14) == 1