diff --git a/cmd/gpuaudit/main.go b/cmd/gpuaudit/main.go
index ce8d61e..84d2f8e 100644
--- a/cmd/gpuaudit/main.go
+++ b/cmd/gpuaudit/main.go
@@ -222,6 +222,7 @@ var iamPolicyCmd = &cobra.Command{
 					"ec2:DescribeInstances",
 					"ec2:DescribeInstanceTypes",
 					"ec2:DescribeRegions",
+					"ec2:DescribeSpotPriceHistory",
 				},
 				"Resource": "*",
 			},
diff --git a/internal/analysis/rules.go b/internal/analysis/rules.go
index f91bcbe..e86c8c6 100644
--- a/internal/analysis/rules.go
+++ b/internal/analysis/rules.go
@@ -28,6 +28,7 @@ func analyzeInstance(inst *models.GPUInstance) {
 		ruleSageMakerLowUtil,
 		ruleSageMakerOversized,
 		ruleK8sUnallocatedGPU,
+		ruleSpotEligible,
 	}
 	for _, rule := range rules {
 		rule(inst)
@@ -347,3 +348,55 @@ func ruleK8sUnallocatedGPU(inst *models.GPUInstance) {
 		})
 	}
 }
+
+// Rule 8: On-demand instance eligible for Spot pricing.
+//
+// Fires only for on-demand instances with at least a day of uptime, a positive
+// on-demand hourly cost, and a known spot price that is strictly cheaper. Adds
+// one info-level waste signal and one change-pricing recommendation to inst.
+func ruleSpotEligible(inst *models.GPUInstance) {
+	const (
+		minUptimeHours = 24   // younger instances are skipped
+		hoursPerMonth  = 730  // projects an hourly price to a monthly cost
+		baseConfidence = 0.35 // confidence before the savings bonus
+		savingsPerUnit = 120  // savings percent that would add 1.0 confidence
+		maxConfidence  = 0.95 // reported confidence never exceeds this
+	)
+
+	if inst.PricingModel != "on-demand" || inst.UptimeHours < minUptimeHours {
+		return
+	}
+	if inst.SpotHourlyCost == nil || inst.HourlyCost <= 0 {
+		return
+	}
+
+	spotHourly := *inst.SpotHourlyCost
+	hourlySavings := inst.HourlyCost - spotHourly
+	savingsPercent := (hourlySavings / inst.HourlyCost) * 100
+	if savingsPercent <= 0 {
+		return
+	}
+
+	// Higher savings → higher confidence, capped at maxConfidence.
+	confidence := baseConfidence + (savingsPercent / savingsPerUnit)
+	if confidence > maxConfidence {
+		confidence = maxConfidence
+	}
+
+	inst.WasteSignals = append(inst.WasteSignals, models.WasteSignal{
+		Type:       "spot_eligible",
+		Severity:   models.SeverityInfo,
+		Confidence: confidence,
+		Evidence:   fmt.Sprintf("Spot pricing available at $%.3f/hr vs $%.3f/hr on-demand (%.0f%% savings).", spotHourly, inst.HourlyCost, savingsPercent),
+	})
+	// The description quotes the spot price at the same precision as the evidence.
+	inst.Recommendations = append(inst.Recommendations, models.Recommendation{
+		Action:                 models.ActionChangePricing,
+		Description:            fmt.Sprintf("Spot pricing available at $%.3f/hr (%.0f%% savings). Spot instances may be interrupted — suitable for fault-tolerant workloads.", spotHourly, savingsPercent),
+		CurrentMonthlyCost:     inst.MonthlyCost,
+		RecommendedMonthlyCost: spotHourly * hoursPerMonth,
+		MonthlySavings:         hourlySavings * hoursPerMonth,
+		SavingsPercent:         savingsPercent,
+		Risk:                   models.RiskHigh,
+	})
+}
diff --git a/internal/analysis/rules_test.go b/internal/analysis/rules_test.go
index d8d264d..86970ea 100644
--- a/internal/analysis/rules_test.go
+++ b/internal/analysis/rules_test.go
@@ -259,3 +259,136 @@ func TestAnalyzeAll_ComputesSavings(t *testing.T) {
 		t.Errorf("expected no signals for healthy instance, got %d", len(instances[1].WasteSignals))
 	}
 }
+
+func TestRuleSpotEligible_FlagsOnDemandWithSpotPrice(t *testing.T) {
+	spotPrice := 0.556
+	inst := models.GPUInstance{
+		InstanceID:     "i-test",
+		Source:         models.SourceEC2,
+		PricingModel:   "on-demand",
+		UptimeHours:    48,
+		HourlyCost:     1.006,
+		MonthlyCost:    1.006 * 730,
+		SpotHourlyCost: &spotPrice,
+	}
+
+	ruleSpotEligible(&inst)
+
+	if len(inst.WasteSignals) != 1 {
+		t.Fatalf("expected 1 signal, got %d", len(inst.WasteSignals))
+	}
+	if inst.WasteSignals[0].Type != "spot_eligible" {
+		t.Errorf("expected spot_eligible, got %s", inst.WasteSignals[0].Type)
+	}
+	if inst.WasteSignals[0].Severity != models.SeverityInfo {
+		t.Errorf("expected info severity, got %s", inst.WasteSignals[0].Severity)
+	}
+	if len(inst.Recommendations) != 1 {
+		t.Fatalf("expected 1 recommendation, got %d", len(inst.Recommendations))
+	}
+	if inst.Recommendations[0].Action != models.ActionChangePricing {
+		t.Errorf("expected change_pricing, got %s", inst.Recommendations[0].Action)
+	}
+	expectedSavings := (1.006 - 0.556) * 730
+	diff := inst.Recommendations[0].MonthlySavings - expectedSavings
+	if diff < -0.01 || diff > 0.01 {
+		t.Errorf("expected savings %.2f, got %.2f", expectedSavings, inst.Recommendations[0].MonthlySavings)
+	}
+}
+
+func TestRuleSpotEligible_SkipsSpotInstances(t *testing.T) {
+	spotPrice := 0.556
+	inst := models.GPUInstance{
+		PricingModel:   "spot",
+		UptimeHours:    48,
+		SpotHourlyCost: &spotPrice,
+	}
+
+	ruleSpotEligible(&inst)
+
+	if len(inst.WasteSignals) != 0 {
+		t.Errorf("expected no signals for spot instance, got %d", len(inst.WasteSignals))
+	}
+}
+
+func TestRuleSpotEligible_SkipsRecentInstances(t *testing.T) {
+	spotPrice := 0.556
+	inst := models.GPUInstance{
+		PricingModel:   "on-demand",
+		UptimeHours:    12,
+		SpotHourlyCost: &spotPrice,
+	}
+
+	ruleSpotEligible(&inst)
+
+	if len(inst.WasteSignals) != 0 {
+		t.Errorf("expected no signals for recent instance, got %d", len(inst.WasteSignals))
+	}
+}
+
+func TestRuleSpotEligible_SkipsWhenNoSpotPrice(t *testing.T) {
+	inst := models.GPUInstance{
+		PricingModel:   "on-demand",
+		UptimeHours:    48,
+		SpotHourlyCost: nil,
+	}
+
+	ruleSpotEligible(&inst)
+
+	if len(inst.WasteSignals) != 0 {
+		t.Errorf("expected no signals when spot price unavailable, got %d", len(inst.WasteSignals))
+	}
+}
+
+func TestRuleSpotEligible_SkipsWhenSpotNotCheaper(t *testing.T) {
+	for _, spotPrice := range []float64{1.0, 1.5} {
+		inst := models.GPUInstance{
+			PricingModel:   "on-demand",
+			UptimeHours:    48,
+			HourlyCost:     1.0,
+			MonthlyCost:    1.0 * 730,
+			SpotHourlyCost: &spotPrice,
+		}
+
+		ruleSpotEligible(&inst)
+
+		if len(inst.WasteSignals) != 0 || len(inst.Recommendations) != 0 {
+			t.Errorf("spot $%.2f/hr: expected no findings, got %d signals and %d recommendations", spotPrice, len(inst.WasteSignals), len(inst.Recommendations))
+		}
+	}
+}
+
+func TestRuleSpotEligible_ConfidenceScalesWithSavings(t *testing.T) {
+	tests := []struct {
+		name          string
+		onDemand      float64
+		spotPrice     float64
+		minConfidence float64
+	}{
+		{"large_savings_60pct", 1.0, 0.4, 0.85},
+		{"moderate_savings_40pct", 1.0, 0.6, 0.65},
+		{"small_savings_20pct", 1.0, 0.8, 0.5},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			inst := models.GPUInstance{
+				PricingModel:   "on-demand",
+				UptimeHours:    48,
+				HourlyCost:     tt.onDemand,
+				MonthlyCost:    tt.onDemand * 730,
+				SpotHourlyCost: &tt.spotPrice,
+			}
+
+			ruleSpotEligible(&inst)
+
+			if len(inst.WasteSignals) == 0 {
+				t.Fatal("expected signal")
+			}
+			// Tolerate float rounding: the 60% case lands exactly on its bound.
+			if inst.WasteSignals[0].Confidence < tt.minConfidence-1e-9 {
+				t.Errorf("expected confidence >= %.2f, got %.2f", tt.minConfidence, inst.WasteSignals[0].Confidence)
+			}
+		})
+	}
+}
diff --git a/internal/models/models.go b/internal/models/models.go
index 0fd6557..1f7bb8e 100644
--- a/internal/models/models.go
+++ b/internal/models/models.go
@@ -85,10 +85,11 @@ type GPUInstance struct {
 	InvocationCount *int64 
`json:"invocation_count,omitempty"`
 
 	// Cost
-	PricingModel string   `json:"pricing_model"` // on-demand, spot, reserved, savings-plan
-	HourlyCost   float64  `json:"hourly_cost"`
-	MonthlyCost  float64  `json:"monthly_cost"`
-	MTDCost      *float64 `json:"mtd_cost,omitempty"`
+	PricingModel   string   `json:"pricing_model"` // on-demand, spot, reserved, savings-plan
+	HourlyCost     float64  `json:"hourly_cost"`
+	MonthlyCost    float64  `json:"monthly_cost"`
+	SpotHourlyCost *float64 `json:"spot_hourly_cost,omitempty"`
+	MTDCost        *float64 `json:"mtd_cost,omitempty"`
 
 	// Analysis results (populated by analysis engine)
 	WasteSignals []WasteSignal `json:"waste_signals,omitempty"`
@@ -98,7 +99,7 @@ type GPUInstance struct {
 
 // WasteSignal represents a detected waste indicator on a GPU instance.
 type WasteSignal struct {
-	Type       string   `json:"type"` // idle, low_utilization, oversized_gpu, pricing_mismatch, stale, low_invocations
+	Type       string   `json:"type"` // idle, low_utilization, oversized_gpu, pricing_mismatch, stale, low_invocations, spot_eligible
 	Severity   Severity `json:"severity"`
 	Confidence float64  `json:"confidence"` // 0.0 - 1.0
 	Evidence   string   `json:"evidence"`
diff --git a/internal/providers/aws/scanner.go b/internal/providers/aws/scanner.go
index d8d5921..b1b1ff1 100644
--- a/internal/providers/aws/scanner.go
+++ b/internal/providers/aws/scanner.go
@@ -188,6 +188,7 @@ func scanRegion(ctx context.Context, cfg aws.Config, accountID, region string, o
 			if err := EnrichEC2Metrics(ctx, cwClient, ec2Instances, opts.MetricWindow); err != nil {
 				fmt.Fprintf(os.Stderr, " warning: could not enrich EC2 metrics in %s: %v\n", region, err)
 			}
+			EnrichSpotPrices(ctx, ec2Client, ec2Instances)
 		}
 		allInstances = append(allInstances, ec2Instances...)
 	}
diff --git a/internal/providers/aws/spot.go b/internal/providers/aws/spot.go
new file mode 100644
index 0000000..d8ddcd6
--- /dev/null
+++ b/internal/providers/aws/spot.go
@@ -0,0 +1,110 @@
+// Copyright 2026 the gpuaudit authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package aws
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"strconv"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+
+	"github.com/gpuaudit/cli/internal/models"
+)
+
+// SpotPriceClient is the subset of the EC2 API needed for spot price lookups.
+type SpotPriceClient interface {
+	DescribeSpotPriceHistory(ctx context.Context, params *ec2.DescribeSpotPriceHistoryInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSpotPriceHistoryOutput, error)
+}
+
+// EnrichSpotPrices fetches current spot prices for EC2 GPU instances and
+// populates SpotHourlyCost on each instance where spot is available.
+//
+// Instances already running as spot also have HourlyCost and MonthlyCost
+// replaced with spot-based figures. The lookup is best-effort: if a
+// DescribeSpotPriceHistory call fails, a warning is written to stderr and the
+// instances are left untouched.
+func EnrichSpotPrices(ctx context.Context, client SpotPriceClient, instances []models.GPUInstance) {
+	// Cap on pages read, so an endpoint that never stops returning a
+	// continuation token cannot stall the scan.
+	const maxPages = 20
+
+	// Collect unique EC2 instance types.
+	typeSet := make(map[string]struct{})
+	for i := range instances {
+		if instances[i].Source == models.SourceEC2 {
+			typeSet[instances[i].InstanceType] = struct{}{}
+		}
+	}
+	if len(typeSet) == 0 {
+		return
+	}
+
+	instanceTypes := make([]ec2types.InstanceType, 0, len(typeSet))
+	for t := range typeSet {
+		instanceTypes = append(instanceTypes, ec2types.InstanceType(t))
+	}
+
+	input := &ec2.DescribeSpotPriceHistoryInput{
+		InstanceTypes:       instanceTypes,
+		ProductDescriptions: []string{"Linux/UNIX"},
+		StartTime:           aws.Time(time.Now().Add(-1 * time.Hour)),
+	}
+
+	// Keep the most recent price per instance type, collapsing across AZs.
+	// Entries are compared by timestamp rather than by response order, and
+	// every page is read so a type is not missed when the history spills past
+	// the first page. On equal timestamps the entry seen first wins.
+	type quote struct {
+		price float64
+		at    time.Time
+	}
+	latest := make(map[string]quote, len(typeSet))
+	for page := 0; page < maxPages; page++ {
+		out, err := client.DescribeSpotPriceHistory(ctx, input)
+		if err != nil {
+			fmt.Fprintf(os.Stderr, " warning: could not fetch spot prices: %v\n", err)
+			return
+		}
+		for _, sp := range out.SpotPriceHistory {
+			price, err := strconv.ParseFloat(aws.ToString(sp.SpotPrice), 64)
+			if err != nil {
+				continue
+			}
+			at := aws.ToTime(sp.Timestamp)
+			itype := string(sp.InstanceType)
+			if prev, seen := latest[itype]; seen && !at.After(prev.at) {
+				continue
+			}
+			latest[itype] = quote{price: price, at: at}
+		}
+		if aws.ToString(out.NextToken) == "" {
+			break
+		}
+		input.NextToken = out.NextToken
+	}
+
+	// Populate SpotHourlyCost on matching instances and correct cost for
+	// instances already running as spot.
+	for i := range instances {
+		inst := &instances[i]
+		if inst.Source != models.SourceEC2 {
+			continue
+		}
+		q, ok := latest[inst.InstanceType]
+		if !ok {
+			continue
+		}
+		price := q.price // fresh variable per instance: its address is stored
+		inst.SpotHourlyCost = &price
+		if inst.PricingModel == "spot" {
+			inst.HourlyCost = price
+			inst.MonthlyCost = price * 730
+		}
+	}
+}
diff --git a/internal/providers/aws/spot_test.go b/internal/providers/aws/spot_test.go
new file mode 100644
index 0000000..55c62f9
--- /dev/null
+++ b/internal/providers/aws/spot_test.go
@@ -0,0 +1,231 @@
+// Copyright 2026 the gpuaudit authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package aws
+
+import (
+	"context"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/aws/aws-sdk-go-v2/aws"
+	"github.com/aws/aws-sdk-go-v2/service/ec2"
+	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+
+	"github.com/gpuaudit/cli/internal/models"
+)
+
+// mockSpotPriceClient serves prices on the first call and, when morePages is
+// set, one further page per follow-up call, chained together with NextToken.
+type mockSpotPriceClient struct {
+	prices    []ec2types.SpotPrice
+	morePages [][]ec2types.SpotPrice
+	calls     int
+	err       error
+}
+
+func (m *mockSpotPriceClient) DescribeSpotPriceHistory(ctx context.Context, params *ec2.DescribeSpotPriceHistoryInput, optFns ...func(*ec2.Options)) (*ec2.DescribeSpotPriceHistoryOutput, error) {
+	if m.err != nil {
+		return nil, m.err
+	}
+	page := m.prices
+	if m.calls > 0 && m.calls <= len(m.morePages) {
+		page = m.morePages[m.calls-1]
+	}
+	m.calls++
+	out := &ec2.DescribeSpotPriceHistoryOutput{SpotPriceHistory: page}
+	if m.calls <= len(m.morePages) {
+		out.NextToken = aws.String(fmt.Sprintf("page-%d", m.calls))
+	}
+	return out, nil
+}
+
+func TestEnrichSpotPrices_PopulatesSpotCost(t *testing.T) {
+	client := &mockSpotPriceClient{
+		prices: []ec2types.SpotPrice{
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.556"),
+				Timestamp:    aws.Time(time.Now()),
+			},
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.500"),
+				Timestamp:    aws.Time(time.Now().Add(-1 * time.Hour)),
+			},
+		},
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "i-1", InstanceType: "g5.xlarge", Source: models.SourceEC2},
+		{InstanceID: "i-2", InstanceType: "g5.2xlarge", Source: models.SourceEC2},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	if instances[0].SpotHourlyCost == nil {
+		t.Fatal("expected spot price for g5.xlarge")
+	}
+	if *instances[0].SpotHourlyCost != 0.556 {
+		t.Errorf("expected 0.556, got %f", *instances[0].SpotHourlyCost)
+	}
+	if instances[1].SpotHourlyCost != nil {
+		t.Error("expected nil spot price for g5.2xlarge (not in API response)")
+	}
+}
+
+func TestEnrichSpotPrices_SkipsNonEC2(t *testing.T) {
+	client := &mockSpotPriceClient{
+		prices: []ec2types.SpotPrice{
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.556"),
+				Timestamp:    aws.Time(time.Now()),
+			},
+		},
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "ep-1", InstanceType: "ml.g5.xlarge", Source: models.SourceSageMakerEndpoint},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	if instances[0].SpotHourlyCost != nil {
+		t.Error("expected nil spot price for SageMaker instance")
+	}
+}
+
+func TestEnrichSpotPrices_HandlesAPIError(t *testing.T) {
+	client := &mockSpotPriceClient{
+		err: fmt.Errorf("access denied"),
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "i-1", InstanceType: "g5.xlarge", Source: models.SourceEC2},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	if instances[0].SpotHourlyCost != nil {
+		t.Error("expected nil spot price after API error")
+	}
+}
+
+func TestEnrichSpotPrices_EmptyInstances(t *testing.T) {
+	client := &mockSpotPriceClient{}
+	EnrichSpotPrices(context.Background(), client, nil)
+	EnrichSpotPrices(context.Background(), client, []models.GPUInstance{})
+}
+
+func TestEnrichSpotPrices_CorrectsCostForSpotInstances(t *testing.T) {
+	client := &mockSpotPriceClient{
+		prices: []ec2types.SpotPrice{
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.556"),
+				Timestamp:    aws.Time(time.Now()),
+			},
+		},
+	}
+	instances := []models.GPUInstance{
+		{
+			InstanceID:   "i-spot",
+			InstanceType: "g5.xlarge",
+			Source:       models.SourceEC2,
+			PricingModel: "spot",
+			HourlyCost:   1.006, // on-demand price (wrong for spot)
+			MonthlyCost:  1.006 * 730,
+		},
+		{
+			InstanceID:   "i-ondemand",
+			InstanceType: "g5.xlarge",
+			Source:       models.SourceEC2,
+			PricingModel: "on-demand",
+			HourlyCost:   1.006,
+			MonthlyCost:  1.006 * 730,
+		},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	// Spot instance should have corrected cost
+	if instances[0].HourlyCost != 0.556 {
+		t.Errorf("spot instance hourly cost: expected 0.556, got %f", instances[0].HourlyCost)
+	}
+	expectedMonthlyCost := 0.556 * 730
+	const epsilon = 0.0001
+	if instances[0].MonthlyCost < expectedMonthlyCost-epsilon || instances[0].MonthlyCost > expectedMonthlyCost+epsilon {
+		t.Errorf("spot instance monthly cost: expected %f, got %f", expectedMonthlyCost, instances[0].MonthlyCost)
+	}
+
+	// On-demand instance should keep original cost
+	if instances[1].HourlyCost != 1.006 {
+		t.Errorf("on-demand instance hourly cost should be unchanged, got %f", instances[1].HourlyCost)
+	}
+}
+
+func TestEnrichSpotPrices_UsesNewestTimestampRegardlessOfOrder(t *testing.T) {
+	now := time.Now()
+	client := &mockSpotPriceClient{
+		prices: []ec2types.SpotPrice{
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.500"),
+				Timestamp:    aws.Time(now.Add(-30 * time.Minute)),
+			},
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.556"),
+				Timestamp:    aws.Time(now),
+			},
+		},
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "i-1", InstanceType: "g5.xlarge", Source: models.SourceEC2},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	if instances[0].SpotHourlyCost == nil {
+		t.Fatal("expected spot price for g5.xlarge")
+	}
+	if *instances[0].SpotHourlyCost != 0.556 {
+		t.Errorf("expected newest price 0.556, got %f", *instances[0].SpotHourlyCost)
+	}
+}
+
+func TestEnrichSpotPrices_FollowsNextToken(t *testing.T) {
+	client := &mockSpotPriceClient{
+		prices: []ec2types.SpotPrice{
+			{
+				InstanceType: ec2types.InstanceTypeG5Xlarge,
+				SpotPrice:    aws.String("0.556"),
+				Timestamp:    aws.Time(time.Now()),
+			},
+		},
+		morePages: [][]ec2types.SpotPrice{
+			{
+				{
+					InstanceType: ec2types.InstanceType("g5.2xlarge"),
+					SpotPrice:    aws.String("0.700"),
+					Timestamp:    aws.Time(time.Now()),
+				},
+			},
+		},
+	}
+	instances := []models.GPUInstance{
+		{InstanceID: "i-1", InstanceType: "g5.xlarge", Source: models.SourceEC2},
+		{InstanceID: "i-2", InstanceType: "g5.2xlarge", Source: models.SourceEC2},
+	}
+
+	EnrichSpotPrices(context.Background(), client, instances)
+
+	if client.calls != 2 {
+		t.Errorf("expected 2 API calls, got %d", client.calls)
+	}
+	if instances[1].SpotHourlyCost == nil {
+		t.Fatal("expected spot price for g5.2xlarge from the second page")
+	}
+	if *instances[1].SpotHourlyCost != 0.7 {
+		t.Errorf("expected 0.7, got %f", *instances[1].SpotHourlyCost)
+	}
+}