Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 62 additions & 41 deletions internal/controllers/projectpurge/purge.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@ package projectpurge

import (
"context"
"errors"
"fmt"
"net"
"strings"
"syscall"
"time"

"golang.org/x/sync/errgroup"
Expand All @@ -12,7 +15,6 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/discovery"
"k8s.io/client-go/dynamic"
"k8s.io/client-go/kubernetes"
Expand Down Expand Up @@ -50,9 +52,13 @@ var protected = map[string]struct{}{
// add "milo-system" if you make it per-project and protect it
}

func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o Options) error {
// StartPurge runs Phases A through D (discovery, DeleteCollection on namespaced
// resources, delete namespaces, force-finalize namespaces). These are fast
// fire-and-forget operations that issue delete commands without waiting for
// completion. All phases are idempotent and safe to re-run.
func (p *Purger) StartPurge(ctx context.Context, cfg *rest.Config, project string, o Options) error {
if o.Timeout == 0 {
o.Timeout = 5 * time.Minute
o.Timeout = 2 * time.Minute
}
if o.Parallel <= 0 {
o.Parallel = 8
Expand Down Expand Up @@ -101,8 +107,10 @@ func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o
}
}

// Partition & exclude namespaces & CRDs for explicit phases
var namespaced, cluster []res
// Partition & exclude namespaces & CRDs for explicit phases.
// Cluster-scoped resource deletion is intentionally omitted —
// only namespaced resources and namespaces themselves are purged.
var namespaced []res
for _, r := range all {
if r.gvr.Group == "" && r.gvr.Resource == "namespaces" {
continue
Expand All @@ -112,8 +120,6 @@ func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o
}
if r.namespaced {
namespaced = append(namespaced, r)
} else {
cluster = append(cluster, r)
}
}

Expand Down Expand Up @@ -149,23 +155,9 @@ func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o
return err
}

// Phase B: cluster-scoped kinds
// if err := runParallel(deadline, o.Parallel, cluster, func(ctx context.Context, r res) error {
// if err := dyn.Resource(r.gvr).DeleteCollection(ctx, delOpts, listOpts); !ignorable(err) {
// if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) {
// return fmt.Errorf("rbac forbids DeleteCollection for %s: %w", r.gvr, err)
// }
// return fmt.Errorf("DeleteCollection %s: %w", r.gvr, err)
// }
// return nil
// }); err != nil {
// return err
// }

// Phase C: delete namespaces themselves (sets DeletionTimestamp)
// Phase B: delete namespaces themselves (sets DeletionTimestamp)
if err := runParallel(deadline, o.Parallel, namespaces, func(ctx context.Context, ns string) error {
if _, ok := protected[ns]; ok {
// Keep the namespace object; we've already deleted its contents in Phase A.
return nil
}
if err := core.CoreV1().Namespaces().Delete(ctx, ns, delOpts); !ignorable(err) {
Expand All @@ -179,22 +171,20 @@ func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o
return err
}

// Phase D: force-finalize namespaces so we don't rely on a namespace controller
// Phase C: force-finalize namespaces so we don't rely on a namespace controller
if err := runParallel(deadline, o.Parallel, namespaces, func(ctx context.Context, ns string) error {
nso, err := core.CoreV1().Namespaces().Get(ctx, ns, metav1.GetOptions{})
if ignorable(err) { // not found or not served
if ignorable(err) {
return nil
}
if err != nil {
return fmt.Errorf("get namespace %q: %w", ns, err)
}

// If delete hasn’t landed yet, try again (idempotent)
if nso.DeletionTimestamp.IsZero() {
_ = core.CoreV1().Namespaces().Delete(ctx, ns, delOpts)
}

// Clear finalizers to allow immediate removal without a namespace controller
nso.Spec.Finalizers = nil
if _, err := core.CoreV1().Namespaces().Finalize(ctx, nso, metav1.UpdateOptions{}); !ignorable(err) {
if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) {
Expand All @@ -207,25 +197,56 @@ func (p *Purger) Purge(ctx context.Context, cfg *rest.Config, project string, o
return err
}

// Phase E: verify all namespaces are gone (so tearing down per-project controllers is safe)
if err := wait.PollUntilContextCancel(deadline, 500*time.Millisecond, true, func(ctx context.Context) (bool, error) {
nsList, err := core.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
if err != nil {
return false, fmt.Errorf("list namespaces: %w", err)
}
if len(nsList.Items) == 1 && nsList.Items[0].Name == "default" {
return true, nil // all gone
}
// If we have namespaces left, we can’t proceed
return false, nil
return nil
}

}); err != nil {
return fmt.Errorf("timeout waiting for namespaces to disappear: %w", err)
// IsPurgeComplete performs a single namespace list and reports whether the
// project has fully drained — i.e. no namespaces remain other than "default".
// An error that definitively indicates the per-project API server is gone
// (e.g. connection refused) is treated as completion. Any other error
// (timeouts, 500s, 429s, RBAC issues, context cancellation) is returned so
// the controller can retry.
func (p *Purger) IsPurgeComplete(ctx context.Context, cfg *rest.Config, project string) (bool, error) {
	client, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		return false, fmt.Errorf("building client for project %s: %w", project, err)
	}

	nsList, err := client.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		// A permanently unreachable API server means the control plane has
		// already been torn down, which implies the purge is complete.
		if isServerGone(err) {
			return true, nil
		}
		return false, fmt.Errorf("listing namespaces for project %s: %w", project, err)
	}

	remaining := nsList.Items
	if len(remaining) == 0 {
		return true, nil
	}
	return len(remaining) == 1 && remaining[0].Name == "default", nil
}

return nil
// isServerGone returns true when the error indicates the remote API server is
// permanently unreachable — connection refused, or the API endpoint itself no
// longer exists. Transient failures (timeouts, 500s, throttling, RBAC) return
// false so the caller retries.
func isServerGone(err error) bool {
	// A net.OpError wrapping ECONNREFUSED means nothing is listening at the
	// endpoint anymore.
	var opErr *net.OpError
	switch {
	case errors.As(err, &opErr) && errors.Is(opErr.Err, syscall.ECONNREFUSED):
		return true
	case errors.Is(err, syscall.ECONNREFUSED):
		return true
	case apierrors.IsNotFound(err):
		// The API endpoint itself no longer exists.
		return true
	default:
		return false
	}
}

// helper (generic, named)
Expand Down
104 changes: 95 additions & 9 deletions internal/controllers/resourcemanager/project_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/cluster"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
"sigs.k8s.io/controller-runtime/pkg/handler"
"sigs.k8s.io/controller-runtime/pkg/log"
Expand Down Expand Up @@ -81,32 +82,116 @@
return ctrl.Result{}, fmt.Errorf("get project: %w", err)
}

// Deletion path: run purge, then remove finalizer
// Deletion path: clean up project resources, then remove finalizer
if !project.DeletionTimestamp.IsZero() {
// Best-effort delete the ProjectControlPlane in infra
if r.InfraClient != nil {
var pcp infrastructurev1alpha1.ProjectControlPlane
if err := r.InfraClient.Get(ctx, types.NamespacedName{
Namespace: project.Namespace,
Name: project.Name,
}, &pcp); err == nil && pcp.DeletionTimestamp.IsZero() {
_ = r.InfraClient.Delete(ctx, &pcp)
}
}
if controllerutil.ContainsFinalizer(&project, projectFinalizer) {
projCfg := r.forProject(r.BaseConfig, project.Name)
if err := r.Purger.Purge(ctx, projCfg, project.Name, projectpurge.Options{
Timeout: 10 * time.Minute,

cleanupCond := apimeta.FindStatusCondition(project.Status.Conditions, resourcemanagerv1alpha.ProjectResourceCleanup)

// If awaiting completion, check whether resources have drained.
if cleanupCond != nil && cleanupCond.Status == metav1.ConditionTrue &&
cleanupCond.Reason == resourcemanagerv1alpha.ProjectCleanupAwaitingCompletionReason {

done, err := r.Purger.IsPurgeComplete(ctx, projCfg, project.Name)
if err != nil {
logger.Error(err, "check cleanup completion", "project", project.Name)
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
if done {
// Update ResourceCleanup condition to reflect completion
cleanupDone := metav1.Condition{
Type: resourcemanagerv1alpha.ProjectResourceCleanup,
Status: metav1.ConditionFalse,
Reason: resourcemanagerv1alpha.ProjectCleanupCompleteReason,
Message: "Project resources have been deleted",
ObservedGeneration: project.Generation,
}
if apimeta.SetStatusCondition(&project.Status.Conditions, cleanupDone) {
if err := r.ControlPlaneClient.Status().Update(ctx, &project); err != nil {
return ctrl.Result{}, fmt.Errorf("update cleanup status: %w", err)
}
}

// Re-fetch to get current resourceVersion after status update
if err := r.ControlPlaneClient.Get(ctx, req.NamespacedName, &project); err != nil {
return ctrl.Result{}, fmt.Errorf("re-fetch project: %w", err)
}

// Remove finalizer with fresh object
before := project.DeepCopy()
controllerutil.RemoveFinalizer(&project, projectFinalizer)
if err := r.ControlPlaneClient.Patch(ctx, &project, client.MergeFrom(before)); err != nil {
return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err)
}
return ctrl.Result{}, nil
}

// Resources still exist — transition back to CleanupStarted
// so the next reconcile re-issues delete commands.
reissue := metav1.Condition{
Type: resourcemanagerv1alpha.ProjectResourceCleanup,
Status: metav1.ConditionTrue,
Reason: resourcemanagerv1alpha.ProjectCleanupStartedReason,
Message: "Re-issuing delete commands for remaining project resources",
ObservedGeneration: project.Generation,
}
if apimeta.SetStatusCondition(&project.Status.Conditions, reissue) {
if err := r.ControlPlaneClient.Status().Update(ctx, &project); err != nil {
return ctrl.Result{}, fmt.Errorf("update cleanup status: %w", err)
}
}
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// CleanupStarted or no condition yet — issue delete commands.
cleanupStarted := metav1.Condition{
Type: resourcemanagerv1alpha.ProjectResourceCleanup,
Status: metav1.ConditionTrue,
Reason: resourcemanagerv1alpha.ProjectCleanupStartedReason,
Message: "Issuing delete commands for project resources",
ObservedGeneration: project.Generation,
}
if apimeta.SetStatusCondition(&project.Status.Conditions, cleanupStarted) {
if err := r.ControlPlaneClient.Status().Update(ctx, &project); err != nil {
return ctrl.Result{}, fmt.Errorf("update cleanup status: %w", err)
}
}

if err := r.Purger.StartPurge(ctx, projCfg, project.Name, projectpurge.Options{
Timeout: 2 * time.Minute,
Parallel: 16,
}); err != nil {
// requeue to retry purge
return ctrl.Result{RequeueAfter: 2 * time.Second}, fmt.Errorf("purge %q: %w", project.Name, err)
logger.Error(err, "start cleanup", "project", project.Name)
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
before := project.DeepCopy()
controllerutil.RemoveFinalizer(&project, projectFinalizer)
if err := r.ControlPlaneClient.Patch(ctx, &project, client.MergeFrom(before)); err != nil {
return ctrl.Result{}, fmt.Errorf("remove finalizer: %w", err)

// Transition to awaiting completion — subsequent reconciles
// will check IsPurgeComplete instead of re-issuing deletes.
cleanupAwaiting := metav1.Condition{
Type: resourcemanagerv1alpha.ProjectResourceCleanup,
Status: metav1.ConditionTrue,
Reason: resourcemanagerv1alpha.ProjectCleanupAwaitingCompletionReason,
Message: "Waiting for project resources to be removed",

Check warning on line 185 in internal/controllers/resourcemanager/project_controller.go

View check run for this annotation

JoggrBot / Joggr

internal/controllers/resourcemanager/project_controller.go#L90-L185

"docs/architecture/controllers/project-controller/README.md" is outdated: The ProjectController's deletion logic now tracks resource cleanup states using new status conditions (ResourceCleanup, CleanupStarted, CleanupAwaitingCompletion, CleanupComplete), moving project deletion to a multi-phase process while the doc only describes a simple finalizer/removal flow.

Check warning on line 185 in internal/controllers/resourcemanager/project_controller.go

View check run for this annotation

JoggrBot / Joggr

internal/controllers/resourcemanager/project_controller.go#L96-L185

"docs/api/resourcemanager.md" is outdated: The code now results in Project.status.conditions containing types and reasons (ResourceCleanup, CleanupStarted, etc.) that are missing from the Project.status.conditions section of the document, so a reader cannot correctly interpret the Project resource's observed state as pertains to deletion or cleanup.
ObservedGeneration: project.Generation,
}
if apimeta.SetStatusCondition(&project.Status.Conditions, cleanupAwaiting) {
if err := r.ControlPlaneClient.Status().Update(ctx, &project); err != nil {
return ctrl.Result{}, fmt.Errorf("update awaiting status: %w", err)
}
}

return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
return ctrl.Result{}, nil
}
Expand Down Expand Up @@ -403,6 +488,7 @@
r.Purger = projectpurge.New()

return ctrl.NewControllerManagedBy(mgr).
WithOptions(controller.Options{MaxConcurrentReconciles: 4}).
For(&resourcemanagerv1alpha.Project{}).
WatchesRawSource(source.TypedKind(
infraCluster.GetCache(),
Expand Down
16 changes: 16 additions & 0 deletions pkg/apis/resourcemanager/v1alpha1/project_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,36 @@
}

// Condition types recorded on Project.status.conditions.
const (
	// ProjectReady indicates that the project has been provisioned and is ready
	// for use.
	ProjectReady = "Ready"

	// ProjectResourceCleanup indicates that project resources are being deleted
	// as part of project teardown.
	ProjectResourceCleanup = "ResourceCleanup"
)

// Condition reasons recorded on Project.status.conditions.
const (
	// ProjectReadyReason indicates that the project is ready for use.
	ProjectReadyReason = "Ready"

	// ProjectProvisioningReason indicates that the project is provisioning.
	ProjectProvisioningReason = "Provisioning"

	// ProjectNameConflictReason indicates that the project name already exists.
	ProjectNameConflictReason = "ProjectNameConflict"

	// ProjectCleanupStartedReason indicates that resource cleanup has been
	// initiated and delete commands are being issued.
	ProjectCleanupStartedReason = "CleanupStarted"

	// ProjectCleanupAwaitingCompletionReason indicates that delete commands
	// have been issued and the controller is waiting for resources to be removed.
	ProjectCleanupAwaitingCompletionReason = "CleanupAwaitingCompletion"

	// ProjectCleanupCompleteReason indicates that all project resources have
	// been deleted.
	ProjectCleanupCompleteReason = "CleanupComplete"
)

// +kubebuilder:object:root=true
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Test fixture: Organization that owns the project exercised by the
# project-deletion test scenario.
apiVersion: resourcemanager.miloapis.com/v1alpha1
kind: Organization
metadata:
  name: test-project-deletion-org
  labels:
    test.miloapis.com/scenario: "project-deletion"
  annotations:
    kubernetes.io/description: "Organization for testing project deletion"
    kubernetes.io/display-name: "Test Project Deletion Organization"
spec:
  type: Standard
10 changes: 10 additions & 0 deletions test/resource-management/project-deletion/02-test-user.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Test fixture: admin User for the project-deletion test scenario.
apiVersion: iam.miloapis.com/v1alpha1
kind: User
metadata:
  name: user-admin
  labels:
    test.miloapis.com/scenario: "project-deletion"
spec:
  email: admin@test.local
  givenName: ProjectDeleteTest
  familyName: Admin
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Test fixture: grants user-admin membership in the organization used by the
# project-deletion test scenario.
apiVersion: resourcemanager.miloapis.com/v1alpha1
kind: OrganizationMembership
metadata:
  name: user-admin-membership
  namespace: organization-test-project-deletion-org
  labels:
    test.miloapis.com/scenario: "project-deletion"
spec:
  organizationRef:
    name: test-project-deletion-org
  userRef:
    name: "user-admin"
10 changes: 10 additions & 0 deletions test/resource-management/project-deletion/04-test-project.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Test fixture: the Project deleted during the project-deletion test scenario.
apiVersion: resourcemanager.miloapis.com/v1alpha1
kind: Project
metadata:
  name: project-deletion-test
  labels:
    test.miloapis.com/scenario: "project-deletion"
spec:
  ownerRef:
    kind: Organization
    name: test-project-deletion-org
Loading
Loading