forked from virtual-kubelet/virtual-kubelet
/
podcontroller.go
412 lines (353 loc) · 15.6 KB
/
podcontroller.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
// Copyright © 2017 The virtual-kubelet authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package node
import (
"context"
"fmt"
"reflect"
"strconv"
"sync"
"time"
pkgerrors "github.com/pkg/errors"
"github.com/virtual-kubelet/virtual-kubelet/errdefs"
"github.com/virtual-kubelet/virtual-kubelet/log"
"github.com/virtual-kubelet/virtual-kubelet/manager"
"github.com/virtual-kubelet/virtual-kubelet/trace"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/util/wait"
corev1informers "k8s.io/client-go/informers/core/v1"
corev1client "k8s.io/client-go/kubernetes/typed/core/v1"
corev1listers "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
)
// PodLifecycleHandler defines the interface used by the PodController to react
// to new and changed pods scheduled to the node that is being managed.
//
// Errors produced by these methods should implement an interface from
// github.com/virtual-kubelet/virtual-kubelet/errdefs package in order for the
// core logic to be able to understand the type of failure.
type PodLifecycleHandler interface {
// CreatePod takes a Kubernetes Pod and deploys it within the provider.
CreatePod(ctx context.Context, pod *corev1.Pod) error
// UpdatePod takes a Kubernetes Pod and updates it within the provider.
UpdatePod(ctx context.Context, pod *corev1.Pod) error
// DeletePod takes a Kubernetes Pod and deletes it from the provider.
DeletePod(ctx context.Context, pod *corev1.Pod) error
// GetPod retrieves a pod by name from the provider (can be cached).
GetPod(ctx context.Context, namespace, name string) (*corev1.Pod, error)
// GetPodStatus retrieves the status of a pod by name from the provider.
GetPodStatus(ctx context.Context, namespace, name string) (*corev1.PodStatus, error)
// GetPods retrieves a list of all pods running on the provider (can be cached).
GetPods(context.Context) ([]*corev1.Pod, error)
}
// PodNotifier notifies callers of pod changes.
// Providers should implement this interface to enable callers to be notified
// of pod status updates asyncronously.
type PodNotifier interface {
// NotifyPods instructs the notifier to call the passed in function when
// the pod status changes.
//
// NotifyPods should not block callers.
NotifyPods(context.Context, func(*corev1.Pod))
}
// PodController is the controller implementation for Pod resources.
type PodController struct {
provider PodLifecycleHandler
// podsInformer is an informer for Pod resources.
podsInformer corev1informers.PodInformer
// podsLister is able to list/get Pod resources from a shared informer's store.
podsLister corev1listers.PodLister
// recorder is an event recorder for recording Event resources to the Kubernetes API.
recorder record.EventRecorder
// ready is a channel which will be closed once the pod controller is fully up and running.
// this channel will never be closed if there is an error on startup.
ready chan struct{}
client corev1client.PodsGetter
resourceManager *manager.ResourceManager
}
// PodControllerConfig is used to configure a new PodController.
type PodControllerConfig struct {
// PodClient is used to perform actions on the k8s API, such as updating pod status
// This field is required
PodClient corev1client.PodsGetter
// PodInformer is used as a local cache for pods
// This should be configured to only look at pods scheduled to the node which the controller will be managing
PodInformer corev1informers.PodInformer
EventRecorder record.EventRecorder
Provider PodLifecycleHandler
// Listers used for filling details for things like downward API in pod spec.
ConfigMapLister corev1listers.ConfigMapLister
SecretLister corev1listers.SecretLister
ServiceLister corev1listers.ServiceLister
}
func NewPodController(cfg PodControllerConfig) (*PodController, error) {
if cfg.PodClient == nil {
return nil, errdefs.InvalidInput("must set core client")
}
if cfg.EventRecorder == nil {
return nil, errdefs.InvalidInput("must set event recorder")
}
if cfg.PodInformer == nil {
return nil, errdefs.InvalidInput("must set informer")
}
if cfg.ConfigMapLister == nil {
return nil, errdefs.InvalidInput("must set config map lister")
}
if cfg.SecretLister == nil {
return nil, errdefs.InvalidInput("must set secret lister")
}
if cfg.ServiceLister == nil {
return nil, errdefs.InvalidInput("must set service lister")
}
rm, err := manager.NewResourceManager(cfg.PodInformer.Lister(), cfg.SecretLister, cfg.ConfigMapLister, cfg.ServiceLister)
if err != nil {
return nil, pkgerrors.Wrap(err, "could not create resource manager")
}
return &PodController{
client: cfg.PodClient,
podsInformer: cfg.PodInformer,
podsLister: cfg.PodInformer.Lister(),
provider: cfg.Provider,
resourceManager: rm,
ready: make(chan struct{}),
recorder: cfg.EventRecorder,
}, nil
}
// Run will set up the event handlers for types we are interested in, as well as syncing informer caches and starting workers.
// It will block until the context is cancelled, at which point it will shutdown the work queue and wait for workers to finish processing their current work items.
func (pc *PodController) Run(ctx context.Context, podSyncWorkers int) error {
k8sQ := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "syncPodsFromKubernetes")
defer k8sQ.ShutDown()
podStatusQueue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "syncPodStatusFromProvider")
pc.runProviderSyncWorkers(ctx, podStatusQueue, podSyncWorkers)
pc.runSyncFromProvider(ctx, podStatusQueue)
defer podStatusQueue.ShutDown()
// Set up event handlers for when Pod resources change.
pc.podsInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: func(pod interface{}) {
if key, err := cache.MetaNamespaceKeyFunc(pod); err != nil {
log.L.Error(err)
} else {
k8sQ.AddRateLimited(key)
}
},
UpdateFunc: func(oldObj, newObj interface{}) {
// Create a copy of the old and new pod objects so we don't mutate the cache.
oldPod := oldObj.(*corev1.Pod).DeepCopy()
newPod := newObj.(*corev1.Pod).DeepCopy()
// We want to check if the two objects differ in anything other than their resource versions.
// Hence, we make them equal so that this change isn't picked up by reflect.DeepEqual.
newPod.ResourceVersion = oldPod.ResourceVersion
// Skip adding this pod's key to the work queue if its .metadata (except .metadata.resourceVersion) and .spec fields haven't changed.
// This guarantees that we don't attempt to sync the pod every time its .status field is updated.
if reflect.DeepEqual(oldPod.ObjectMeta, newPod.ObjectMeta) && reflect.DeepEqual(oldPod.Spec, newPod.Spec) {
return
}
// At this point we know that something in .metadata or .spec has changed, so we must proceed to sync the pod.
if key, err := cache.MetaNamespaceKeyFunc(newPod); err != nil {
log.L.Error(err)
} else {
k8sQ.AddRateLimited(key)
}
},
DeleteFunc: func(pod interface{}) {
if key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(pod); err != nil {
log.L.Error(err)
} else {
k8sQ.AddRateLimited(key)
}
},
})
// Wait for the caches to be synced *before* starting workers.
if ok := cache.WaitForCacheSync(ctx.Done(), pc.podsInformer.Informer().HasSynced); !ok {
return pkgerrors.New("failed to wait for caches to sync")
}
log.G(ctx).Info("Pod cache in-sync")
// Perform a reconciliation step that deletes any dangling pods from the provider.
// This happens only when the virtual-kubelet is starting, and operates on a "best-effort" basis.
// If by any reason the provider fails to delete a dangling pod, it will stay in the provider and deletion won't be retried.
pc.deleteDanglingPods(ctx, podSyncWorkers)
log.G(ctx).Info("starting workers")
for id := 0; id < podSyncWorkers; id++ {
go wait.Until(func() {
// Use the worker's "index" as its ID so we can use it for tracing.
pc.runWorker(ctx, strconv.Itoa(id), k8sQ)
}, time.Second, ctx.Done())
}
close(pc.ready)
log.G(ctx).Info("started workers")
<-ctx.Done()
log.G(ctx).Info("shutting down workers")
return nil
}
// Ready returns a channel which gets closed once the PodController is ready to handle scheduled pods.
// This channel will never close if there is an error on startup.
// The status of this channel after sthudown is indeterminate.
func (pc *PodController) Ready() <-chan struct{} {
return pc.ready
}
// runWorker is a long-running function that will continually call the processNextWorkItem function in order to read and process an item on the work queue.
func (pc *PodController) runWorker(ctx context.Context, workerId string, q workqueue.RateLimitingInterface) {
for pc.processNextWorkItem(ctx, workerId, q) {
}
}
// processNextWorkItem will read a single work item off the work queue and attempt to process it,by calling the syncHandler.
func (pc *PodController) processNextWorkItem(ctx context.Context, workerId string, q workqueue.RateLimitingInterface) bool {
// We create a span only after popping from the queue so that we can get an adequate picture of how long it took to process the item.
ctx, span := trace.StartSpan(ctx, "processNextWorkItem")
defer span.End()
// Add the ID of the current worker as an attribute to the current span.
ctx = span.WithField(ctx, "workerId", workerId)
return handleQueueItem(ctx, q, pc.syncHandler)
}
// syncHandler compares the actual state with the desired, and attempts to converge the two.
func (pc *PodController) syncHandler(ctx context.Context, key string) error {
ctx, span := trace.StartSpan(ctx, "syncHandler")
defer span.End()
// Add the current key as an attribute to the current span.
ctx = span.WithField(ctx, "key", key)
// Convert the namespace/name string into a distinct namespace and name.
namespace, name, err := cache.SplitMetaNamespaceKey(key)
if err != nil {
// Log the error as a warning, but do not requeue the key as it is invalid.
log.G(ctx).Warn(pkgerrors.Wrapf(err, "invalid resource key: %q", key))
return nil
}
// Get the Pod resource with this namespace/name.
pod, err := pc.podsLister.Pods(namespace).Get(name)
if err != nil {
if !errors.IsNotFound(err) {
// We've failed to fetch the pod from the lister, but the error is not a 404.
// Hence, we add the key back to the work queue so we can retry processing it later.
err := pkgerrors.Wrapf(err, "failed to fetch pod with key %q from lister", key)
span.SetStatus(err)
return err
}
// At this point we know the Pod resource doesn't exist, which most probably means it was deleted.
// Hence, we must delete it from the provider if it still exists there.
if err := pc.deletePod(ctx, namespace, name); err != nil {
err := pkgerrors.Wrapf(err, "failed to delete pod %q in the provider", loggablePodNameFromCoordinates(namespace, name))
span.SetStatus(err)
return err
}
return nil
}
// At this point we know the Pod resource has either been created or updated (which includes being marked for deletion).
return pc.syncPodInProvider(ctx, pod)
}
// syncPodInProvider tries and reconciles the state of a pod by comparing its Kubernetes representation and the provider's representation.
func (pc *PodController) syncPodInProvider(ctx context.Context, pod *corev1.Pod) error {
ctx, span := trace.StartSpan(ctx, "syncPodInProvider")
defer span.End()
// Add the pod's attributes to the current span.
ctx = addPodAttributes(ctx, span, pod)
// Check whether the pod has been marked for deletion.
// If it does, guarantee it is deleted in the provider and Kubernetes.
if pod.DeletionTimestamp != nil {
if err := pc.deletePod(ctx, pod.Namespace, pod.Name); err != nil {
err := pkgerrors.Wrapf(err, "failed to delete pod %q in the provider", loggablePodName(pod))
span.SetStatus(err)
return err
}
return nil
}
// Ignore the pod if it is in the "Failed" or "Succeeded" state.
if pod.Status.Phase == corev1.PodFailed || pod.Status.Phase == corev1.PodSucceeded {
log.G(ctx).Warnf("skipping sync of pod %q in %q phase", loggablePodName(pod), pod.Status.Phase)
return nil
}
// Create or update the pod in the provider.
if err := pc.createOrUpdatePod(ctx, pod); err != nil {
err := pkgerrors.Wrapf(err, "failed to sync pod %q in the provider", loggablePodName(pod))
span.SetStatus(err)
return err
}
return nil
}
// deleteDanglingPods checks whether the provider knows about any pods which Kubernetes doesn't know about, and deletes them.
func (pc *PodController) deleteDanglingPods(ctx context.Context, threadiness int) {
ctx, span := trace.StartSpan(ctx, "deleteDanglingPods")
defer span.End()
// Grab the list of pods known to the provider.
pps, err := pc.provider.GetPods(ctx)
if err != nil {
err := pkgerrors.Wrap(err, "failed to fetch the list of pods from the provider")
span.SetStatus(err)
log.G(ctx).Error(err)
return
}
// Create a slice to hold the pods we will be deleting from the provider.
ptd := make([]*corev1.Pod, 0)
// Iterate over the pods known to the provider, marking for deletion those that don't exist in Kubernetes.
// Take on this opportunity to populate the list of key that correspond to pods known to the provider.
for _, pp := range pps {
if _, err := pc.podsLister.Pods(pp.Namespace).Get(pp.Name); err != nil {
if errors.IsNotFound(err) {
// The current pod does not exist in Kubernetes, so we mark it for deletion.
ptd = append(ptd, pp)
continue
}
// For some reason we couldn't fetch the pod from the lister, so we propagate the error.
err := pkgerrors.Wrap(err, "failed to fetch pod from the lister")
span.SetStatus(err)
log.G(ctx).Error(err)
return
}
}
// We delete each pod in its own goroutine, allowing a maximum of "threadiness" concurrent deletions.
semaphore := make(chan struct{}, threadiness)
var wg sync.WaitGroup
wg.Add(len(ptd))
// Iterate over the slice of pods to be deleted and delete them in the provider.
for _, pod := range ptd {
go func(ctx context.Context, pod *corev1.Pod) {
defer wg.Done()
ctx, span := trace.StartSpan(ctx, "deleteDanglingPod")
defer span.End()
semaphore <- struct{}{}
defer func() {
<-semaphore
}()
// Add the pod's attributes to the current span.
ctx = addPodAttributes(ctx, span, pod)
// Actually delete the pod.
if err := pc.deletePod(ctx, pod.Namespace, pod.Name); err != nil {
span.SetStatus(err)
log.G(ctx).Errorf("failed to delete pod %q in provider", loggablePodName(pod))
} else {
log.G(ctx).Infof("deleted leaked pod %q in provider", loggablePodName(pod))
}
}(ctx, pod)
}
// Wait for all pods to be deleted.
wg.Wait()
return
}
// loggablePodName returns the "namespace/name" key for the specified pod.
// If the key cannot be computed, "(unknown)" is returned.
// This method is meant to be used for logging purposes only.
func loggablePodName(pod *corev1.Pod) string {
k, err := cache.MetaNamespaceKeyFunc(pod)
if err != nil {
return "(unknown)"
}
return k
}
// loggablePodNameFromCoordinates returns the "namespace/name" key for the pod identified by the specified namespace and name (coordinates).
func loggablePodNameFromCoordinates(namespace, name string) string {
return fmt.Sprintf("%s/%s", namespace, name)
}