forked from kubernetes/kubernetes
-
Notifications
You must be signed in to change notification settings - Fork 1
/
kuberuntime_container.go
865 lines (778 loc) · 34.2 KB
/
kuberuntime_container.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
/*
Copyright 2016 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package kuberuntime
import (
"errors"
"fmt"
"io"
"math/rand"
"net/url"
"os"
"path/filepath"
"sort"
"strings"
"sync"
"time"
"google.golang.org/grpc"
"github.com/armon/circbuf"
"github.com/golang/glog"
"k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kubetypes "k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/sets"
runtimeapi "k8s.io/kubernetes/pkg/kubelet/apis/cri/v1alpha1/runtime"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/util/selinux"
"k8s.io/kubernetes/pkg/util/tail"
)
var (
ErrCreateContainerConfig = errors.New("CreateContainerConfigError")
ErrCreateContainer = errors.New("CreateContainerError")
ErrPostStartHook = errors.New("PostStartHookError")
)
// recordContainerEvent should be used by the runtime manager for all container related events.
// it has sanity checks to ensure that we do not write events that can abuse our masters.
// in particular, it ensures that a containerID never appears in an event message as that
// is prone to causing a lot of distinct events that do not count well.
// it replaces any reference to a containerID with the containerName which is stable, and is what users know.
func (m *kubeGenericRuntimeManager) recordContainerEvent(pod *v1.Pod, container *v1.Container, containerID, eventType, reason, message string, args ...interface{}) {
ref, err := kubecontainer.GenerateContainerRef(pod, container)
if err != nil {
glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err)
return
}
eventMessage := message
if len(args) > 0 {
eventMessage = fmt.Sprintf(message, args...)
}
// this is a hack, but often the error from the runtime includes the containerID
// which kills our ability to deduplicate events. this protection makes a huge
// difference in the number of unique events
if containerID != "" {
eventMessage = strings.Replace(eventMessage, containerID, container.Name, -1)
}
m.recorder.Event(ref, eventType, reason, eventMessage)
}
// startContainer starts a container and returns a message indicates why it is failed on error.
// It starts the container through the following steps:
// * pull the image
// * create the container
// * start the container
// * run the post start lifecycle hooks (if applicable)
func (m *kubeGenericRuntimeManager) startContainer(podSandboxID string, podSandboxConfig *runtimeapi.PodSandboxConfig, container *v1.Container, pod *v1.Pod, podStatus *kubecontainer.PodStatus, pullSecrets []v1.Secret, podIP string) (string, error) {
// Step 1: pull the image.
imageRef, msg, err := m.imagePuller.EnsureImageExists(pod, container, pullSecrets)
if err != nil {
m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return msg, err
}
// Step 2: create the container.
ref, err := kubecontainer.GenerateContainerRef(pod, container)
if err != nil {
glog.Errorf("Can't make a ref to pod %q, container %v: %v", format.Pod(pod), container.Name, err)
}
glog.V(4).Infof("Generating ref for container %s: %#v", container.Name, ref)
// For a new container, the RestartCount should be 0
restartCount := 0
containerStatus := podStatus.FindContainerStatusByName(container.Name)
if containerStatus != nil {
restartCount = containerStatus.RestartCount + 1
}
containerConfig, err := m.generateContainerConfig(container, pod, restartCount, podIP, imageRef)
if err != nil {
m.recordContainerEvent(pod, container, "", v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return grpc.ErrorDesc(err), ErrCreateContainerConfig
}
containerID, err := m.runtimeService.CreateContainer(podSandboxID, containerConfig, podSandboxConfig)
if err != nil {
m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToCreateContainer, "Error: %v", grpc.ErrorDesc(err))
return grpc.ErrorDesc(err), ErrCreateContainer
}
err = m.internalLifecycle.PreStartContainer(pod, container, containerID)
if err != nil {
m.recorder.Eventf(ref, v1.EventTypeWarning, events.FailedToStartContainer, "Internal PreStartContainer hook failed: %v", err)
return "Internal PreStartContainer hook failed", err
}
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.CreatedContainer, "Created container")
if ref != nil {
m.containerRefManager.SetRef(kubecontainer.ContainerID{
Type: m.runtimeName,
ID: containerID,
}, ref)
}
// Step 3: start the container.
err = m.runtimeService.StartContainer(containerID)
if err != nil {
m.recordContainerEvent(pod, container, containerID, v1.EventTypeWarning, events.FailedToStartContainer, "Error: %v", grpc.ErrorDesc(err))
return grpc.ErrorDesc(err), kubecontainer.ErrRunContainer
}
m.recordContainerEvent(pod, container, containerID, v1.EventTypeNormal, events.StartedContainer, "Started container")
// Symlink container logs to the legacy container log location for cluster logging
// support.
// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
containerMeta := containerConfig.GetMetadata()
sandboxMeta := podSandboxConfig.GetMetadata()
legacySymlink := legacyLogSymlink(containerID, containerMeta.Name, sandboxMeta.Name,
sandboxMeta.Namespace)
containerLog := filepath.Join(podSandboxConfig.LogDirectory, containerConfig.LogPath)
if err := m.osInterface.Symlink(containerLog, legacySymlink); err != nil {
glog.Errorf("Failed to create legacy symbolic link %q to container %q log %q: %v",
legacySymlink, containerID, containerLog, err)
}
// Step 4: execute the post start hook.
if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
kubeContainerID := kubecontainer.ContainerID{
Type: m.runtimeName,
ID: containerID,
}
msg, handlerErr := m.runner.Run(kubeContainerID, pod, container, container.Lifecycle.PostStart)
if handlerErr != nil {
m.recordContainerEvent(pod, container, kubeContainerID.ID, v1.EventTypeWarning, events.FailedPostStartHook, msg)
if err := m.killContainer(pod, kubeContainerID, container.Name, "FailedPostStartHook", nil); err != nil {
glog.Errorf("Failed to kill container %q(id=%q) in pod %q: %v, %v",
container.Name, kubeContainerID.String(), format.Pod(pod), ErrPostStartHook, err)
}
return msg, ErrPostStartHook
}
}
return "", nil
}
// generateContainerConfig generates container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateContainerConfig(container *v1.Container, pod *v1.Pod, restartCount int, podIP, imageRef string) (*runtimeapi.ContainerConfig, error) {
opts, err := m.runtimeHelper.GenerateRunContainerOptions(pod, container, podIP)
if err != nil {
return nil, err
}
uid, username, err := m.getImageUser(container.Image)
if err != nil {
return nil, err
}
if uid != nil {
// Verify RunAsNonRoot. Non-root verification only supports numeric user.
if err := verifyRunAsNonRoot(pod, container, *uid); err != nil {
return nil, err
}
} else if username != "" {
glog.Warningf("Non-root verification doesn't support non-numeric user (%s)", username)
}
command, args := kubecontainer.ExpandContainerCommandAndArgs(container, opts.Envs)
containerLogsPath := buildContainerLogsPath(container.Name, restartCount)
restartCountUint32 := uint32(restartCount)
config := &runtimeapi.ContainerConfig{
Metadata: &runtimeapi.ContainerMetadata{
Name: container.Name,
Attempt: restartCountUint32,
},
Image: &runtimeapi.ImageSpec{Image: imageRef},
Command: command,
Args: args,
WorkingDir: container.WorkingDir,
Labels: newContainerLabels(container, pod),
Annotations: newContainerAnnotations(container, pod, restartCount),
Devices: makeDevices(opts),
Mounts: m.makeMounts(opts, container),
LogPath: containerLogsPath,
Stdin: container.Stdin,
StdinOnce: container.StdinOnce,
Tty: container.TTY,
Linux: m.generateLinuxContainerConfig(container, pod, uid, username),
}
// set environment variables
envs := make([]*runtimeapi.KeyValue, len(opts.Envs))
for idx := range opts.Envs {
e := opts.Envs[idx]
envs[idx] = &runtimeapi.KeyValue{
Key: e.Name,
Value: e.Value,
}
}
config.Envs = envs
return config, nil
}
// generateLinuxContainerConfig generates linux container config for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) generateLinuxContainerConfig(container *v1.Container, pod *v1.Pod, uid *int64, username string) *runtimeapi.LinuxContainerConfig {
lc := &runtimeapi.LinuxContainerConfig{
Resources: &runtimeapi.LinuxContainerResources{},
SecurityContext: m.determineEffectiveSecurityContext(pod, container, uid, username),
}
// set linux container resources
var cpuShares int64
cpuRequest := container.Resources.Requests.Cpu()
cpuLimit := container.Resources.Limits.Cpu()
memoryLimit := container.Resources.Limits.Memory().Value()
oomScoreAdj := int64(qos.GetContainerOOMScoreAdjust(pod, container,
int64(m.machineInfo.MemoryCapacity)))
// If request is not specified, but limit is, we want request to default to limit.
// API server does this for new containers, but we repeat this logic in Kubelet
// for containers running on existing Kubernetes clusters.
if cpuRequest.IsZero() && !cpuLimit.IsZero() {
cpuShares = milliCPUToShares(cpuLimit.MilliValue())
} else {
// if cpuRequest.Amount is nil, then milliCPUToShares will return the minimal number
// of CPU shares.
cpuShares = milliCPUToShares(cpuRequest.MilliValue())
}
lc.Resources.CpuShares = cpuShares
if memoryLimit != 0 {
lc.Resources.MemoryLimitInBytes = memoryLimit
}
// Set OOM score of the container based on qos policy. Processes in lower-priority pods should
// be killed first if the system runs out of memory.
lc.Resources.OomScoreAdj = oomScoreAdj
if m.cpuCFSQuota {
// if cpuLimit.Amount is nil, then the appropriate default value is returned
// to allow full usage of cpu resource.
cpuQuota, cpuPeriod := milliCPUToQuota(cpuLimit.MilliValue())
lc.Resources.CpuQuota = cpuQuota
lc.Resources.CpuPeriod = cpuPeriod
}
return lc
}
// makeDevices generates container devices for kubelet runtime v1.
func makeDevices(opts *kubecontainer.RunContainerOptions) []*runtimeapi.Device {
devices := make([]*runtimeapi.Device, len(opts.Devices))
for idx := range opts.Devices {
device := opts.Devices[idx]
devices[idx] = &runtimeapi.Device{
HostPath: device.PathOnHost,
ContainerPath: device.PathInContainer,
Permissions: device.Permissions,
}
}
return devices
}
// makeMounts generates container volume mounts for kubelet runtime v1.
func (m *kubeGenericRuntimeManager) makeMounts(opts *kubecontainer.RunContainerOptions, container *v1.Container) []*runtimeapi.Mount {
volumeMounts := []*runtimeapi.Mount{}
for idx := range opts.Mounts {
v := opts.Mounts[idx]
selinuxRelabel := v.SELinuxRelabel && selinux.SELinuxEnabled()
mount := &runtimeapi.Mount{
HostPath: v.HostPath,
ContainerPath: v.ContainerPath,
Readonly: v.ReadOnly,
SelinuxRelabel: selinuxRelabel,
Propagation: v.Propagation,
}
volumeMounts = append(volumeMounts, mount)
}
// The reason we create and mount the log file in here (not in kubelet) is because
// the file's location depends on the ID of the container, and we need to create and
// mount the file before actually starting the container.
if opts.PodContainerDir != "" && len(container.TerminationMessagePath) != 0 {
// Because the PodContainerDir contains pod uid and container name which is unique enough,
// here we just add a random id to make the path unique for different instances
// of the same container.
cid := makeUID()
containerLogPath := filepath.Join(opts.PodContainerDir, cid)
fs, err := m.osInterface.Create(containerLogPath)
if err != nil {
utilruntime.HandleError(fmt.Errorf("error on creating termination-log file %q: %v", containerLogPath, err))
} else {
fs.Close()
// Chmod is needed because ioutil.WriteFile() ends up calling
// open(2) to create the file, so the final mode used is "mode &
// ~umask". But we want to make sure the specified mode is used
// in the file no matter what the umask is.
if err := m.osInterface.Chmod(containerLogPath, 0666); err != nil {
utilruntime.HandleError(fmt.Errorf("unable to set termination-log file permissions %q: %v", containerLogPath, err))
}
selinuxRelabel := selinux.SELinuxEnabled()
volumeMounts = append(volumeMounts, &runtimeapi.Mount{
HostPath: containerLogPath,
ContainerPath: container.TerminationMessagePath,
SelinuxRelabel: selinuxRelabel,
})
}
}
return volumeMounts
}
// getKubeletContainers lists containers managed by kubelet.
// The boolean parameter specifies whether returns all containers including
// those already exited and dead containers (used for garbage collection).
func (m *kubeGenericRuntimeManager) getKubeletContainers(allContainers bool) ([]*runtimeapi.Container, error) {
filter := &runtimeapi.ContainerFilter{}
if !allContainers {
filter.State = &runtimeapi.ContainerStateValue{
State: runtimeapi.ContainerState_CONTAINER_RUNNING,
}
}
containers, err := m.runtimeService.ListContainers(filter)
if err != nil {
glog.Errorf("getKubeletContainers failed: %v", err)
return nil, err
}
return containers, nil
}
// makeUID returns a randomly generated string.
func makeUID() string {
return fmt.Sprintf("%08x", rand.Uint32())
}
// getTerminationMessage looks on the filesystem for the provided termination message path, returning a limited
// amount of those bytes, or returns true if the logs should be checked.
func getTerminationMessage(status *runtimeapi.ContainerStatus, terminationMessagePath string, fallbackToLogs bool) (string, bool) {
if len(terminationMessagePath) == 0 {
return "", fallbackToLogs
}
for _, mount := range status.Mounts {
if mount.ContainerPath != terminationMessagePath {
continue
}
path := mount.HostPath
data, _, err := tail.ReadAtMost(path, kubecontainer.MaxContainerTerminationMessageLength)
if err != nil {
if os.IsNotExist(err) {
return "", fallbackToLogs
}
return fmt.Sprintf("Error on reading termination log %s: %v", path, err), false
}
return string(data), (fallbackToLogs && len(data) == 0)
}
return "", fallbackToLogs
}
// readLastStringFromContainerLogs attempts to read up to the max log length from the end of the CRI log represented
// by path. It reads up to max log lines.
func (m *kubeGenericRuntimeManager) readLastStringFromContainerLogs(path string) string {
value := int64(kubecontainer.MaxContainerTerminationMessageLogLines)
buf, _ := circbuf.NewBuffer(kubecontainer.MaxContainerTerminationMessageLogLength)
if err := m.ReadLogs(path, "", &v1.PodLogOptions{TailLines: &value}, buf, buf); err != nil {
return fmt.Sprintf("Error on reading termination message from logs: %v", err)
}
return buf.String()
}
// getPodContainerStatuses gets all containers' statuses for the pod.
func (m *kubeGenericRuntimeManager) getPodContainerStatuses(uid kubetypes.UID, name, namespace string) ([]*kubecontainer.ContainerStatus, error) {
// Select all containers of the given pod.
containers, err := m.runtimeService.ListContainers(&runtimeapi.ContainerFilter{
LabelSelector: map[string]string{types.KubernetesPodUIDLabel: string(uid)},
})
if err != nil {
glog.Errorf("ListContainers error: %v", err)
return nil, err
}
statuses := make([]*kubecontainer.ContainerStatus, len(containers))
// TODO: optimization: set maximum number of containers per container name to examine.
for i, c := range containers {
status, err := m.runtimeService.ContainerStatus(c.Id)
if err != nil {
glog.Errorf("ContainerStatus for %s error: %v", c.Id, err)
return nil, err
}
cStatus := toKubeContainerStatus(status, m.runtimeName)
if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
// Populate the termination message if needed.
annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
labeledInfo := getContainerInfoFromLabels(status.Labels)
fallbackToLogs := annotatedInfo.TerminationMessagePolicy == v1.TerminationMessageFallbackToLogsOnError && cStatus.ExitCode != 0
tMessage, checkLogs := getTerminationMessage(status, annotatedInfo.TerminationMessagePath, fallbackToLogs)
if checkLogs {
// if dockerLegacyService is populated, we're supposed to use it to fetch logs
if m.legacyLogProvider != nil {
tMessage, err = m.legacyLogProvider.GetContainerLogTail(uid, name, namespace, kubecontainer.ContainerID{Type: m.runtimeName, ID: c.Id})
if err != nil {
tMessage = fmt.Sprintf("Error reading termination message from logs: %v", err)
}
} else {
path := buildFullContainerLogsPath(uid, labeledInfo.ContainerName, annotatedInfo.RestartCount)
tMessage = m.readLastStringFromContainerLogs(path)
}
}
// Use the termination message written by the application is not empty
if len(tMessage) != 0 {
cStatus.Message = tMessage
}
}
statuses[i] = cStatus
}
sort.Sort(containerStatusByCreated(statuses))
return statuses, nil
}
func toKubeContainerStatus(status *runtimeapi.ContainerStatus, runtimeName string) *kubecontainer.ContainerStatus {
annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
labeledInfo := getContainerInfoFromLabels(status.Labels)
cStatus := &kubecontainer.ContainerStatus{
ID: kubecontainer.ContainerID{
Type: runtimeName,
ID: status.Id,
},
Name: labeledInfo.ContainerName,
Image: status.Image.Image,
ImageID: status.ImageRef,
Hash: annotatedInfo.Hash,
RestartCount: annotatedInfo.RestartCount,
State: toKubeContainerState(status.State),
CreatedAt: time.Unix(0, status.CreatedAt),
}
if status.State != runtimeapi.ContainerState_CONTAINER_CREATED {
// If container is not in the created state, we have tried and
// started the container. Set the StartedAt time.
cStatus.StartedAt = time.Unix(0, status.StartedAt)
}
if status.State == runtimeapi.ContainerState_CONTAINER_EXITED {
cStatus.Reason = status.Reason
cStatus.Message = status.Message
cStatus.ExitCode = int(status.ExitCode)
cStatus.FinishedAt = time.Unix(0, status.FinishedAt)
}
return cStatus
}
// executePreStopHook runs the pre-stop lifecycle hooks if applicable and returns the duration it takes.
func (m *kubeGenericRuntimeManager) executePreStopHook(pod *v1.Pod, containerID kubecontainer.ContainerID, containerSpec *v1.Container, gracePeriod int64) int64 {
glog.V(3).Infof("Running preStop hook for container %q", containerID.String())
start := metav1.Now()
done := make(chan struct{})
go func() {
defer close(done)
defer utilruntime.HandleCrash()
if msg, err := m.runner.Run(containerID, pod, containerSpec, containerSpec.Lifecycle.PreStop); err != nil {
glog.Errorf("preStop hook for container %q failed: %v", containerSpec.Name, err)
m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeWarning, events.FailedPreStopHook, msg)
}
}()
select {
case <-time.After(time.Duration(gracePeriod) * time.Second):
glog.V(2).Infof("preStop hook for container %q did not complete in %d seconds", containerID, gracePeriod)
case <-done:
glog.V(3).Infof("preStop hook for container %q completed", containerID)
}
return int64(metav1.Now().Sub(start.Time).Seconds())
}
// restoreSpecsFromContainerLabels restores all information needed for killing a container. In some
// case we may not have pod and container spec when killing a container, e.g. pod is deleted during
// kubelet restart.
// To solve this problem, we've already written necessary information into container labels. Here we
// just need to retrieve them from container labels and restore the specs.
// TODO(random-liu): Add a node e2e test to test this behaviour.
// TODO(random-liu): Change the lifecycle handler to just accept information needed, so that we can
// just pass the needed function not create the fake object.
func (m *kubeGenericRuntimeManager) restoreSpecsFromContainerLabels(containerID kubecontainer.ContainerID) (*v1.Pod, *v1.Container, error) {
var pod *v1.Pod
var container *v1.Container
s, err := m.runtimeService.ContainerStatus(containerID.ID)
if err != nil {
return nil, nil, err
}
l := getContainerInfoFromLabels(s.Labels)
a := getContainerInfoFromAnnotations(s.Annotations)
// Notice that the followings are not full spec. The container killing code should not use
// un-restored fields.
pod = &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
UID: l.PodUID,
Name: l.PodName,
Namespace: l.PodNamespace,
DeletionGracePeriodSeconds: a.PodDeletionGracePeriod,
},
Spec: v1.PodSpec{
TerminationGracePeriodSeconds: a.PodTerminationGracePeriod,
},
}
container = &v1.Container{
Name: l.ContainerName,
Ports: a.ContainerPorts,
TerminationMessagePath: a.TerminationMessagePath,
}
if a.PreStopHandler != nil {
container.Lifecycle = &v1.Lifecycle{
PreStop: a.PreStopHandler,
}
}
return pod, container, nil
}
// killContainer kills a container through the following steps:
// * Run the pre-stop lifecycle hooks (if applicable).
// * Stop the container.
func (m *kubeGenericRuntimeManager) killContainer(pod *v1.Pod, containerID kubecontainer.ContainerID, containerName string, reason string, gracePeriodOverride *int64) error {
var containerSpec *v1.Container
if pod != nil {
if containerSpec = kubecontainer.GetContainerSpec(pod, containerName); containerSpec == nil {
return fmt.Errorf("failed to get containerSpec %q(id=%q) in pod %q when killing container for reason %q",
containerName, containerID.String(), format.Pod(pod), reason)
}
} else {
// Restore necessary information if one of the specs is nil.
restoredPod, restoredContainer, err := m.restoreSpecsFromContainerLabels(containerID)
if err != nil {
return err
}
pod, containerSpec = restoredPod, restoredContainer
}
// From this point , pod and container must be non-nil.
gracePeriod := int64(minimumGracePeriodInSeconds)
switch {
case pod.DeletionGracePeriodSeconds != nil:
gracePeriod = *pod.DeletionGracePeriodSeconds
case pod.Spec.TerminationGracePeriodSeconds != nil:
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
}
glog.V(2).Infof("Killing container %q with %d second grace period", containerID.String(), gracePeriod)
// Run internal pre-stop lifecycle hook
if err := m.internalLifecycle.PreStopContainer(containerID.ID); err != nil {
return err
}
// Run the pre-stop lifecycle hooks if applicable and if there is enough time to run it
if containerSpec.Lifecycle != nil && containerSpec.Lifecycle.PreStop != nil && gracePeriod > 0 {
gracePeriod = gracePeriod - m.executePreStopHook(pod, containerID, containerSpec, gracePeriod)
}
// always give containers a minimal shutdown window to avoid unnecessary SIGKILLs
if gracePeriod < minimumGracePeriodInSeconds {
gracePeriod = minimumGracePeriodInSeconds
}
if gracePeriodOverride != nil {
gracePeriod = *gracePeriodOverride
glog.V(3).Infof("Killing container %q, but using %d second grace period override", containerID, gracePeriod)
}
err := m.runtimeService.StopContainer(containerID.ID, gracePeriod)
if err != nil {
glog.Errorf("Container %q termination failed with gracePeriod %d: %v", containerID.String(), gracePeriod, err)
} else {
glog.V(3).Infof("Container %q exited normally", containerID.String())
}
message := fmt.Sprintf("Killing container with id %s", containerID.String())
if reason != "" {
message = fmt.Sprint(message, ":", reason)
}
m.recordContainerEvent(pod, containerSpec, containerID.ID, v1.EventTypeNormal, events.KillingContainer, message)
m.containerRefManager.ClearRef(containerID)
return err
}
// killContainersWithSyncResult kills all pod's containers with sync results.
func (m *kubeGenericRuntimeManager) killContainersWithSyncResult(pod *v1.Pod, runningPod kubecontainer.Pod, gracePeriodOverride *int64) (syncResults []*kubecontainer.SyncResult) {
containerResults := make(chan *kubecontainer.SyncResult, len(runningPod.Containers))
wg := sync.WaitGroup{}
wg.Add(len(runningPod.Containers))
for _, container := range runningPod.Containers {
go func(container *kubecontainer.Container) {
defer utilruntime.HandleCrash()
defer wg.Done()
killContainerResult := kubecontainer.NewSyncResult(kubecontainer.KillContainer, container.Name)
if err := m.killContainer(pod, container.ID, container.Name, "Need to kill Pod", gracePeriodOverride); err != nil {
killContainerResult.Fail(kubecontainer.ErrKillContainer, err.Error())
}
containerResults <- killContainerResult
}(container)
}
wg.Wait()
close(containerResults)
for containerResult := range containerResults {
syncResults = append(syncResults, containerResult)
}
return
}
// pruneInitContainersBeforeStart ensures that before we begin creating init
// containers, we have reduced the number of outstanding init containers still
// present. This reduces load on the container garbage collector by only
// preserving the most recent terminated init container.
func (m *kubeGenericRuntimeManager) pruneInitContainersBeforeStart(pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
// only the last execution of each init container should be preserved, and only preserve it if it is in the
// list of init containers to keep.
initContainerNames := sets.NewString()
for _, container := range pod.Spec.InitContainers {
initContainerNames.Insert(container.Name)
}
for name := range initContainerNames {
count := 0
for _, status := range podStatus.ContainerStatuses {
if status.Name != name || !initContainerNames.Has(status.Name) || status.State != kubecontainer.ContainerStateExited {
continue
}
count++
// keep the first init container for this name
if count == 1 {
continue
}
// prune all other init containers that match this container name
glog.V(4).Infof("Removing init container %q instance %q %d", status.Name, status.ID.ID, count)
if err := m.removeContainer(status.ID.ID); err != nil {
utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
continue
}
// remove any references to this container
if _, ok := m.containerRefManager.GetRef(status.ID); ok {
m.containerRefManager.ClearRef(status.ID)
} else {
glog.Warningf("No ref for container %q", status.ID)
}
}
}
}
// Remove all init containres. Note that this function does not check the state
// of the container because it assumes all init containers have been stopped
// before the call happens.
func (m *kubeGenericRuntimeManager) purgeInitContainers(pod *v1.Pod, podStatus *kubecontainer.PodStatus) {
initContainerNames := sets.NewString()
for _, container := range pod.Spec.InitContainers {
initContainerNames.Insert(container.Name)
}
for name := range initContainerNames {
count := 0
for _, status := range podStatus.ContainerStatuses {
if status.Name != name || !initContainerNames.Has(status.Name) {
continue
}
count++
// Purge all init containers that match this container name
glog.V(4).Infof("Removing init container %q instance %q %d", status.Name, status.ID.ID, count)
if err := m.removeContainer(status.ID.ID); err != nil {
utilruntime.HandleError(fmt.Errorf("failed to remove pod init container %q: %v; Skipping pod %q", status.Name, err, format.Pod(pod)))
continue
}
// Remove any references to this container
if _, ok := m.containerRefManager.GetRef(status.ID); ok {
m.containerRefManager.ClearRef(status.ID)
} else {
glog.Warningf("No ref for container %q", status.ID)
}
}
}
}
// findNextInitContainerToRun returns the status of the last failed container, the
// next init container to start, or done if there are no further init containers.
// Status is only returned if an init container is failed, in which case next will
// point to the current container.
func findNextInitContainerToRun(pod *v1.Pod, podStatus *kubecontainer.PodStatus) (status *kubecontainer.ContainerStatus, next *v1.Container, done bool) {
if len(pod.Spec.InitContainers) == 0 {
return nil, nil, true
}
// If there are failed containers, return the status of the last failed one.
for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
container := &pod.Spec.InitContainers[i]
status := podStatus.FindContainerStatusByName(container.Name)
if status != nil && isContainerFailed(status) {
return status, container, false
}
}
// There are no failed containers now.
for i := len(pod.Spec.InitContainers) - 1; i >= 0; i-- {
container := &pod.Spec.InitContainers[i]
status := podStatus.FindContainerStatusByName(container.Name)
if status == nil {
continue
}
// container is still running, return not done.
if status.State == kubecontainer.ContainerStateRunning {
return nil, nil, false
}
if status.State == kubecontainer.ContainerStateExited {
// all init containers successful
if i == (len(pod.Spec.InitContainers) - 1) {
return nil, nil, true
}
// all containers up to i successful, go to i+1
return nil, &pod.Spec.InitContainers[i+1], false
}
}
return nil, &pod.Spec.InitContainers[0], false
}
// GetContainerLogs returns logs of a specific container.
func (m *kubeGenericRuntimeManager) GetContainerLogs(pod *v1.Pod, containerID kubecontainer.ContainerID, logOptions *v1.PodLogOptions, stdout, stderr io.Writer) (err error) {
status, err := m.runtimeService.ContainerStatus(containerID.ID)
if err != nil {
return fmt.Errorf("failed to get container status %q: %v", containerID, err)
}
labeledInfo := getContainerInfoFromLabels(status.Labels)
annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
path := buildFullContainerLogsPath(pod.UID, labeledInfo.ContainerName, annotatedInfo.RestartCount)
return m.ReadLogs(path, containerID.ID, logOptions, stdout, stderr)
}
// GetExec gets the endpoint the runtime will serve the exec request from.
func (m *kubeGenericRuntimeManager) GetExec(id kubecontainer.ContainerID, cmd []string, stdin, stdout, stderr, tty bool) (*url.URL, error) {
req := &runtimeapi.ExecRequest{
ContainerId: id.ID,
Cmd: cmd,
Tty: tty,
Stdin: stdin,
Stdout: stdout,
Stderr: stderr,
}
resp, err := m.runtimeService.Exec(req)
if err != nil {
return nil, err
}
return url.Parse(resp.Url)
}
// GetAttach gets the endpoint the runtime will serve the attach request from.
func (m *kubeGenericRuntimeManager) GetAttach(id kubecontainer.ContainerID, stdin, stdout, stderr, tty bool) (*url.URL, error) {
req := &runtimeapi.AttachRequest{
ContainerId: id.ID,
Stdin: stdin,
Stdout: stdout,
Stderr: stderr,
Tty: tty,
}
resp, err := m.runtimeService.Attach(req)
if err != nil {
return nil, err
}
return url.Parse(resp.Url)
}
// RunInContainer synchronously executes the command in the container, and returns the output.
func (m *kubeGenericRuntimeManager) RunInContainer(id kubecontainer.ContainerID, cmd []string, timeout time.Duration) ([]byte, error) {
stdout, stderr, err := m.runtimeService.ExecSync(id.ID, cmd, timeout)
// NOTE(tallclair): This does not correctly interleave stdout & stderr, but should be sufficient
// for logging purposes. A combined output option will need to be added to the ExecSyncRequest
// if more precise output ordering is ever required.
return append(stdout, stderr...), err
}
// removeContainer removes the container and the container logs.
// Notice that we remove the container logs first, so that container will not be removed if
// container logs are failed to be removed, and kubelet will retry this later. This guarantees
// that container logs to be removed with the container.
// Notice that we assume that the container should only be removed in non-running state, and
// it will not write container logs anymore in that state.
func (m *kubeGenericRuntimeManager) removeContainer(containerID string) error {
glog.V(4).Infof("Removing container %q", containerID)
// Call internal container post-stop lifecycle hook.
if err := m.internalLifecycle.PostStopContainer(containerID); err != nil {
return err
}
// Remove the container log.
// TODO: Separate log and container lifecycle management.
if err := m.removeContainerLog(containerID); err != nil {
return err
}
// Remove the container.
return m.runtimeService.RemoveContainer(containerID)
}
// removeContainerLog removes the container log.
func (m *kubeGenericRuntimeManager) removeContainerLog(containerID string) error {
// Remove the container log.
status, err := m.runtimeService.ContainerStatus(containerID)
if err != nil {
return fmt.Errorf("failed to get container status %q: %v", containerID, err)
}
labeledInfo := getContainerInfoFromLabels(status.Labels)
annotatedInfo := getContainerInfoFromAnnotations(status.Annotations)
path := buildFullContainerLogsPath(labeledInfo.PodUID, labeledInfo.ContainerName, annotatedInfo.RestartCount)
if err := m.osInterface.Remove(path); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to remove container %q log %q: %v", containerID, path, err)
}
// Remove the legacy container log symlink.
// TODO(random-liu): Remove this after cluster logging supports CRI container log path.
legacySymlink := legacyLogSymlink(containerID, labeledInfo.ContainerName, labeledInfo.PodName,
labeledInfo.PodNamespace)
if err := m.osInterface.Remove(legacySymlink); err != nil && !os.IsNotExist(err) {
return fmt.Errorf("failed to remove container %q log legacy symbolic link %q: %v",
containerID, legacySymlink, err)
}
return nil
}
// DeleteContainer removes a container.
func (m *kubeGenericRuntimeManager) DeleteContainer(containerID kubecontainer.ContainerID) error {
return m.removeContainer(containerID.ID)
}