Fix issue with 100% CPU usage in logs.go.
Resolves: GoogleContainerTools#531
See also: kubernetes/client-go#12

There is an issue in which the Pods watcher gets into an infinite tight
loop and begins consuming 100% of the CPU. This happens after `skaffold
dev` has been running for a while (~30 minutes), and once it starts, it
doesn't stop.

The issue was narrowed down by @ajbouh to the event polling loop in
`logs.go`, which was not checking whether `ResultChan()` had been closed.
Kubernetes closes the watch connection after a timeout (by default
somewhere in the range of 30-60 minutes, according to the related issue
linked above). When that happens, the intended remedy is to start the
watcher again.
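
For background (standard Go semantics, not code from this change): a receive from a closed channel never blocks and immediately yields the zero value, so a `select` that keeps receiving from a closed `ResultChan()` turns into a busy loop. The comma-ok receive is what detects the closure:

package main

import "fmt"

func main() {
	ch := make(chan int)
	close(ch)

	// Receiving from a closed channel returns immediately with the zero
	// value, so a bare receive inside a loop spins at 100% CPU.
	fmt.Println(<-ch) // 0

	// The comma-ok form reports ok=false once the channel is closed,
	// which is exactly the check the polling loop was missing.
	v, ok := <-ch
	fmt.Println(v, ok) // 0 false
}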

This refactors the polling into two nested loops: one to start (and
restart) the Pods watcher itself, and another to receive and process the
events from the watcher. If `ResultChan()` is closed, the watcher loop is
restarted and log tailing continues.
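
The labeled-loop shape, as a standalone sketch with hypothetical `tail`, `newWatch`, and `handle` names (not the skaffold code): the outer labeled loop re-creates the event source, the inner loop drains it, and `continue retry` restarts the watch when the channel closes.

package main

import (
	"context"
	"log"
)

// tail demonstrates the two-loop restart pattern described above.
func tail(ctx context.Context, newWatch func() (<-chan string, error), handle func(string)) {
retry:
	for {
		events, err := newWatch() // (re)create the event source
		if err != nil {
			log.Printf("starting watch: %v", err)
			return
		}
		for {
			select {
			case <-ctx.Done():
				return
			case evt, ok := <-events:
				if !ok {
					// The server closed the stream; restart the watch.
					continue retry
				}
				handle(evt)
			}
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	n := 0
	newWatch := func() (<-chan string, error) {
		n++
		if n > 2 {
			cancel()
			return make(chan string), nil // nothing ready; ctx.Done() ends tail
		}
		ch := make(chan string, 1)
		ch <- "event"
		close(ch) // simulates the server-side watch timeout
		return ch, nil
	}
	tail(ctx, newWatch, func(e string) { log.Println("got", e) })
}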

There is a subtle difference in error handling as a result of this
change. Previously, any error returned from `client.Pods("").Watch()`
would be returned immediately from the `Watch()` func in `logs.go`. That
is no longer possible, since the watcher is now initialized inside the
goroutine started by that func. So if the watcher cannot be initialized,
we simply log the error and stop tailing logs. Open to suggestions for a
better way to handle this error; retrying in a tight loop seems
potentially problematic in the error scenario.
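
If it helps, here is one possible shape for that, sketched purely as a suggestion and not as part of this commit: retry the watcher initialization with a capped exponential backoff instead of giving up, so transient apiserver errors don't end log tailing but also don't spin. `watchWithBackoff` and `newWatcher` are hypothetical names; `newWatcher` stands in for `client.Pods("").Watch(...)`.

package kubernetes

import (
	"context"
	"time"

	"github.com/sirupsen/logrus"
	"k8s.io/apimachinery/pkg/watch"
)

// watchWithBackoff retries watcher initialization with a capped exponential
// backoff, returning early if the context is cancelled.
func watchWithBackoff(ctx context.Context, newWatcher func() (watch.Interface, error)) (watch.Interface, error) {
	backoff := time.Second
	const maxBackoff = 30 * time.Second
	for {
		w, err := newWatcher()
		if err == nil {
			return w, nil
		}
		logrus.Warnf("initializing pod watcher (retrying in %v): %v", backoff, err)
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-time.After(backoff):
		}
		backoff *= 2
		if backoff > maxBackoff {
			backoff = maxBackoff
		}
	}
}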
d11wtq committed Jun 19, 2018
1 parent 86c7179 commit b4b6c6f
Showing 1 changed file with 33 additions and 22 deletions.
pkg/skaffold/kubernetes/log.go: 33 additions & 22 deletions
@@ -66,32 +66,43 @@ func (a *LogAggregator) Start(ctx context.Context) error {
 	}
 	client := kubeclient.CoreV1()
 
-	a.startTime = time.Now()
-
-	watcher, err := client.Pods("").Watch(meta_v1.ListOptions{
-		IncludeUninitialized: true,
-	})
-	if err != nil {
-		return err
-	}
-
 	go func() {
+	retryLoop:
 		for {
-			select {
-			case <-ctx.Done():
-				return
-			case evt := <-watcher.ResultChan():
-				if evt.Type != watch.Added && evt.Type != watch.Modified {
-					continue
-				}
-
-				pod, ok := evt.Object.(*v1.Pod)
-				if !ok {
-					continue
-				}
-
-				if a.podSelector.Select(pod) {
-					a.streamLogs(ctx, client, pod)
-				}
+			a.startTime = time.Now()
+
+			watcher, err := client.Pods("").Watch(meta_v1.ListOptions{
+				IncludeUninitialized: true,
+			})
+
+			if err != nil {
+				logrus.Errorf("initializing pod watcher %s", err)
+				return
+			}
+
+		eventLoop:
+			for {
+				select {
+				case <-ctx.Done():
+					return
+				case evt, ok := <-watcher.ResultChan():
+					if !ok {
+						// expected: server connection timeout
+						continue retryLoop
+					}
+
+					if evt.Type != watch.Added && evt.Type != watch.Modified {
+						continue eventLoop
+					}
+
+					pod, ok := evt.Object.(*v1.Pod)
+					if !ok {
+						continue eventLoop
+					}
+
+					if a.podSelector.Select(pod) {
+						a.streamLogs(ctx, client, pod)
+					}
+				}
 			}
 		}
