-
Notifications
You must be signed in to change notification settings - Fork 800
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix the issue that io delay injected does not take effect #121
Changes from 3 commits
2442712
56c0e96
89189f9
7ab71cc
a9e2259
11de512
52d49c2
1286047
9e436b2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ import ( | |
"golang.org/x/sync/errgroup" | ||
|
||
"github.com/go-logr/logr" | ||
"github.com/golang/protobuf/ptypes/empty" | ||
|
||
"github.com/pingcap/chaos-mesh/api/v1alpha1" | ||
"github.com/pingcap/chaos-mesh/controllers/twophase" | ||
|
@@ -161,29 +162,31 @@ func (r *Reconciler) cleanFinalizersAndRecover(ctx context.Context, iochaos *v1a | |
func (r *Reconciler) recoverPod(ctx context.Context, pod *v1.Pod, iochaos *v1alpha1.IoChaos) error { | ||
r.Log.Info("Recovering", "namespace", pod.Namespace, "name", pod.Name) | ||
|
||
var ns v1.Namespace | ||
if err := r.Get(ctx, types.NamespacedName{Name: pod.Namespace}, &ns); err != nil { | ||
return err | ||
} | ||
cctx, cancel := context.WithTimeout(ctx, 2*time.Minute) | ||
defer cancel() | ||
err := wait.PollUntil(2*time.Second, func() (bool, error) { | ||
if err := r.recoverInjectAction(ctx, pod, iochaos); err != nil { | ||
if utils.IsCaredNetError(err) { | ||
r.Log.Info("Recover I/O chaos action, network is not ok, retrying...", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return false, nil | ||
} | ||
|
||
annotations := ns.GetAnnotations() | ||
if annotations == nil { | ||
annotations = make(map[string]string) | ||
} | ||
return false, err | ||
} | ||
|
||
if _, ok := annotations[v1alpha1.WebhookInitPodAnnotationKey]; ok { | ||
return r.recoverInjectAction(ctx, pod, iochaos) | ||
} | ||
r.Log.Info("Recover I/O chaos action successfully") | ||
|
||
if err := utils.UnsetIoInjection(ctx, r.Client, pod, iochaos); err != nil { | ||
r.Log.Error(err, "failed to unset I/O injection", | ||
return true, nil | ||
}, cctx.Done()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When network is not ok, Poll loop and There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. timeout err |
||
|
||
if err != nil { | ||
r.Log.Error(err, "failed to recover I/O chaos action", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return err | ||
} | ||
|
||
return r.Delete(ctx, pod, &client.DeleteOptions{ | ||
GracePeriodSeconds: new(int64), | ||
}) | ||
return nil | ||
} | ||
|
||
func (r *Reconciler) injectAllPods(ctx context.Context, pods []v1.Pod, iochaos *v1alpha1.IoChaos) error { | ||
|
@@ -210,67 +213,29 @@ func (r *Reconciler) injectAllPods(ctx context.Context, pods []v1.Pod, iochaos * | |
func (r *Reconciler) injectPod(ctx context.Context, pod *v1.Pod, iochaos *v1alpha1.IoChaos) error { | ||
r.Log.Info("Inject I/O chaos action", "namespace", pod.Namespace, "name", pod.Name) | ||
|
||
if err := utils.SetIoInjection(ctx, r.Client, pod, iochaos); err != nil { | ||
r.Log.Error(err, "failed to set I/O injection", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return err | ||
} | ||
|
||
var ns v1.Namespace | ||
if err := r.Get(ctx, types.NamespacedName{Name: pod.Namespace}, &ns); err != nil { | ||
return err | ||
} | ||
|
||
annotations := ns.GetAnnotations() | ||
if annotations == nil { | ||
annotations = make(map[string]string) | ||
} | ||
|
||
if _, ok := annotations[v1alpha1.WebhookInitPodAnnotationKey]; !ok { | ||
// need to recreate pod when to inject sidecar | ||
time.Sleep(1 * time.Second) | ||
err := r.Delete(ctx, pod, &client.DeleteOptions{ | ||
GracePeriodSeconds: new(int64), | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
|
||
// TODO: optimize inject action | ||
go func() { | ||
cctx, cancel := context.WithTimeout(ctx, 5*time.Minute) | ||
defer cancel() | ||
err := wait.PollUntil(2*time.Second, func() (bool, error) { | ||
var npod v1.Pod | ||
err := r.Client.Get(ctx, types.NamespacedName{ | ||
Namespace: pod.Namespace, | ||
Name: pod.Name, | ||
}, &npod) | ||
if err != nil { | ||
r.Log.Error(err, "failed to get pod", "namespace", pod.Namespace, "name", pod.Name) | ||
cctx, cancel := context.WithTimeout(ctx, 2*time.Minute) | ||
defer cancel() | ||
err := wait.PollUntil(2*time.Second, func() (bool, error) { | ||
if err := r.injectAction(ctx, pod, iochaos); err != nil { | ||
if utils.IsCaredNetError(err) { | ||
r.Log.Info("Inject I/O chaos action, network is not ok, retrying...", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return false, nil | ||
} | ||
|
||
if err := r.injectAction(ctx, &npod, iochaos); err != nil { | ||
if utils.IsCaredNetError(err) { | ||
r.Log.Info("Inject I/O chaos action, network is not ok, retrying...", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return false, nil | ||
} | ||
return false, err | ||
} | ||
|
||
return false, err | ||
} | ||
r.Log.Info("Inject I/O chaos action successfully") | ||
|
||
r.Log.Info("Inject I/O chaos action successfully") | ||
return true, nil | ||
}, cctx.Done()) | ||
|
||
return true, nil | ||
}, cctx.Done()) | ||
if err != nil { | ||
r.Log.Error(err, "failed to inject I/O chaos action", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
} | ||
}() | ||
if err != nil { | ||
r.Log.Error(err, "failed to inject I/O chaos action", | ||
"namespace", pod.Namespace, "name", pod.Name) | ||
return err | ||
} | ||
|
||
return nil | ||
} | ||
|
@@ -294,7 +259,13 @@ func (r *Reconciler) injectAction(ctx context.Context, pod *v1.Pod, iochaos *v1a | |
return err | ||
} | ||
|
||
_, err = cli.SetFault(ctx, req) | ||
if len(req.Methods) > 0 { | ||
_, err = cli.SetFault(ctx, req) | ||
return err | ||
} | ||
|
||
// inject fault to all methods if the the methods is empty. | ||
_, err = cli.SetFaultAll(ctx, req) | ||
return err | ||
} | ||
|
||
|
@@ -311,6 +282,6 @@ func (r *Reconciler) recoverInjectAction(ctx context.Context, pod *v1.Pod, iocha | |
return err | ||
} | ||
|
||
_, err = cli.RecoverAll(ctx, nil) | ||
_, err = cli.RecoverAll(ctx, &empty.Empty{}) | ||
return err | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why add the default branch? this may cause there exist multiple configWatchers that notify the eventsCh at the same time?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, this
branch
just used to restartconfigWatcher.Watch
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I misunderstood the stopCh before, I thought it was used by configWatcher to notify the outer goroutine that it had been finished.
If
configWatcher.Watch
oreventsCh
first received the msg from stopCh before the outer goroutine, theconfigWatcher.Watch
will be restarted forever because case <- stopCh at line 187 won't receive msg again? how to solve this problem?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
stopCh
will be closed, so all receiver will receive this event.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got, that's ok
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Restarting is ok for ErrWatchChannelClosed case, is that ok for other err?