Skip to content

Commit

Permalink
Health 1 (#463)
Browse files Browse the repository at this point in the history
* rework healthcontroller timeouts

* change static duration

* varname
  • Loading branch information
roffe authored and murali-reddy committed Jun 21, 2018
1 parent 58da2d4 commit 17f92de
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 28 deletions.
14 changes: 7 additions & 7 deletions pkg/cmd/kube-router.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,13 @@ func (kr *KubeRouter) Run() error {

stopCh := make(chan struct{})

hc, err := healthcheck.NewHealthController(kr.Config)
if err != nil {
return errors.New("Failed to create health controller: " + err.Error())
}
wg.Add(1)
go hc.Run(healthChan, stopCh, &wg)

if !(kr.Config.RunFirewall || kr.Config.RunServiceProxy || kr.Config.RunRouter) {
glog.Info("Router, Firewall or Service proxy functionality must be specified. Exiting!")
os.Exit(0)
Expand Down Expand Up @@ -158,13 +165,6 @@ func (kr *KubeRouter) Run() error {
go nsc.Run(healthChan, stopCh, &wg)
}

hc, err := healthcheck.NewHealthController(kr.Config)
if err != nil {
return errors.New("Failed to create health controller: " + err.Error())
}
wg.Add(1)
go hc.Run(healthChan, stopCh, &wg)

// Handle SIGINT and SIGTERM
ch := make(chan os.Signal)
signal.Notify(ch, syscall.SIGINT, syscall.SIGTERM)
Expand Down
58 changes: 37 additions & 21 deletions pkg/healthcheck/health_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,14 @@ type HealthController struct {
//HealthStats holds the latest heartbeats
type HealthStats struct {
sync.Mutex
Healthy bool
MetricsControllerAlive time.Time
NetworkPolicyControllerAlive time.Time
NetworkRoutingControllerAlive time.Time
NetworkServicesControllerAlive time.Time
Healthy bool
MetricsControllerAlive time.Time
NetworkPolicyControllerAlive time.Time
NetworkPolicyControllerAliveTTL time.Duration
NetworkRoutingControllerAlive time.Time
NetworkRoutingControllerAliveTTL time.Duration
NetworkServicesControllerAlive time.Time
NetworkServicesControllerAliveTTL time.Duration
}

//SendHeartBeat sends a heartbeat on the passed channel
Expand Down Expand Up @@ -73,37 +76,53 @@ func (hc *HealthController) HandleHeartbeat(beat *ControllerHeartbeat) {
hc.Status.Lock()
defer hc.Status.Unlock()

switch component := beat.Component; component {
case "NSC":
hc.Status.NetworkServicesControllerAlive = time.Now()
case "NRC":
hc.Status.NetworkRoutingControllerAlive = time.Now()
case "NPC":
hc.Status.NetworkPolicyControllerAlive = time.Now()
case "MC":
hc.Status.MetricsControllerAlive = time.Now()
switch {
// The first heartbeat sets the initial grace time the controller has to report in. A static time is also added when checking, to allow for load variation in sync time.
case beat.Component == "NSC":
if hc.Status.NetworkServicesControllerAliveTTL == 0 {
hc.Status.NetworkServicesControllerAliveTTL = time.Since(hc.Status.NetworkServicesControllerAlive)
}
hc.Status.NetworkServicesControllerAlive = beat.LastHeartBeat

case beat.Component == "NRC":
if hc.Status.NetworkRoutingControllerAliveTTL == 0 {
hc.Status.NetworkRoutingControllerAliveTTL = time.Since(hc.Status.NetworkRoutingControllerAlive)
}
hc.Status.NetworkRoutingControllerAlive = beat.LastHeartBeat

case beat.Component == "NPC":
if hc.Status.NetworkPolicyControllerAliveTTL == 0 {
hc.Status.NetworkPolicyControllerAliveTTL = time.Since(hc.Status.NetworkPolicyControllerAlive)
}
hc.Status.NetworkPolicyControllerAlive = beat.LastHeartBeat

case beat.Component == "MC":
hc.Status.MetricsControllerAlive = beat.LastHeartBeat
}
}

//CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
// CheckHealth evaluates the time since last heartbeat to decide if the controller is running or not
func (hc *HealthController) CheckHealth() bool {
health := true
graceTime := time.Duration(1500 * time.Millisecond)

if hc.Config.RunFirewall {
if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+5*time.Second {

if time.Since(hc.Status.NetworkPolicyControllerAlive) > hc.Config.IPTablesSyncPeriod+hc.Status.NetworkPolicyControllerAliveTTL+graceTime {
glog.Error("Network Policy Controller heartbeat missed")
health = false
}
}

if hc.Config.RunRouter {
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+5*time.Second {
if time.Since(hc.Status.NetworkRoutingControllerAlive) > hc.Config.RoutesSyncPeriod+hc.Status.NetworkRoutingControllerAliveTTL+graceTime {
glog.Error("Network Routing Controller heartbeat missed")
health = false
}
}

if hc.Config.RunServiceProxy {
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+5*time.Second {
if time.Since(hc.Status.NetworkServicesControllerAlive) > hc.Config.IpvsSyncPeriod+hc.Status.NetworkServicesControllerAliveTTL+graceTime {
glog.Error("NetworkService Controller heartbeat missed")
health = false
}
Expand Down Expand Up @@ -143,9 +162,6 @@ func (hc *HealthController) Run(healthChan <-chan *ControllerHeartbeat, stopCh <
hc.HTTPEnabled = false
}

//Give the controllers a few seconds to start before checking health
time.Sleep(60 * time.Second)

for {
select {
case <-stopCh:
Expand Down

0 comments on commit 17f92de

Please sign in to comment.