Skip to content
This repository has been archived by the owner on Jan 21, 2022. It is now read-only.

Commit

Permalink
Health Manager v.2 (preview)
Browse files Browse the repository at this point in the history
This is a complete re-write of the vcap/health_manager component.

It seeks to be 100% backward compatible behaviorally.

See README.md file for details.

Change-Id: I892fa1532955431c11f2cdfda8d4d8b6f9d2728d
  • Loading branch information
Bob Nugmanov committed Apr 20, 2012
1 parent 861f58f commit 0e2bf2e
Show file tree
Hide file tree
Showing 30 changed files with 2,410 additions and 1 deletion.
15 changes: 15 additions & 0 deletions .gitignore
@@ -0,0 +1,15 @@
/vendor/cache/*
*~
.idea
\#*\#
.\#*
.bundle
bundler
spec_reports
spec_coverage
ci-artifacts-dir
ci-working-dir
*.rbc
*.swp
.rvmrc
*.pid
13 changes: 13 additions & 0 deletions Gemfile
@@ -0,0 +1,13 @@
# Gem dependencies for the Health Manager, resolved by Bundler.
source "http://rubygems.org"

# Build and test tooling.
gem "rake"
gem "rspec"

# Shared VCAP libraries.
gem "vcap_common", ">= 1.0.8"
gem "vcap_logging"

# EventMachine comes from the Cloud Foundry fork, on its release branch.
gem "eventmachine",
    :git    => "git://github.com/cloudfoundry/eventmachine.git",
    :branch => "release-0.12.11-cf"

# JSON parsing and HTTP clients.
gem "yajl-ruby"
gem "rest-client"
gem "em-http-request"
64 changes: 64 additions & 0 deletions Gemfile.lock
@@ -0,0 +1,64 @@
GIT
remote: git://github.com/cloudfoundry/eventmachine.git
revision: 2806c630d8631d5dcf9fb2555f665b829052aabe
branch: release-0.12.11-cf
specs:
eventmachine (0.12.11.cloudfoundry.3)

GEM
remote: http://rubygems.org/
specs:
addressable (2.2.7)
daemons (1.1.8)
diff-lcs (1.1.3)
em-http-request (0.3.0)
addressable (>= 2.0.0)
escape_utils
eventmachine (>= 0.12.9)
escape_utils (0.2.4)
json_pure (1.6.6)
mime-types (1.18)
nats (0.4.22)
daemons (>= 1.1.4)
eventmachine (>= 0.12.10)
json_pure (>= 1.6.1)
thin (>= 1.3.1)
posix-spawn (0.3.6)
rack (1.4.1)
rake (0.9.2.2)
rest-client (1.6.7)
mime-types (>= 1.16)
rspec (2.9.0)
rspec-core (~> 2.9.0)
rspec-expectations (~> 2.9.0)
rspec-mocks (~> 2.9.0)
rspec-core (2.9.0)
rspec-expectations (2.9.1)
diff-lcs (~> 1.1.3)
rspec-mocks (2.9.0)
thin (1.3.1)
daemons (>= 1.0.9)
eventmachine (>= 0.12.6)
rack (>= 1.0.0)
vcap_common (1.0.10)
eventmachine (~> 0.12.11.cloudfoundry.3)
nats (~> 0.4.22.beta.8)
posix-spawn (~> 0.3.6)
thin (~> 1.3.1)
yajl-ruby (~> 0.8.3)
vcap_logging (0.1.4)
rake
yajl-ruby (0.8.3)

PLATFORMS
ruby

DEPENDENCIES
em-http-request
eventmachine!
rake
rest-client
rspec
vcap_common (>= 1.0.8)
vcap_logging
yajl-ruby
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -15,7 +15,7 @@ issued for missing/extra instances, respectively.

Additionally, Health Manager collects and exposes statistics and
health status for individual applications, as well as aggregates for
frameworks, runtimes, etc. (WIP at this time)
frameworks, runtimes, etc.

## AppState

Expand Down
7 changes: 7 additions & 0 deletions Rakefile
@@ -0,0 +1,7 @@
require "rspec/core/rake_task"
require "rspec/core/version"

# `rake spec` runs the RSpec examples with colored, documentation-format output.
desc "Run all examples"
RSpec::Core::RakeTask.new(:spec) { |task|
  task.rspec_opts = ["--color", "--format", "documentation"]
}
28 changes: 28 additions & 0 deletions bin/bulk_util.rb
@@ -0,0 +1,28 @@
#!/usr/bin/env ruby
# Development utility: connects to NATS, asks
# HealthManager::BulkBasedExpectedStateProvider for its droplets, prints each
# one, and shuts down five seconds after the iteration has been started.
#
# Environment overrides:
#   NATS_URI  - message bus URI
#   BULK_URL  - value used for the 'host' key of the 'bulk' configuration
#   LOG_LEVEL - logging level (defaults to 'debug')
home = File.join(File.dirname(__FILE__),'..')
ENV['BUNDLE_GEMFILE'] = "#{home}/Gemfile"

require 'rubygems'
require 'bundler/setup'
require File.join(home, 'lib','health_manager')

# On a signal, close the NATS connection first and then halt the reactor.
trap('INT') { NATS.stop { EM.stop }}
trap('SIGTERM') { NATS.stop { EM.stop }}


EM::run {

  NATS.start :uri => ENV['NATS_URI'] || 'nats://nats:nats@192.168.24.128:4222' do
    config = {
      'bulk' => {'host'=> ENV['BULK_URL'] || 'api.vcap.me', 'batch_size' => '2'},
    }
    VCAP::Logging.setup_from_config({'level'=>ENV['LOG_LEVEL'] || 'debug'})

    prov = HealthManager::BulkBasedExpectedStateProvider.new(config)
    prov.each_droplet do |id, droplet|
      puts "Droplet #{id}:"
      puts droplet.inspect
    end

    # Same shutdown order as the signal handlers. EM.stop does not use a block
    # passed to it, so the NATS connection has to be stopped first and the
    # reactor halted from NATS.stop's callback.
    EM.add_timer(5) { NATS.stop { EM.stop } }
  end
}
14 changes: 14 additions & 0 deletions bin/health_manager
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby
# Launcher for the Health Manager: points Bundler at the project Gemfile,
# loads the library, installs signal handlers and starts the manager.
root = File.join(File.dirname(__FILE__),'..')

ENV['BUNDLE_GEMFILE'] = "#{root}/Gemfile"
require 'bundler/setup'

require File.join(root, 'lib','health_manager')

manager = HealthManager::Manager.new

# Both signals trigger the same shutdown.
%w[INT SIGTERM].each do |signal|
  trap(signal) { manager.shutdown }
end

manager.start
47 changes: 47 additions & 0 deletions config/health_manager.yml
@@ -0,0 +1,47 @@
---
# local_route is the IP address of a well-known server on your network. It
# is used to choose the right IP address of the host running the Health
# Manager (think of hosts that have multiple NICs and IP addresses assigned
# to them). The default value of nil should work in most cases.
# local_route: 127.0.0.1

# NATS message bus URI
mbus: nats://nats:nats@192.168.24.128:4222/
logging:
level: warn
pid: /var/vcap/sys/run/healthmanager.pid

queue_batch_size: 10

intervals:
# Interval for collecting statistics about this cloudfoundry instance.
# Amongst other things, data collected includes number of users, number of
# applications and memory usage.
database_scan: 10
# Time to wait before starting analysis for stopped applications.
droplet_lost: 30
# Interval between scans for analysis of applications.
droplets_analysis: 5
# An application is deemed to be flapping if it is found to be in a crashed
# state (after a restart following every crash) for more than "flapping_death"
# number of times in an interval that is "flapping_timeout" long.
flapping_death: 2
flapping_timeout: 180
# Time to wait before trying to restart an application after a crash is
# detected
restart_timeout: 20
# Time to wait before analyzing the state of an application that has been
# started/restarted
stable_state: 60

# Number of start requests sent each second (subject to EM timer limitations).
# The default value is 50.
dequeueing_rate: 50

# Used for the /healthz and /varz endpoints. If not provided, random
# values will be generated on component start. The static values below
# are used as given; comment them out to fall back to random values.
status:
port: 54321
user: thin
password: thin
136 changes: 136 additions & 0 deletions lib/health_manager.rb
@@ -0,0 +1,136 @@
# HealthManager 2.0. (c) 2011-2012 VMware, Inc.
$:.unshift(File.dirname(__FILE__))

require 'yaml'
require 'yajl'
require 'optparse'
require 'time'
require 'nats/client'

require 'vcap/common'
require 'vcap/component'
require 'vcap/logging'
require 'vcap/priority_queue'

require 'health_manager/constants'
require 'health_manager/common'
require 'health_manager/app_state'
require 'health_manager/app_state_provider'
require 'health_manager/nats_based_known_state_provider'
require 'health_manager/bulk_based_expected_state_provider'
require 'health_manager/scheduler'
require 'health_manager/nudger'
require 'health_manager/harmonizer'
require 'health_manager/varz_common'
require 'health_manager/varz'
require 'health_manager/reporter'

module HealthManager
  # Top-level object of the Health Manager process. It loads the
  # configuration, sets up logging, creates the collaborating components
  # (varz, reporter, scheduler, state providers, nudger, harmonizer) and
  # runs them inside a NATS connection.
  #
  # NOTE(review): logger, parse_args, sanitized_config and
  # register_hm_components are not defined in this class; presumably they
  # come from HealthManager::Common -- confirm.
  class Manager
    include HealthManager::Common

    #primarily for testing
    attr_reader :scheduler
    attr_reader :known_state_provider
    attr_reader :expected_state_provider

    # Builds the manager and all of its components.
    #
    # config - Hash merged over the settings read from the YAML
    #          configuration file; its entries take precedence.
    def initialize(config={})
      args = parse_args
      @config = read_config_from_file(args[:config_file]).merge(config)

      @logging_config = @config['logging']
      @logging_config = {'level' => ENV['LOG_LEVEL']} if ENV['LOG_LEVEL'] #ENV override
      @logging_config ||= {'level' => 'info'} #fallback value

      VCAP::Logging.setup_from_config(@logging_config)

      logger.info("HealthManager: initializing")

      @varz = Varz.new(@config)
      @reporter = Reporter.new(@config)
      @scheduler = Scheduler.new(@config)
      @known_state_provider = AppStateProvider.get_known_state_provider(@config)
      @expected_state_provider = AppStateProvider.get_expected_state_provider(@config)
      @nudger = Nudger.new(@config)
      @harmonizer = Harmonizer.new(@config)

      register_hm_components
    end

    # Registers this process with VCAP::Component, using the optional
    # 'status' section of the configuration for port, user and password.
    def register_as_vcap_component
      logger.info("registering VCAP component")
      logger.debug("config: #{sanitized_config}")

      status_config = @config['status'] || {}
      VCAP::Component.register(:type => 'HealthManager',
                               :host => VCAP.local_ip(@config['local_route']),
                               :index => @config['index'],
                               :config => sanitized_config,
                               :port => status_config['port'],
                               :user => status_config['user'],
                               :password => status_config['password'])
    end

    # Writes the current process id to the file named by the 'pid'
    # configuration entry, creating the parent directory if needed.
    # Terminates the process with status 1 when the directory cannot be
    # created.
    def create_pid_file
      @pid_file = @config['pid']
      begin
        FileUtils.mkdir_p(File.dirname(@pid_file))
      rescue => e
        logger.fatal("Can't create pid directory, exiting: #{e}")
        # The message announces an exit; without it execution would fall
        # through to File.open below and fail there instead.
        exit 1
      end
      File.open(@pid_file, 'wb') { |f| f.puts "#{Process.pid}" }
      logger.debug("pid file written: #{@pid_file}")
    end

    # Connects to NATS and, once connected, prepares and starts the
    # components, registers the VCAP component, optionally writes the pid
    # file and finally starts the scheduler.
    def start
      logger.info("starting...")

      EM.epoll
      NATS.start :uri => get_nats_uri do
        @varz.prepare
        @reporter.prepare
        @harmonizer.prepare
        @expected_state_provider.start
        @known_state_provider.start

        # The shadower is created unless the environment variable named by
        # HM_SHADOW is set to 'false'.
        # NOTE(review): HM_SHADOW and Shadower are not defined in this file;
        # presumably they are provided by other project files -- confirm.
        unless ENV[HM_SHADOW]=='false'
          logger.info("creating Shadower")
          @shadower = Shadower.new(@config)
          @shadower.subscribe
        end

        register_as_vcap_component
        create_pid_file if @config['pid']

        @scheduler.start #blocking call
      end
    end

    # Stops the NATS client and then the EventMachine reactor.
    def shutdown
      logger.info("shutting down...")
      NATS.stop { EM.stop }
      logger.info("...good bye.")
    end

    # Loads the YAML configuration.
    #
    # config_file - path of the file to load; when nil, health_manager.yml
    #               is looked up in the directory named by
    #               CLOUD_FOUNDRY_CONFIG_PATH, or in ../config relative to
    #               this file.
    #
    # Returns the configuration Hash. Prints a message to stderr and exits
    # with status 1 when the file cannot be read or does not contain a
    # mapping at the top level.
    def read_config_from_file(config_file)
      config_path = ENV['CLOUD_FOUNDRY_CONFIG_PATH'] || File.join(File.dirname(__FILE__),'../config')
      config_file ||= File.join(config_path, 'health_manager.yml')
      begin
        config = YAML.load_file(config_file)
      rescue => e
        $stderr.puts "Could not read configuration file #{config_file}: #{e}"
        exit 1
      end
      # An empty or scalar-only YAML file does not load as a Hash; the caller
      # merges into the result, so reject anything else here with a clear
      # message.
      unless config.is_a?(Hash)
        $stderr.puts "Could not read configuration file #{config_file}: expected a mapping at the top level"
        exit 1
      end
      config
    end

    # Returns the NATS URI: the environment variable named by NATS_URI when
    # set, otherwise the 'mbus' configuration entry.
    # NOTE(review): NATS_URI is a constant defined outside this file -- confirm.
    def get_nats_uri
      ENV[NATS_URI] || @config['mbus']
    end

    # Current time as integer seconds since the epoch.
    def self.now
      Time.now.to_i
    end
  end
end

0 comments on commit 0e2bf2e

Please sign in to comment.