Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Browse files

adding recipe for mysql replication monitoring

  • Loading branch information...
commit ce47106af03d643f5c9f0aaff93413eb3c7d752d 1 parent 98fa0df
@tpoland tpoland authored
View
3  cookbooks/main/recipes/default.rb
@@ -63,3 +63,6 @@
#require_recipe "eybackup_verbose"
#require_recipe 'nginx'
+
+#uncomment to include the mysql_replication_check recipe
+#require_recipe "mysql_replication_check"
View
35 cookbooks/mysql_replication_check/README.rdoc
@@ -0,0 +1,35 @@
+= DESCRIPTION:
+
+This cookbook installs and configures a replication_check script on your master database instance.
+
+= PRE-REQUISITES:
+
+Exim or SSMTP must be configured on your instances in order for this monitoring script to send e-mail from your AppCloud database instances. The ey-cloud-recipes repo contains recipes for configuring both of these.
+
+Gems for json and yaml must be included.
+
+= USAGE:
+
+Step 1) In the ./recipes/default.rb add your sender and recipient email addresses for alerts in the 'check_vars' section. Adjustments to the monitoring and alerting thresholds can also be made here.
+check_vars = {
+ # the number of seconds delay at which a warning alert is issued; default 30 minutes (1800 seconds)
+ :warn_level => 1800,
+ # the number of seconds delay at which a critical alert is issued; default 2 hours (7200 seconds)
+ :crit_level => 7200,
+ # valid email address to receive alerts
+ :mail_recipient => '42@douglasadams.com',
+ # valid email address for sending email alerts
+ :mail_sender => '42@douglasadams.com'
+}
+
+Step 2) Commit and upload: `ey recipes upload -e <ENV>` and then apply: `ey recipes apply -e <ENV>`
+
+= NOTES:
+
+Engine Yard Support is not notified of these replication alerts currently. If you require additional assistance with a replication failure please file a ticket with our support team.
+
+The default configuration sends one successful check message per day but this can be modified by adjusting the cron schedule that removes the send_message.txt file.
+
+The default check interval for replication is 1 hour. This can be modified by adjusting the cron schedule for the replication_check.rb script.
+
+After a failure message has been sent the next successful check will send a success message regardless of whether one has already been sent for the current date.
View
61 cookbooks/mysql_replication_check/recipes/default.rb
@@ -0,0 +1,61 @@
+#
+# Cookbook Name:: mysql_replication_check
+# Recipe:: default
+#
+#
+# Cookbook Name:: mysql_replication_check
+# Recipe:: default
+#
+
+# installation path for monitoring tool
+install_path = '/data/monitoring'
+
+# configuration of replication check script
+check_vars = {
+ # the number of seconds delay at which a warning alert is issued; default 30 minutes (1800 seconds)
+ :warn_level => 1800,
+ # the number of seconds delay at which a critical alert is issued; default 2 hours (7200 seconds)
+ :crit_level => 7200,
+ # valid email address to receive alerts
+ :mail_recipient => '',
+ # valid email address for sending email alerts
+ :mail_sender => ''
+}
+
+if node[:instance_role].include?('db_master')
+
+ directory "#{install_path}" do
+ owner 'root'
+ group 'root'
+ mode 0755
+ action :create
+ recursive true
+ end
+
+ template "#{install_path}/replication_check.rb" do
+ source "replication_check.rb.erb"
+ owner 'root'
+ group 'root'
+ mode 0755
+ variables(check_vars)
+ end
+
+ cron "replica_check" do
+ minute '10'
+ hour '*'
+ day '*'
+ month '*'
+ weekday '*'
+ command "#{install_path}/replication_check.rb"
+ end
+
+ cron "allow_success_message" do
+ minute '30'
+ hour '8'
+ day '*'
+ month '*'
+ weekday '*'
+ command "rm #{install_path}/send_message.txt"
+ end
+
+end
View
154 cookbooks/mysql_replication_check/templates/default/replication_check.rb.erb
@@ -0,0 +1,154 @@
+#!/usr/bin/ruby
+
+require 'rubygems'
+require 'json'
+require 'yaml'
+
+# Usage Notes
+# * after adding a slave you must perform a deploy so the slave is part of the dna.json file
+# * yaml and json gems are required
+# * ssmtp provider required with recipe https://github.com/engineyard/ey-cloud-recipes/tree/master/cookbooks/ssmtp
+
+chef_file = '/etc/chef/dna.json'
+exit if not File.exists?(chef_file)
+
+WARN_LEVEL = <%= @warn_level %>
+CRIT_LEVEL = <%= @crit_level %>
+RECIPIENT = "<%= @mail_recipient %>"
+SENDER = "<%= @mail_sender %>"
+
+send_message_file = File.dirname(__FILE__) + "/send_message.txt"
+
+
+# retrieve password from .mytop file
+def get_password
+ dbpass = %x{cat /root/.mytop |grep pass |awk -F= '{print $2}'}.chomp
+ failure_message() if dbpass.length < 1
+ dbpass
+end
+
+# convert input into yaml
+def yaml_result(data, host_details)
+ begin
+ parse = data.gsub(/^\*.*$/,'').gsub(/^/,' ').gsub(/^\s+/, ' ').gsub(/database:/,'database').gsub(/Query:/,'Query')
+ yml = YAML.load(parse)
+ rescue
+ error=$!.to_s
+ send_message("Alert(FAIL) #{host_details['environment']} Replication Monitoring Check Failed", "The replication check for #{host_details['environment']} failed with the message:<br />--<br />#{error.gsub(/`/,'')}.<br />--<br />If reporting this error please provide these details to the Engine Yard support team.")
+ exit 1
+ end
+end
+
+# manage messages
+def send_message(sbj, msg)
+ recipient=RECIPIENT
+ sender=SENDER
+%x{sendmail #{recipient} << EOT
+To: #{recipient}
+From: #{sender}
+Subject: #{sbj}
+Content-Type: text/html
+<br />
+<font size="-1">#{msg}</font><br />
+The settings for this replication check script can be managed/changed through your custom chef recipes.
+<br /><br />
+Thank you,<br />
+Engine Yard AppCloud<br />
+EOT
+}
+
+end
+
+chef_config = JSON.parse(File.read(chef_file))
+host_results = Array.new
+
+failure_found=0
+message=""
+environment=""
+
+if chef_config["db_slaves"].empty? or chef_config["db_slaves"].nil?
+ puts "No slave found, you may need to run deploy"
+else
+ # record replica details
+ chef_config["db_slaves"].each do |slave|
+ puts "Checking host: '#{slave}'"
+ host_details = Hash.new
+ host_details["host"] = slave
+ host_details["environment"] = chef_config["engineyard"]["environment"]["name"]
+ environment = host_details["environment"]
+ chef_config["engineyard"]["environment"]["instances"].each do |instance|
+ if instance["private_hostname"] == slave
+ host_details["public_hostname"] = instance["public_hostname"]
+ host_details["instance_id"] = instance["id"]
+ host_details["role"] = instance["role"]
+ end
+ end
+
+ replica_header = "Host: #{host_details['host']}<br />
+Public Hostname: #{host_details['public_hostname']}<br />
+Instance Id: #{host_details['instance_id']}<br />
+Instance Role: #{host_details['role']}"
+
+ # confirm replica live
+ result = %x{/usr/bin/mysql -uroot -p#{get_password} -N -h#{slave} -e'select 1'}.chomp
+ if result.to_i != 1
+ host_details["message"] = "Status: <span style='color:red'>Failure</span><br />
+#{replica_header}<br />
+Alert Type: <span style='color:red'>Replication Failure</span><br />
+Details: Unable to connect to replica database on #{slave} message: #{result}"
+ failure_found = 1
+ else
+ # check replication status
+ result = %x{/usr/bin/mysql -uroot -p#{get_password} -h#{slave} -e'show slave status\\G'}.chomp
+ slave_status = yaml_result(result, host_details)
+ host_details["Seconds_Behind_Master"] = slave_status["Seconds_Behind_Master"]
+ host_details["Slave_IO_Running"] = slave_status["Slave_IO_Running"]
+ host_details["Slave_SQL_Running"] = slave_status["Slave_SQL_Running"]
+ host_details["Last_Error"] = slave_status["Last_Error"] unless slave_status["Last_Error"].nil?
+
+ if (not host_details["Slave_IO_Running"] or not host_details["Slave_SQL_Running"]) and host_details["Last_Error"].nil?
+ failure_found = 1
+ host_details["message"] = "Status: <span style='color:red'>Failure</span><br />
+#{replica_header}<br />
+Alert Type: <span style='color:red'>Replication Failure</span><br />
+Details: One of the replication threads is not running but there is not an error listed by 'show slave status\\G'; replication appears to be intentionally stopped"
+ elsif (not host_details["Slave_IO_Running"] or not host_details["Slave_SQL_Running"]) and not host_details["Last_Error"].empty?
+ failure_found=1
+ host_details["message"] = "Status: <span style='color:red'>Failure</span><br />
+#{replica_header}<br />
+Alert Type: <span style='color:red'>Replication Failure</span><br />
+Details: Replication has failed on #{slave} with the error #{host_details['Last_Error']}"
+ elsif host_details["Seconds_Behind_Master"] > CRIT_LEVEL
+ failure_found=1
+ host_details["message"] = "Status: <span style='color:red'>Failure</span><br />
+#{replica_header}<br />
+Alert Type: <span style='color:red'>Critical Replication Delay</span><br />
+Details: Replication has fallen more than #{CRIT_LEVEL} seconds behind the master, current delay is #{host_details["Seconds_Behind_Master"]} seconds behind the master."
+ elsif host_details["Seconds_Behind_Master"] > WARN_LEVEL
+ failure_found=1
+ host_details["message"] = "Status: <span style='color:red'>WARNING</span><br />
+#{replica_header}<br />
+Alert Type: <span style='color:red'>Warning Replication Delay</span><br />
+Details: Replication has fallen more than #{WARN_LEVEL} seconds behind the master, current delay is #{host_details["Seconds_Behind_Master"]} seconds behind the master."
+ else
+ host_details["message"] = "Status: <span style='color:green'>Success</span><br />
+#{replica_header}<br />
+Alert Type: Replication Status<br />
+Details: Replication check succeeded, replica is healthy and is #{host_details["Seconds_Behind_Master"]} seconds behind the master."
+ end
+
+ end
+ message = message + host_details["message"] + "<br /><br />"
+ host_results << host_details
+ end
+end
+
+if failure_found != 0
+ subject = "Alert(FAIL) #{environment} Replication Failure Found"
+ %x{rm #{send_message_file}} if File.exists?(send_message_file)
+else
+ subject = "Alert(OKAY) #{environment} Replication Check Successful"
+end
+
+send_message(subject, message) unless File.exists?(send_message_file)
+%x{date > #{send_message_file}} unless failure_found == 1
Please sign in to comment.
Something went wrong with that request. Please try again.