Skip to content

Commit

Permalink
Remove manual logstash process management
Browse files Browse the repository at this point in the history
The Logstash process is managed by a system service, which makes it
dangerous to call kill -9 on it manually. Instead of intervening
manually, we let the system tools take care of process
management.

Manually killing the logstash process was required because SystemV init
scripts cannot handle processes that do not terminate voluntarily. By
using upstart to manage the logstash process, we bypass this problem
completely, since any process that does not terminate voluntarily
within five seconds is sent SIGKILL, which forcefully terminates it.
  • Loading branch information
Tadej Borovšak authored and Tadej Borovšak committed May 16, 2017
1 parent 3a0fe8a commit 0546ee0
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 73 deletions.
28 changes: 28 additions & 0 deletions src/init/dmon-ls.conf
@@ -0,0 +1,28 @@
# Upstart job definition that runs the Logstash agent as a supervised service.
description "Logstash service"

# Start only once a network device is up, local filesystems are mounted,
# and the system has entered one of runlevels 2-5.
start on (net-device-up
and local-filesystems
and runlevel [2345])
# Stop when the system enters runlevel 0, 1 or 6.
stop on runlevel [016]

# Restart the job automatically if it exits; the limit stanza caps this at
# 10 respawns within a 30-second window.
respawn
respawn limit 10 30

# Soft and hard limits for the number of open file descriptors.
limit nofile 32000 32000

# Run the job as the ubuntu user and group.
setuid ubuntu
setgid ubuntu

# Default value for the heap_size job variable, consumed by the script below.
# NOTE(review): presumably overridden by passing heap_size=<value> to
# initctl when starting/restarting the job - confirm against the caller.
env heap_size="1g"

script

# Environment handed to the logstash launcher: HOME and the JVM temp
# directory both point at /opt/logstash, and the heap size comes from
# the heap_size job variable.
HOME="/opt/logstash"
LS_JAVA_OPTS="-Djava.io.tmpdir=/opt/logstash"
LS_HEAP_SIZE="$heap_size"
export HOME LS_HEAP_SIZE LS_JAVA_OPTS
# exec replaces the shell so that upstart supervises the logstash process
# itself; -f names the pipeline configuration and -l the log file.
exec /opt/logstash/bin/logstash agent \
-f /opt/DICE-Monitoring/src/conf/logstash.conf \
-l /opt/DICE-Monitoring/src/logs/logstash.log

end script
88 changes: 15 additions & 73 deletions src/pyDMON.py
Expand Up @@ -3943,12 +3943,6 @@ def post(self):
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
return response

if checkPID(qSCore.LSCorePID) is True:
subprocess.call(['kill', '-9', str(qSCore.LSCorePID)])
app.logger.info('[%s] : [INFO] Killed LS Instance at %s',
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'),
str(qSCore.LSCorePID))

try:
template = templateEnv.get_template(lsTemp)
# print >>sys.stderr, template
Expand Down Expand Up @@ -4066,74 +4060,22 @@ def post(self):
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), str(lsPidf),
str(qSCore.LSCorePID))

if checkPID(qSCore.LSCorePID) is True:
try:
subprocess.check_call(["service", "dmon-ls", "restart", qSCore.LSCoreHeap, qSCore.LSCoreWorkers])
except Exception as inst:
app.logger.error("[%s] : [ERROR] Cannot restart LS Core service with %s and %s",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
response = jsonify({'Status': 'Error', 'Message': 'Cannot restart LS Core'})
response.status_code = 500
return response
lsPID = check_proc(lsPIDFileLoc)
if not lsPID:
app.logger.error("[%s] : [ERROR] Can't read pidfile for ls core",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
response = jsonify({'Status': 'Error', 'Message': 'Cannot read lscore pid file'})
response.status_code = 500
return response
qSCore.ESCorePID = lsPID
qSCore.ESCoreStatus = 'Running'
response = jsonify({'Status': 'LS Core Restarted', 'PID': lsPID})
response.status_code = 201
return response
elif checkPID(int(lsPidf)) is True:
try:
subprocess.check_call(["service", "dmon-ls", "restart", qSCore.LSCoreHeap, qSCore.LSCoreWorkers])
except Exception as inst:
app.logger.error("[%s] : [ERROR] Cannot restart detached LS Core service with %s and %s",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
response = jsonify({'Status': 'Error', 'Message': 'Cannot restart detached LS Core'})
response.status_code = 500
return response
lsPID = check_proc(lsPIDFileLoc)
if not lsPID:
app.logger.error("[%s] : [ERROR] Can't read pidfile for ls core",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
response = jsonify({'Status': 'Error', 'Message': 'Cannot read ls core pid file'})
response.status_code = 500
return response
qSCore.LSCorePID = lsPID
qSCore.LSCoreStatus = 'Running'
response = jsonify({'Status': 'LS Core Restarted and attached', 'PID': lsPID})
response.status_code = 201
return response
else:
try:
subprocess.check_call(["service", "dmon-ls", "start", qSCore.LSCoreHeap, qSCore.LSCoreWorkers])
except Exception as inst:
app.logger.error("[%s] : [ERROR] Cannot start LS Core service with %s and %s",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
response = jsonify({'Status': 'Error', 'Message': 'Cannot start LS Core'})
response.status_code = 500
return response
lsPID = check_proc(lsPIDFileLoc)
if not lsPID:
app.logger.error("[%s] : [ERROR] Can't read pidfile for ls core",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'))
response = jsonify({'Status': 'Error', 'Message': 'Cannot read lscore pid file'})
response.status_code = 500
return response
qSCore.LSCorePID = lsPID
qSCore.LSCoreStatus = 'Running'
response = jsonify({'Status': 'LS Core Started', 'PID': lsPID, 'Storm': stormStatus, 'YarnHistory': yarnStatus})
response.status_code = 201
app.logger.info("[%s] : [INFO] LS Core started with PID %s, Storm %s and YanrHistory %s",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), lsPID, stormStatus, yarnStatus)
try:
subprocess.check_call([
"initctl", "restart", "dmon-ls",
"heap_size={}".format(os.environ['LS_HEAP_SIZE'])
])
except Exception as inst:
app.logger.error("[%s] : [ERROR] Cannot restart LS Core service with %s and %s",
datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst),
inst.args)
response = jsonify({'Status': 'Error', 'Message': 'Cannot restart LS Core'})
response.status_code = 500
return response
qSCore.ESCoreStatus = 'Running'
response = jsonify({'Status': 'LS Core Restarted', 'PID': 0})
response.status_code = 201
return response


@dmon.route('/v1/overlord/core/ls/<hostFQDN>/status')
Expand Down

0 comments on commit 0546ee0

Please sign in to comment.