-
Notifications
You must be signed in to change notification settings - Fork 12
Do not bail off if LAPI is unreachable - rather retry! #94
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,40 +1,126 @@ | ||
lua_package_path '/usr/local/lua/crowdsec/?.lua;;'; | ||
lua_shared_dict crowdsec_cache 50m; | ||
|
||
lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; | ||
|
||
# | ||
# INITIALIZATION (runs once on Nginx start) | ||
# | ||
init_by_lua_block { | ||
cs = require "crowdsec" | ||
local ok, err = cs.init("/etc/crowdsec/bouncers/crowdsec-nginx-bouncer.conf", "crowdsec-nginx-bouncer/v1.1.3") | ||
if ok == nil then | ||
ngx.log(ngx.ERR, "[Crowdsec] " .. err) | ||
error() | ||
end | ||
-- ## FIX 1: Use pcall and a global flag for safer initialization ## | ||
-- This ensures that if CrowdSec fails to start, the other blocks will not run and cause errors. | ||
cs = require "crowdsec" | ||
local config_path = "/data/crowdsec/crowdsec.conf" | ||
local user_agent = "crowdsec-npmplus-bouncer/v1.1.3" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why change the user agent? |
||
|
||
local ok, err = pcall(cs.init, config_path, user_agent) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The only way init can return an error is if the configuration file does not exist, which needs to be kept as a fatal error, as it's not recoverable. |
||
if not ok then | ||
ngx.log(ngx.ERR, "[Crowdsec] FATAL: Init failed: " .. tostring(err)) | ||
_G.CROWDSEC_INIT_FAILED = true | ||
else | ||
ngx.log(ngx.ALERT, "[Crowdsec] Initialisation done") | ||
_G.CROWDSEC_INIT_FAILED = false | ||
end | ||
} | ||
|
||
map $server_addr $unix { | ||
default 0; | ||
"~unix:" 1; | ||
default 0; | ||
"~unix:" 1; | ||
} | ||
|
||
# | ||
# PER-REQUEST PROCESSING (checks every request against the local cache) | ||
# | ||
access_by_lua_block { | ||
local cs = require "crowdsec" | ||
if ngx.var.unix == "1" then | ||
ngx.log(ngx.DEBUG, "[Crowdsec] Unix socket request ignoring...") | ||
else | ||
cs.Allow(ngx.var.remote_addr) | ||
end | ||
if _G.CROWDSEC_INIT_FAILED then return end | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The WAF should be allowed to run, even if LAPI is down. |
||
|
||
local cs = require "crowdsec" | ||
if ngx.var.unix == "1" then return end | ||
|
||
local ok, err = pcall(cs.Allow, ngx.var.remote_addr) | ||
if not ok then | ||
ngx.log(ngx.ERR, "[Crowdsec] Allow() failed unexpectedly: " .. tostring(err)) | ||
end | ||
} | ||
|
||
# | ||
# WORKER INITIALIZATION (runs the crash-proof supervisor) | ||
# | ||
init_worker_by_lua_block { | ||
cs = require "crowdsec" | ||
local mode = cs.get_mode() | ||
if string.lower(mode) == "stream" then | ||
ngx.log(ngx.INFO, "Initializing stream mode for worker " .. tostring(ngx.worker.id())) | ||
cs.SetupStream() | ||
if _G.CROWDSEC_INIT_FAILED then | ||
ngx.log(ngx.WARN, "[Crowdsec] Bouncer is disabled due to initialization failure.") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In a default configuration, nginx does not write warnings to the error log, so users are very unlikely to see this message. |
||
return | ||
end | ||
|
||
local cs = require "crowdsec" | ||
local mode = cs.get_mode() | ||
|
||
if ngx.worker.id() == 0 and string.lower(mode) == "stream" then | ||
|
||
local STATE = ngx.shared.crowdsec_cache | ||
local LAST_ATTEMPT_KEY = "cs_last_attempt_ts" | ||
local DELAY_KEY = "cs_retry_delay_s" | ||
|
||
local INITIAL_RETRY_DELAY = 10 | ||
local MAX_RETRY_DELAY = 300 -- Increased max delay for longer outages | ||
local SUPERVISOR_CHECK_INTERVAL = 5 | ||
|
||
-- Runs one stream cycle: resets the shared retry delay, calls cs.SetupStream | ||
-- under pcall, logs the outcome, then clears the "cs_stream_is_active" flag. | ||
-- Takes no arguments and returns nothing. Relies on the enclosing STATE, | ||
-- DELAY_KEY, INITIAL_RETRY_DELAY and cs upvalues. | ||
local function stream_worker_thread() | ||
-- Reaching this point counts as a fresh start, so the backoff delay is | ||
-- put back to its initial value before the stream is set up. | ||
STATE:set(DELAY_KEY, INITIAL_RETRY_DELAY) | ||
ngx.log(ngx.INFO, "[Crowdsec] Background stream worker started, retry delay reset to " .. INITIAL_RETRY_DELAY .. "s.") | ||
|
||
-- pcall keeps an error raised inside SetupStream from propagating out of | ||
-- this function; ok is false and err carries the error value in that case. | ||
local ok, err = pcall(cs.SetupStream) | ||
|
||
-- ## FIX 2: Add logging for clean exits and mark worker as inactive ## | ||
-- This ensures the supervisor knows to restart the worker after a normal cycle. | ||
if not ok then | ||
-- SetupStream raised an error. pcall caught it, so execution continues | ||
-- to the flag reset below. | ||
ngx.log(ngx.ERR, "[Crowdsec] Background stream worker CRASHED: " .. tostring(err)) | ||
else | ||
-- SetupStream returned without raising. | ||
ngx.log(ngx.INFO, "[Crowdsec] Background stream worker completed its cycle and stopped cleanly.") | ||
end | ||
-- Clear the flag the supervisor reads before deciding whether to spawn again. | ||
-- NOTE(review): no visible code ever sets this flag to true -- confirm intent. | ||
STATE:set("cs_stream_is_active", false) | ||
end | ||
|
||
if ngx.worker.id() == 0 then | ||
ngx.log(ngx.INFO, "Initializing metrics for worker " .. tostring(ngx.worker.id())) | ||
cs.SetupMetrics() | ||
-- Timer callback that (re)starts stream_worker_thread with a doubling retry | ||
-- delay capped at MAX_RETRY_DELAY, then re-arms itself every | ||
-- SUPERVISOR_CHECK_INTERVAL seconds. | ||
-- premature: passed by ngx.timer.at; when truthy the callback returns at once. | ||
-- Declared before assignment so the function body can refer to itself. | ||
local supervisor | ||
supervisor = function(premature) | ||
if premature then return end | ||
|
||
local is_active = STATE:get("cs_stream_is_active") | ||
|
||
-- Note: in your version, this was inside the next `if`. Moved it out to cover all cases. | ||
-- NOTE(review): this return happens before the ngx.timer.at call at the | ||
-- bottom, so if the flag is ever true the supervisor is not re-armed. | ||
-- Nothing visible here sets the flag to true -- confirm intent. | ||
if is_active then return end | ||
|
||
-- Fall back to defaults when the shared dict has no value for a key. | ||
local last_attempt_ts = STATE:get(LAST_ATTEMPT_KEY) or 0 | ||
local current_delay = STATE:get(DELAY_KEY) or INITIAL_RETRY_DELAY | ||
|
||
-- Only attempt a start once the current delay has elapsed since the last attempt. | ||
if ngx.now() >= last_attempt_ts + current_delay then | ||
ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Attempting to start stream worker. Current retry delay is " .. current_delay .. "s.") | ||
|
||
-- Record this attempt and double the delay for the next one, capped at | ||
-- MAX_RETRY_DELAY. stream_worker_thread resets the delay when it starts. | ||
STATE:set(LAST_ATTEMPT_KEY, ngx.now()) | ||
local next_delay = math.min(current_delay * 2, MAX_RETRY_DELAY) | ||
STATE:set(DELAY_KEY, next_delay) | ||
|
||
-- A falsy first result from ngx.thread.spawn is treated as a spawn failure. | ||
local ok, err = ngx.thread.spawn(stream_worker_thread) | ||
if not ok then | ||
ngx.log(ngx.ERR, "[Crowdsec] Supervisor: FATAL - Failed to spawn worker thread: " .. tostring(err)) | ||
end | ||
end | ||
|
||
-- Re-arm the supervisor. The return value of ngx.timer.at is not checked. | ||
ngx.timer.at(SUPERVISOR_CHECK_INTERVAL, supervisor) | ||
end | ||
} | ||
|
||
ngx.log(ngx.INFO, "[Crowdsec] Supervisor initializing...") | ||
STATE:set(LAST_ATTEMPT_KEY, 0) | ||
STATE:set(DELAY_KEY, INITIAL_RETRY_DELAY) | ||
STATE:set("cs_stream_is_active", false) | ||
|
||
ngx.timer.at(1, supervisor) | ||
end | ||
|
||
if ngx.worker.id() == 0 then | ||
pcall(cs.SetupMetrics) | ||
end | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This path is specific to your installation.