From 42c258d1ee7352fd32732dcff0a79187e646be61 Mon Sep 17 00:00:00 2001 From: priyankub Date: Wed, 10 Sep 2025 23:05:21 -0400 Subject: [PATCH 1/4] Do not bail off if LAPI is unreachable - rather retry! --- nginx/crowdsec_nginx.conf | 154 ++++++++++++++++++++++++++++++++------ 1 file changed, 132 insertions(+), 22 deletions(-) diff --git a/nginx/crowdsec_nginx.conf b/nginx/crowdsec_nginx.conf index 86f5b68..9f0aa28 100644 --- a/nginx/crowdsec_nginx.conf +++ b/nginx/crowdsec_nginx.conf @@ -1,40 +1,150 @@ lua_package_path '/usr/local/lua/crowdsec/?.lua;;'; +# This shared dictionary will store both the decision cache and the LAPI health status. lua_shared_dict crowdsec_cache 50m; + lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; + +# +# INITIALIZATION (runs once on Nginx start) +# init_by_lua_block { - cs = require "crowdsec" - local ok, err = cs.init("/etc/crowdsec/bouncers/crowdsec-nginx-bouncer.conf", "crowdsec-nginx-bouncer/v1.1.3") - if ok == nil then - ngx.log(ngx.ERR, "[Crowdsec] " .. err) - error() - end + cs = require "crowdsec" + + local config_path = "/data/crowdsec/crowdsec.conf" + local user_agent = "crowdsec-npmplus-bouncer/v1.1.1" + + -- Attempt to initialize the CrowdSec library + local ok, err = pcall(cs.init, config_path, user_agent) + if not ok then + ngx.log(ngx.ERR, "[Crowdsec] FATAL: Init failed: " .. tostring(err)) + -- We will let Nginx start, but CrowdSec bouncer will be disabled. + -- We set a global flag to indicate initialization failure. + _G.CROWDSEC_INIT_FAILED = true + else ngx.log(ngx.ALERT, "[Crowdsec] Initialisation done") + _G.CROWDSEC_INIT_FAILED = false + -- Initialize the LAPI status as healthy in the shared cache. 
+ ngx.shared.crowdsec_cache:set("lapi_unreachable", false) + end } map $server_addr $unix { - default 0; - "~unix:" 1; + default 0; + "~unix:" 1; } +# +# PER-REQUEST PROCESSING (runs for every request) +# access_by_lua_block { - local cs = require "crowdsec" - if ngx.var.unix == "1" then - ngx.log(ngx.DEBUG, "[Crowdsec] Unix socket request ignoring...") - else - cs.Allow(ngx.var.remote_addr) - end + -- Do not run if initialization failed + if _G.CROWDSEC_INIT_FAILED then + return + end + + local cs = require "crowdsec" + + -- Ignore requests over unix sockets + if ngx.var.unix == "1" then + return + end + + -- Fail-open fast: If LAPI is known to be down, allow traffic immediately. + local lapi_down = ngx.shared.crowdsec_cache:get("lapi_unreachable") + if lapi_down then + ngx.log(ngx.DEBUG, "[Crowdsec] LAPI is marked as unreachable, allowing request.") + return + end + + -- Safely call the Allow function using a protected call to prevent crashes + local ok, err = pcall(cs.Allow, ngx.var.remote_addr) + + -- If the call fails, log the error but allow the request to proceed. + if not ok then + ngx.log(ngx.ERR, "[Crowdsec] Allow() failed: " .. tostring(err)) + -- Check if it's a connection error. If so, mark LAPI as down. + -- You may need to adjust the string matching based on actual errors. + if type(err) == "string" and (err:find("timed out") or err:find("connection refused") or err:find("HTTP error")) then + ngx.log(ngx.WARN, "[Crowdsec] Marking LAPI as unreachable due to connection error.") + ngx.shared.crowdsec_cache:set("lapi_unreachable", true) + end + -- In any error case, we "fail open" by default. + return + end } +# +# WORKER INITIALIZATION (runs once for each Nginx worker) +# init_worker_by_lua_block { - cs = require "crowdsec" - local mode = cs.get_mode() - if string.lower(mode) == "stream" then - ngx.log(ngx.INFO, "Initializing stream mode for worker " .. 
tostring(ngx.worker.id())) - cs.SetupStream() + -- Do not run if global initialization failed + if _G.CROWDSEC_INIT_FAILED then + ngx.log(ngx.WARN, "[Crowdsec] Bouncer is disabled due to initialization failure.") + return + end + + local cs = require "crowdsec" + local mode = cs.get_mode() + + -- We only need the background task on one worker to avoid redundant checks. + if ngx.worker.id() == 0 then + -- Define retry backoff parameters + local INITIAL_RETRY_DELAY = 10 -- seconds + local MAX_RETRY_DELAY = 300 -- 5 minutes + local current_retry_delay = INITIAL_RETRY_DELAY + + local lapi_check_and_setup -- Forward declaration for the recursive timer + + lapi_check_and_setup = function(premature) + if premature then + return + end + + ngx.log(ngx.INFO, "[Crowdsec] Worker 0: Running LAPI health check and stream setup...") + + -- Use pcall for the entire setup logic to catch any error + local ok, err_or_result + if string.lower(mode) == "stream" then + ok, err_or_result = pcall(cs.SetupStream) + else + -- For non-stream mode, we can just ping + ok, err_or_result = pcall(cs.ping) + end + + if ok then + -- SUCCESS! + ngx.log(ngx.INFO, "[Crowdsec] LAPI connection successful. Bouncer is active.") + ngx.shared.crowdsec_cache:set("lapi_unreachable", false) + -- Reset the retry delay for the next time it might fail. + current_retry_delay = INITIAL_RETRY_DELAY + + -- In stream mode, SetupStream already runs a recurring timer. + -- We only need to reschedule this check if we are NOT in stream mode + -- or if the initial setup failed. We'll schedule a periodic health check. + if string.lower(mode) ~= "stream" then + ngx.timer.at(MAX_RETRY_DELAY, lapi_check_and_setup) + end + else + -- FAILURE! + ngx.log(ngx.ERR, "[Crowdsec] LAPI connection failed: " .. tostring(err_or_result)) + ngx.shared.crowdsec_cache:set("lapi_unreachable", true) + + -- Schedule the next retry with exponential backoff + ngx.log(ngx.WARN, "[Crowdsec] Scheduling next LAPI check in " .. current_retry_delay .. 
" seconds.") + ngx.timer.at(current_retry_delay, lapi_check_and_setup) + + -- Increase delay for the next attempt, up to the max + current_retry_delay = math.min(current_retry_delay * 2, MAX_RETRY_DELAY) + end end - if ngx.worker.id() == 0 then - ngx.log(ngx.INFO, "Initializing metrics for worker " .. tostring(ngx.worker.id())) - cs.SetupMetrics() + -- Start the first check shortly after the worker starts. + ngx.timer.at(5, lapi_check_and_setup) + + -- Setup Metrics (also on worker 0) + local ok, err = pcall(cs.SetupMetrics) + if not ok then + ngx.log(ngx.ERR, "[Crowdsec] SetupMetrics failed: " .. tostring(err)) end + end } From 24c063c1016e981b37f3a55f1143a7340bb73b58 Mon Sep 17 00:00:00 2001 From: priyankub Date: Thu, 11 Sep 2025 23:55:12 -0400 Subject: [PATCH 2/4] A bit more fine tuning for logging --- nginx/crowdsec_nginx.conf | 112 +++++++++++++++----------------------- 1 file changed, 43 insertions(+), 69 deletions(-) diff --git a/nginx/crowdsec_nginx.conf b/nginx/crowdsec_nginx.conf index 9f0aa28..68125aa 100644 --- a/nginx/crowdsec_nginx.conf +++ b/nginx/crowdsec_nginx.conf @@ -5,26 +5,22 @@ lua_shared_dict crowdsec_cache 50m; lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; # -# INITIALIZATION (runs once on Nginx start) +# PER-REQUEST PROCESSING (simplified) # -init_by_lua_block { - cs = require "crowdsec" +access_by_lua_block { + if _G.CROWDSEC_INIT_FAILED then return end + + local cs = require "crowdsec" + if ngx.var.unix == "1" then return end - local config_path = "/data/crowdsec/crowdsec.conf" - local user_agent = "crowdsec-npmplus-bouncer/v1.1.1" + -- We now rely only on the cache. If the background stream is down, + -- we fail open by not finding a decision for the IP. + local ok, err = pcall(cs.Allow, ngx.var.remote_addr) - -- Attempt to initialize the CrowdSec library - local ok, err = pcall(cs.init, config_path, user_agent) if not ok then - ngx.log(ngx.ERR, "[Crowdsec] FATAL: Init failed: " .. 
tostring(err)) - -- We will let Nginx start, but CrowdSec bouncer will be disabled. - -- We set a global flag to indicate initialization failure. - _G.CROWDSEC_INIT_FAILED = true - else - ngx.log(ngx.ALERT, "[Crowdsec] Initialisation done") - _G.CROWDSEC_INIT_FAILED = false - -- Initialize the LAPI status as healthy in the shared cache. - ngx.shared.crowdsec_cache:set("lapi_unreachable", false) + -- This will now only catch critical errors during the pcall itself, + -- not transient connection errors which are handled by the background worker. + ngx.log(ngx.ERR, "[Crowdsec] Allow() failed unexpectedly: " .. tostring(err)) end } @@ -74,74 +70,52 @@ access_by_lua_block { } # -# WORKER INITIALIZATION (runs once for each Nginx worker) +# WORKER INITIALIZATION (with Supervisor Pattern) # init_worker_by_lua_block { - -- Do not run if global initialization failed if _G.CROWDSEC_INIT_FAILED then ngx.log(ngx.WARN, "[Crowdsec] Bouncer is disabled due to initialization failure.") return end - + local cs = require "crowdsec" local mode = cs.get_mode() - -- We only need the background task on one worker to avoid redundant checks. + -- Run the supervisor on a single worker to avoid duplication. if ngx.worker.id() == 0 then - -- Define retry backoff parameters - local INITIAL_RETRY_DELAY = 10 -- seconds - local MAX_RETRY_DELAY = 300 -- 5 minutes - local current_retry_delay = INITIAL_RETRY_DELAY - - local lapi_check_and_setup -- Forward declaration for the recursive timer - - lapi_check_and_setup = function(premature) - if premature then - return - end - - ngx.log(ngx.INFO, "[Crowdsec] Worker 0: Running LAPI health check and stream setup...") - - -- Use pcall for the entire setup logic to catch any error - local ok, err_or_result - if string.lower(mode) == "stream" then - ok, err_or_result = pcall(cs.SetupStream) - else - -- For non-stream mode, we can just ping - ok, err_or_result = pcall(cs.ping) - end - - if ok then - -- SUCCESS! 
- ngx.log(ngx.INFO, "[Crowdsec] LAPI connection successful. Bouncer is active.") - ngx.shared.crowdsec_cache:set("lapi_unreachable", false) - -- Reset the retry delay for the next time it might fail. - current_retry_delay = INITIAL_RETRY_DELAY - - -- In stream mode, SetupStream already runs a recurring timer. - -- We only need to reschedule this check if we are NOT in stream mode - -- or if the initial setup failed. We'll schedule a periodic health check. - if string.lower(mode) ~= "stream" then - ngx.timer.at(MAX_RETRY_DELAY, lapi_check_and_setup) + -- Only start the supervisor if we are in stream mode. + if string.lower(mode) == "stream" then + -- How long to wait before restarting the stream after a crash. + local STREAM_RESTART_DELAY = 15 -- seconds + local stream_supervisor + + stream_supervisor = function(premature) + if premature then return end + + -- The cs.SetupStream function is a long-running process that + -- contains its own loop and timers. We wrap the entire + -- thing in a pcall. If it ever crashes (like with the 'nil value' error), + -- pcall will catch it, and we can restart it. + ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Starting CrowdSec decision stream...") + local ok, err = pcall(cs.SetupStream) + + if not ok then + ngx.log(ngx.ERR, "[Crowdsec] Supervisor: Stream process crashed with error: " .. tostring(err)) + else + -- This line is unlikely to be reached unless SetupStream exits cleanly. + ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Stream process exited without error.") end - else - -- FAILURE! - ngx.log(ngx.ERR, "[Crowdsec] LAPI connection failed: " .. tostring(err_or_result)) - ngx.shared.crowdsec_cache:set("lapi_unreachable", true) - -- Schedule the next retry with exponential backoff - ngx.log(ngx.WARN, "[Crowdsec] Scheduling next LAPI check in " .. current_retry_delay .. 
" seconds.") - ngx.timer.at(current_retry_delay, lapi_check_and_setup) - - -- Increase delay for the next attempt, up to the max - current_retry_delay = math.min(current_retry_delay * 2, MAX_RETRY_DELAY) + -- No matter how it exited (crash or clean exit), schedule a restart. + ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Restarting stream in " .. STREAM_RESTART_DELAY .. " seconds.") + ngx.timer.at(STREAM_RESTART_DELAY, stream_supervisor) end - end - -- Start the first check shortly after the worker starts. - ngx.timer.at(5, lapi_check_and_setup) + -- Start the supervisor for the first time. + ngx.timer.at(0, stream_supervisor) + end - -- Setup Metrics (also on worker 0) + -- Metrics setup can remain the same. local ok, err = pcall(cs.SetupMetrics) if not ok then ngx.log(ngx.ERR, "[Crowdsec] SetupMetrics failed: " .. tostring(err)) From 960ff4a4f20f8ac8399221d6d7b1684189b6e403 Mon Sep 17 00:00:00 2001 From: priyankub Date: Fri, 12 Sep 2025 12:17:28 -0400 Subject: [PATCH 3/4] Fixing missing blocks --- nginx/crowdsec_nginx.conf | 159 +++++++++++++++++++------------------- 1 file changed, 80 insertions(+), 79 deletions(-) diff --git a/nginx/crowdsec_nginx.conf b/nginx/crowdsec_nginx.conf index 68125aa..13ba6bd 100644 --- a/nginx/crowdsec_nginx.conf +++ b/nginx/crowdsec_nginx.conf @@ -1,26 +1,24 @@ -lua_package_path '/usr/local/lua/crowdsec/?.lua;;'; -# This shared dictionary will store both the decision cache and the LAPI health status. lua_shared_dict crowdsec_cache 50m; lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt; # -# PER-REQUEST PROCESSING (simplified) +# INITIALIZATION (runs once on Nginx start) # -access_by_lua_block { - if _G.CROWDSEC_INIT_FAILED then return end - - local cs = require "crowdsec" - if ngx.var.unix == "1" then return end - - -- We now rely only on the cache. If the background stream is down, - -- we fail open by not finding a decision for the IP. 
- local ok, err = pcall(cs.Allow, ngx.var.remote_addr) - +init_by_lua_block { + -- ## FIX 1: Use pcall and a global flag for safer initialization ## + -- This ensures that if CrowdSec fails to start, the other blocks will not run and cause errors. + cs = require "crowdsec" + local config_path = "/data/crowdsec/crowdsec.conf" + local user_agent = "crowdsec-npmplus-bouncer/v1.1.3" + + local ok, err = pcall(cs.init, config_path, user_agent) if not ok then - -- This will now only catch critical errors during the pcall itself, - -- not transient connection errors which are handled by the background worker. - ngx.log(ngx.ERR, "[Crowdsec] Allow() failed unexpectedly: " .. tostring(err)) + ngx.log(ngx.ERR, "[Crowdsec] FATAL: Init failed: " .. tostring(err)) + _G.CROWDSEC_INIT_FAILED = true + else + ngx.log(ngx.ALERT, "[Crowdsec] Initialisation done") + _G.CROWDSEC_INIT_FAILED = false end } @@ -30,47 +28,22 @@ map $server_addr $unix { } # -# PER-REQUEST PROCESSING (runs for every request) +# PER-REQUEST PROCESSING (checks every request against the local cache) # access_by_lua_block { - -- Do not run if initialization failed - if _G.CROWDSEC_INIT_FAILED then - return - end + if _G.CROWDSEC_INIT_FAILED then return end local cs = require "crowdsec" + if ngx.var.unix == "1" then return end - -- Ignore requests over unix sockets - if ngx.var.unix == "1" then - return - end - - -- Fail-open fast: If LAPI is known to be down, allow traffic immediately. - local lapi_down = ngx.shared.crowdsec_cache:get("lapi_unreachable") - if lapi_down then - ngx.log(ngx.DEBUG, "[Crowdsec] LAPI is marked as unreachable, allowing request.") - return - end - - -- Safely call the Allow function using a protected call to prevent crashes local ok, err = pcall(cs.Allow, ngx.var.remote_addr) - - -- If the call fails, log the error but allow the request to proceed. if not ok then - ngx.log(ngx.ERR, "[Crowdsec] Allow() failed: " .. tostring(err)) - -- Check if it's a connection error. 
If so, mark LAPI as down. - -- You may need to adjust the string matching based on actual errors. - if type(err) == "string" and (err:find("timed out") or err:find("connection refused") or err:find("HTTP error")) then - ngx.log(ngx.WARN, "[Crowdsec] Marking LAPI as unreachable due to connection error.") - ngx.shared.crowdsec_cache:set("lapi_unreachable", true) - end - -- In any error case, we "fail open" by default. - return + ngx.log(ngx.ERR, "[Crowdsec] Allow() failed unexpectedly: " .. tostring(err)) end } # -# WORKER INITIALIZATION (with Supervisor Pattern) +# WORKER INITIALIZATION (runs the crash-proof supervisor) # init_worker_by_lua_block { if _G.CROWDSEC_INIT_FAILED then @@ -81,44 +54,72 @@ init_worker_by_lua_block { local cs = require "crowdsec" local mode = cs.get_mode() - -- Run the supervisor on a single worker to avoid duplication. - if ngx.worker.id() == 0 then - -- Only start the supervisor if we are in stream mode. - if string.lower(mode) == "stream" then - -- How long to wait before restarting the stream after a crash. - local STREAM_RESTART_DELAY = 15 -- seconds - local stream_supervisor - - stream_supervisor = function(premature) - if premature then return end - - -- The cs.SetupStream function is a long-running process that - -- contains its own loop and timers. We wrap the entire - -- thing in a pcall. If it ever crashes (like with the 'nil value' error), - -- pcall will catch it, and we can restart it. 
- ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Starting CrowdSec decision stream...") - local ok, err = pcall(cs.SetupStream) + if ngx.worker.id() == 0 and string.lower(mode) == "stream" then + + local STATE = ngx.shared.crowdsec_cache + local LAST_ATTEMPT_KEY = "cs_last_attempt_ts" + local DELAY_KEY = "cs_retry_delay_s" + + local INITIAL_RETRY_DELAY = 10 + local MAX_RETRY_DELAY = 300 -- Increased max delay for longer outages + local SUPERVISOR_CHECK_INTERVAL = 5 + + local function stream_worker_thread() + STATE:set(DELAY_KEY, INITIAL_RETRY_DELAY) + ngx.log(ngx.INFO, "[Crowdsec] Background stream worker started, retry delay reset to " .. INITIAL_RETRY_DELAY .. "s.") + + local ok, err = pcall(cs.SetupStream) + + -- ## FIX 2: Add logging for clean exits and mark worker as inactive ## + -- This ensures the supervisor knows to restart the worker after a normal cycle. + if not ok then + -- SetupStream raised an error; pcall caught it, so the failure is only logged here. + ngx.log(ngx.ERR, "[Crowdsec] Background stream worker CRASHED: " .. tostring(err)) + else + -- This is a clean exit, which we know is normal behavior. + ngx.log(ngx.INFO, "[Crowdsec] Background stream worker completed its cycle and stopped cleanly.") + end + -- This is CRITICAL. It tells the supervisor the worker needs to be restarted. + STATE:set("cs_stream_is_active", false) + end + local supervisor + supervisor = function(premature) + if premature then return end + + local is_active = STATE:get("cs_stream_is_active") + + -- Skip this tick while the stream is flagged active. NOTE(review): this return also bypasses the ngx.timer.at reschedule at the end of the function; confirm that is intended. + if is_active then return end + + local last_attempt_ts = STATE:get(LAST_ATTEMPT_KEY) or 0 + local current_delay = STATE:get(DELAY_KEY) or INITIAL_RETRY_DELAY + + if ngx.now() >= last_attempt_ts + current_delay then + ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Attempting to start stream worker. Current retry delay is " .. current_delay .. 
"s.") + + STATE:set(LAST_ATTEMPT_KEY, ngx.now()) + local next_delay = math.min(current_delay * 2, MAX_RETRY_DELAY) + STATE:set(DELAY_KEY, next_delay) + + local ok, err = ngx.thread.spawn(stream_worker_thread) if not ok then - ngx.log(ngx.ERR, "[Crowdsec] Supervisor: Stream process crashed with error: " .. tostring(err)) - else - -- This line is unlikely to be reached unless SetupStream exits cleanly. - ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Stream process exited without error.") + ngx.log(ngx.ERR, "[Crowdsec] Supervisor: FATAL - Failed to spawn worker thread: " .. tostring(err)) end - - -- No matter how it exited (crash or clean exit), schedule a restart. - ngx.log(ngx.INFO, "[Crowdsec] Supervisor: Restarting stream in " .. STREAM_RESTART_DELAY .. " seconds.") - ngx.timer.at(STREAM_RESTART_DELAY, stream_supervisor) end - -- Start the supervisor for the first time. - ngx.timer.at(0, stream_supervisor) + ngx.timer.at(SUPERVISOR_CHECK_INTERVAL, supervisor) end + + ngx.log(ngx.INFO, "[Crowdsec] Supervisor initializing...") + STATE:set(LAST_ATTEMPT_KEY, 0) + STATE:set(DELAY_KEY, INITIAL_RETRY_DELAY) + STATE:set("cs_stream_is_active", false) + + ngx.timer.at(1, supervisor) + end - -- Metrics setup can remain the same. - local ok, err = pcall(cs.SetupMetrics) - if not ok then - ngx.log(ngx.ERR, "[Crowdsec] SetupMetrics failed: " .. 
tostring(err)) - end + if ngx.worker.id() == 0 then + pcall(cs.SetupMetrics) end -} +} \ No newline at end of file From 326fab8d83f0a9febfd3dae67086ff8437351100 Mon Sep 17 00:00:00 2001 From: priyankub Date: Fri, 12 Sep 2025 12:21:23 -0400 Subject: [PATCH 4/4] Fix lua package path --- nginx/crowdsec_nginx.conf | 1 + 1 file changed, 1 insertion(+) diff --git a/nginx/crowdsec_nginx.conf b/nginx/crowdsec_nginx.conf index 13ba6bd..4684fbc 100644 --- a/nginx/crowdsec_nginx.conf +++ b/nginx/crowdsec_nginx.conf @@ -1,3 +1,4 @@ +lua_package_path '/usr/local/lua/crowdsec/?.lua;;'; lua_shared_dict crowdsec_cache 50m; lua_ssl_trusted_certificate /etc/ssl/certs/ca-certificates.crt;