diff --git a/crowdsec-docs/docusaurus.config.ts b/crowdsec-docs/docusaurus.config.ts index 5a6133218..c876f3b1b 100644 --- a/crowdsec-docs/docusaurus.config.ts +++ b/crowdsec-docs/docusaurus.config.ts @@ -6,6 +6,8 @@ import { themes } from "prism-react-renderer"; import tailwindPlugin from "./plugins/tailwind-config"; import { ctiApiSidebar, guidesSideBar, remediationSideBar } from "./sidebarsUnversioned"; +const extractPreprocessor = require("./plugins/extract-preprocessor"); + const generateCurrentAndNextRedirects = (s) => [ { from: `/docs/${s}`, @@ -220,6 +222,7 @@ const config: Config = { admonitions: true, headingIds: true, }, + preprocessor:extractPreprocessor }, stylesheets: [ { @@ -290,7 +293,7 @@ const config: Config = { current: { path: "/next", }, - }, + } }, blog: { showReadingTime: true, @@ -317,7 +320,7 @@ const config: Config = { ["./plugins/gtag/index.ts", { trackingID: "G-0TFBMNTDFQ" }], ["@docusaurus/plugin-client-redirects", { redirects }], tailwindPlugin, - ], + ] }; export default config; diff --git a/crowdsec-docs/plugins/extract-preprocessor.js b/crowdsec-docs/plugins/extract-preprocessor.js new file mode 100644 index 000000000..aaa07affc --- /dev/null +++ b/crowdsec-docs/plugins/extract-preprocessor.js @@ -0,0 +1,104 @@ +const fs = require('fs'); +const path = require('path'); + +// --- CONFIGURATION --- +// The directories to scan for snippets +const DOCS_DIRS = ['./docs', './unversioned']; +// --------------------- + +const snippetRegistry = new Map(); +let isIndexed = false; + +// Helper: Recursively find all .md/.mdx files +const getAllFiles = (dirPath, arrayOfFiles = []) => { + if (!fs.existsSync(dirPath)) return arrayOfFiles; + + const files = fs.readdirSync(dirPath); + files.forEach((file) => { + const fullPath = path.join(dirPath, file); + if (fs.statSync(fullPath).isDirectory()) { + getAllFiles(fullPath, arrayOfFiles); + } else if (file.endsWith('.md') || file.endsWith('.mdx')) { + arrayOfFiles.push(fullPath); + } + }); + return 
arrayOfFiles; +}; + +// Helper: Extract Doc ID from Frontmatter +const getDocId = (content, filename) => { + const idMatch = content.match(/^---\s+[\s\S]*?\nid:\s*(.*?)\s*[\n\r]/m); + if (idMatch && idMatch[1]) { + return idMatch[1].replace(/['"]/g, '').trim(); + } + return filename; +}; + +// --- CORE LOGIC --- +const buildIndex = () => { + if (isIndexed) return; + console.log('[ExtractPreprocessor] ⚡ Indexing snippets via Regex...'); + + const allFiles = []; + DOCS_DIRS.forEach(dir => getAllFiles(path.resolve(process.cwd(), dir), allFiles)); + + let count = 0; + + // Regex to find:
CONTENT
+ // We use [\s\S]*? to match content across multiple lines (lazy match) + const extractRegex = /]*>([\s\S]*?)<\/div>/g; + + allFiles.forEach(filePath => { + try { + const content = fs.readFileSync(filePath, 'utf8'); + const filename = path.basename(filePath, path.extname(filePath)); + const docId = getDocId(content, filename); + + let match; + // Loop through all matches in the file + while ((match = extractRegex.exec(content)) !== null) { + let [fullTag, extractId, snippetContent] = match; + + // Clean up the content (optional: trim leading/trailing newlines) + snippetContent = snippetContent.replace(/^\n+|\n+$/g, ''); + + // Generate Key: "docId:snippetId" + // If the ID already has a colon, assume user provided full ID + const key = extractId.includes(':') ? extractId : `${docId}:${extractId}`; + + snippetRegistry.set(key, snippetContent); + console.log(`[ExtractPreprocessor] ⚡ Indexed snippet: ${key}`); + count++; + } + } catch (e) { + console.warn(`[ExtractPreprocessor] Failed to read ${filePath}`); + } + }); + + isIndexed = true; + console.log(`[ExtractPreprocessor] ⚡ Indexed ${count} snippets.`); +}; + +// This function is called by Docusaurus for EVERY markdown file +const preprocessor = ({ filePath, fileContent }) => { + // 1. Ensure Index exists (runs once) + buildIndex(); + + // 2. Regex to find:
+ // Matches
OR
+ const copyRegex = /\s*(?:<\/div>)?/g; + + // 3. Replace with content + return fileContent.replace(copyRegex, (match, requestedId) => { + if (snippetRegistry.has(requestedId)) { + // Return the stored snippet content + return snippetRegistry.get(requestedId); + } else { + console.error(`[ExtractPreprocessor] ❌ Snippet not found: "${requestedId}" in ${path.basename(filePath)}`); + // Return an error message in the UI so you see it + return `> **Error: Snippet "${requestedId}" not found.**`; + } + }); +}; + +module.exports = preprocessor; \ No newline at end of file diff --git a/crowdsec-docs/unversioned/getting_started/installation/whm.mdx b/crowdsec-docs/unversioned/getting_started/installation/whm.mdx index 11fd5f429..cbe991473 100644 --- a/crowdsec-docs/unversioned/getting_started/installation/whm.mdx +++ b/crowdsec-docs/unversioned/getting_started/installation/whm.mdx @@ -148,7 +148,7 @@ Most of the time it will be a port conflict or config file error - Check the logs for error - In CrowdSec's logs sudo less /var/log/crowdsec.log: Note that it might be very verbose. - You can also check: sudo journalctl -u crowdsec -- Ultimately, you can check the [Security Engine Troubleshooting section](/troubleshooting/security_engine.mdx) +- Ultimately, you can check the [Security Engine Troubleshooting section](/u/troubleshooting/security_engine.mdx) ### Changing port configuration diff --git a/crowdsec-docs/unversioned/getting_started/post_installation/acquisition_troubleshoot.mdx b/crowdsec-docs/unversioned/getting_started/post_installation/acquisition_troubleshoot.mdx index fd175fbd0..de42414c6 100644 --- a/crowdsec-docs/unversioned/getting_started/post_installation/acquisition_troubleshoot.mdx +++ b/crowdsec-docs/unversioned/getting_started/post_installation/acquisition_troubleshoot.mdx @@ -21,7 +21,7 @@ The first thing to check is that the log file is found and readable by the Crowd Within the CrowdSec log file it will log if the file was found or not. 
-Log file locations change by distribution, you can find the default log location [outlined here](/troubleshooting/security_engine.mdx#where-are-the-logs-stored). +Log file locations change by distribution, you can find the default log location [outlined here](/u/troubleshooting/security_engine.mdx#where-are-the-logs-stored). + +| Issue | Criticality | Summary | Resolution | +|-------|-------------|---------|------------| +| **Security Engine Offline** | 🔥 Critical | Security Engine has not reported to Console for 24+ hours | [Troubleshooting](/u/troubleshooting/issue_security_engine_offline) | +| **Engine No Alerts** | ⚠️ High | No alerts generated in the last 48 hours | [Troubleshooting](/u/troubleshooting/issue_engine_no_alerts) | +| **Engine Too Many Alerts** | ⚠️ High | More than 250,000 alerts in 6 hours | [Troubleshooting](/u/troubleshooting/issue_engine_too_many_alerts) | +| **Log Processor Offline** | 🔥 Critical | Log Processor has not checked in with LAPI for 24+ hours | [Troubleshooting](/u/troubleshooting/issue_log_processor_offline) | +| **LP No Alerts** | ⚠️ High | Log Processor has not generated alerts in 48 hours | [Troubleshooting](/u/troubleshooting/issue_lp_no_alerts) | +| **LP No Logs Read** | 🔥 Critical | No logs acquired in the last 24 hours | [Troubleshooting](/u/troubleshooting/issue_lp_no_logs_read) | +| **LP No Logs Parsed** | 🔥 Critical | Logs read but none parsed in the last 48 hours | [Troubleshooting](/u/troubleshooting/issue_lp_no_logs_parsed) | +| **Firewall Integration Offline** | 🔥 Critical | Firewall has not pulled from BLaaS endpoint for 24+ hours | [Troubleshooting](/u/troubleshooting/issue_fw_integration_offline) | +| **RC Integration Offline** | 🔥 Critical | Remediation Component has not pulled from endpoint for 24+ hours | [Troubleshooting](/u/troubleshooting/issue_rc_integration_offline) | + +
+## Issue Dependencies + +Some issues are related and share common root causes: + +- **Engine No Alerts** may be caused by: + - LP No Logs Read + - LP No Logs Parsed + - Scenarios not installed or in simulation mode + +- **LP No Alerts** may be caused by: + - LP No Logs Read + - LP No Logs Parsed + - Scenarios not matching the parsed events + +Understanding these dependencies helps you troubleshoot more efficiently by addressing root causes first. + +## Future Enhancements + +For planned and experimental health checks, see [Future Console Health Check Issues](/u/troubleshooting/future_console_issues) page for planned features including: + +- Enhanced configuration validation +- Blocklists optimization recommendations +- Collection update notifications +- False positive prevention checks +- Premium feature recommendation based on detected benefit + +## Getting Help + +If you've followed the troubleshooting guides and still need assistance: + +- [Discourse](https://discourse.crowdsec.net/) +- [Discord](https://discord.gg/crowdsec) +- [GitHub Issues](https://github.com/crowdsecurity/crowdsec/issues) \ No newline at end of file diff --git a/crowdsec-docs/unversioned/troubleshooting/future_console_issues.md b/crowdsec-docs/unversioned/troubleshooting/future_console_issues.md new file mode 100644 index 000000000..5b9b7e7a2 --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/future_console_issues.md @@ -0,0 +1,240 @@ +--- +title: Future Console Health Check Issues +id: future_console_issues +--- + +This page lists potential health check issues and recommendations that may be added to the CrowdSec Console in future versions. These are categorized by type and priority to help guide feature development. + +:::info +These features are planned or under consideration and are not yet available in the Console. This documentation is maintained for planning purposes. 
+::: + +## Overview + +This page documents **17 future issues** across four main categories: + +- **Configuration Issues** (4 issues) - Initial setup and component configuration +- **Maintenance & Updates** (4 issues) - Version updates and collection management +- **Configuration Validation** (3 issues) - Detecting misconfigurations and optimization opportunities +- **Premium Features & Enhancements** (6 issues) - Value-added features and intelligent upgrade recommendations + +## Configuration Issues + +### No Security Engine or Blocklist Integration Configured + +- **Criticality**: 💡 Recommended +- **Trigger**: Organization has neither Security Engines (LAPI) nor Blocklist-as-a-Service (BLaaS) integrations configured +- **Description**: Account is set up but has no active detection or blocklist infrastructure +- **Impact**: No threat detection or proactive blocking capabilities +- **Category**: Initial Configuration + +### No Scenarios Installed + +- **Criticality**: 🔥 Critical +- **Trigger**: Security Engine has zero scenarios installed +- **Description**: No detection rules configured to identify threats +- **Impact**: Even if logs are parsed, no alerts can be generated +- **Category**: Configuration + +### No Notification Channels Configured + +- **Criticality**: 💡 Recommended (bonus for Premium users) +- **Trigger**: No notification integrations configured for Console alerts +- **Description**: User won't receive proactive notifications about stack health issues +- **Impact**: Delayed response to critical problems +- **Notes**: Recommended as a Premium feature benefit +- **Category**: Configuration + +### Alert Context Not Activated + +- **Criticality**: 💡 Recommended +- **Trigger**: Alert context enrichment is disabled in Console settings +- **Description**: Missing valuable CTI context data for alert analysis +- **Impact**: Reduced threat intelligence and harder troubleshooting +- **Category**: Configuration Enhancement + +## Maintenance & Updates + +### 
Security Engine Version Outdated + +- **Criticality**: 💡 Recommended +- **Trigger**: Security Engine running an older version when a new stable release is available +- **Description**: Missing bug fixes, performance improvements, security patches, and new features +- **Impact**: Potential vulnerabilities, reduced performance, or missing functionality +- **Requirements**: Version reporting from Security Engine, release tracking system +- **Notes**: Could highlight major version upgrades separately (e.g., 1.6.x → 1.7.x with significant new features) +- **Category**: Maintenance + +### Remediation Component Version Outdated + +- **Criticality**: 💡 Recommended +- **Trigger**: Active remediation components (bouncers) running outdated versions +- **Description**: Remediation components missing features, bug fixes, or security patches from newer releases +- **Impact**: Reduced remediation effectiveness, potential vulnerabilities, or missing compatibility +- **Requirements**: Bouncer version reporting from FOSS/backend, release tracking for all bouncer types +- **Category**: Maintenance + +### Collection Version Outdated + +- **Criticality**: 💡 Recommended +- **Trigger**: Installed collections have newer versions available on the Hub +- **Description**: Using outdated detection rules and parsers, potentially missing scenarios from updated collections +- **Impact**: Missing newer attack patterns, parser improvements, and additional scenarios added to collection +- **Requirements**: Hub version comparison, backend processing +- **Notes**: Includes detecting when collection on Hub has new scenarios not present in installed version +- **Category**: Maintenance + +### Incomplete Scenario Installation from Collection + +- **Criticality**: ⚠️ High +- **Trigger**: Scenarios installed but not representing the complete collection (missing scenarios compared to Hub collection definition) +- **Description**: Partial collection installation leaves detection gaps +- **Impact**: Reduced 
detection coverage for specific attack types within the collection scope +- **Requirements**: Collection definition comparison between installed and Hub versions +- **Category**: Configuration Validation + +## Configuration Validation & Optimization + +### Acquisition and Collection Mismatch + +- **Criticality**: 💡 Recommended +- **Trigger**: Collection installed (e.g., nginx) but no corresponding acquisition configuration for that log type +- **Description**: Detection rules installed but no logs being collected to trigger them +- **Impact**: Wasted resources, collection cannot function as intended +- **Example**: NGINX collection installed but no nginx access logs configured in acquisition +- **Category**: Configuration Validation + +### Long-Duration Decisions + +- **Criticality**: 🌟 Bonus (informational) +- **Trigger**: Active decisions with TTL exceeding threshold (e.g., 30+ days) +- **Description**: Very long bans may indicate manual decisions that should be reviewed +- **Impact**: No direct functional impact but may need periodic review +- **Notes**: Informational alert for housekeeping +- **Category**: Maintenance + +### Decisions Against Legitimate IPs + +- **Criticality**: ⚠️ High +- **Trigger**: Active decisions against known legitimate IP ranges (Let's Encrypt, CDN providers, cloud services, etc.) 
+- **Description**: Potentially blocking legitimate service traffic +- **Impact**: Service disruption (e.g., SSL certificate renewal failures, CDN issues, API connectivity problems) +- **Requirements**: Maintained database of known legitimate IP ranges and services +- **Category**: False Positive Prevention + +## Premium Features & Intelligent Recommendations + +### Alert Volume Over Free Quota + +- **Criticality**: 🌟 Bonus (informational/upgrade opportunity) +- **Trigger**: Alert volume approaching or exceeding free tier limits +- **Description**: High alert activity may benefit from Premium tier features +- **Impact**: Opportunity to upgrade for enhanced capabilities +- **Notes**: Informational nudge toward Premium upgrade for heavy users +- **Category**: Upgrade Opportunity + +### Notification Overload - Premium Recommended + +- **Criticality**: 💡 Recommended +- **Trigger**: Community user with multiple Security Engines OR high alert/activity volume +- **Description**: Complex setup would benefit from notification channels to track issues across infrastructure +- **Impact**: Missing visibility across distributed deployment or high-activity environment +- **Notes**: Highlight Premium notification features for managing complex deployments +- **Category**: Enhancement - Upgrade Opportunity + +### AIUA Not Activated (Premium User) + +- **Criticality**: 💡 Recommended +- **Trigger**: Premium tier user without "Am I Under Attack" (AIUA) feature enabled +- **Description**: Premium feature not utilized despite availability +- **Impact**: Not leveraging paid feature for automated attack detection and response +- **Notes**: Premium feature - ensure paid users activate available capabilities +- **Category**: Premium Feature Activation + +### AIUA Not Activated (Community User) + +- **Criticality**: 🌟 Bonus (informational) +- **Trigger**: Community tier user without AIUA enabled +- **Description**: Missing automated attack detection available in Premium tiers +- **Impact**: 
Manual attack detection vs automated Premium feature +- **Notes**: Possible upgrade to Premium for automated attack detection +- **Category**: Enhancement - Upgrade Opportunity + +### High-Value Blocklist Available (Same Tier - >30%) + +- **Criticality**: 💡 Recommended +- **Trigger**: Blocklist with >30% protection prediction (Alakazam score) available for user's current tier but not subscribed +- **Description**: High-impact blocklist available at current subscription level could significantly improve protection +- **Impact**: Missing substantial proactive threat blocking opportunity +- **Requirements**: Alakazam efficiency prediction calculation based on user's threat profile +- **Example**: Community user not subscribed to high-efficiency free blocklist, or Premium user not using available Premium blocklist +- **Category**: Enhancement - Optimization + +### High-Value Blocklist Available (Upper Tier - >30%) + +- **Criticality**: 🌟 Bonus (informational/upgrade opportunity) +- **Trigger**: Premium/Platinum blocklist with >30% protection prediction available in higher tier +- **Description**: Significant protection improvement available through tier upgrade +- **Impact**: Major reduction in attack surface through proactive blocking +- **Requirements**: Alakazam efficiency prediction showing concrete benefit of upgrade +- **Example**: Community user could block 35% of threats with Premium BL, or Premium user could block 40% with Platinum BL +- **Notes**: Data-driven upgrade showing measurable security benefit of upgrading +- **Category**: Enhancement - Upgrade Opportunity + +## Criticality Levels Explained + +### Critical + +Issues that represent complete failure of core functionality. Immediate attention required. + +### High + +Important issues that should be addressed soon. May significantly impact protection effectiveness. + +### Recommended + +Improvements that would enhance security posture or operational efficiency. Should be addressed when possible. 
+ +### Bonus + +Informational, optimization opportunities, or value-demonstration items. Low priority but helpful for optimization, housekeeping, or demonstrating ROI/upgrade value. + +## Key Features + +### Alakazam Protection Prediction + +The **Alakazam scoring system** analyzes your specific threat profile (alerts, attack patterns, geographic sources) and calculates the **predicted effectiveness** of each blocklist: + +- **>30% threshold**: Significant protection improvement recommended +- **Personalized**: Based on your actual threat landscape, not generic statistics +- **Tier-aware**: Shows both same-tier optimizations and upgrade opportunities +- **Data-driven upgrade**: Concrete, measurable benefit (e.g., "Block 35% of your threats preemptively") + +### Smart Collection Management + +- **Version tracking**: Detect when Hub collections gain new scenarios +- **Acquisition alignment**: Ensure installed collections match your log sources +- **Completeness validation**: Identify partial installations missing key scenarios + +## Implementation Requirements + +These future issues require: + +- **Version Tracking**: Security Engine, bouncer, and Hub collection version reporting +- **Alakazam Prediction Engine**: Personalized blocklist efficiency scoring based on user's threat profile +- **Legitimate IP Database**: Curated list of known good IPs (CDNs, certificate authorities, cloud providers) +- **Collection Definition Comparison**: Track scenario additions/changes in Hub collections +- **Activity Metrics**: Alert volume, Security Engine count, notification usage patterns + +## Related Pages + +- [Current Console Health Check Issues](/u/troubleshooting/console_issues) - Issues currently available in the Console +- [Troubleshooting Overview](/u/troubleshooting/intro) - General troubleshooting resources + +## Feedback + +These future issues are based on user feedback and operational insights. 
If you have suggestions for additional health checks or recommendations, please: + +- Share on [Discourse](https://discourse.crowdsec.net/) +- Join the discussion on [Discord](https://discord.gg/crowdsec) +- Open an issue on [GitHub](https://github.com/crowdsecurity/crowdsec-docs/issues) diff --git a/crowdsec-docs/unversioned/troubleshooting/intro.md b/crowdsec-docs/unversioned/troubleshooting/intro.md index 3072d0979..d3fd0a108 100644 --- a/crowdsec-docs/unversioned/troubleshooting/intro.md +++ b/crowdsec-docs/unversioned/troubleshooting/intro.md @@ -15,10 +15,29 @@ If you have any suggestions for this please open an [issue here](https://github. Also, checkout our 🩺 [**Stack Health-Check page**](/u/getting_started/health_check) to make sure your **Detection**, **Community Sharing** and **Remediation** are working properly -Here you'll also find Troubleshooting by topic: -* [Security Engine Troubleshooting](/troubleshooting/security_engine.mdx) -* [Remediation Components Troubleshooting](/troubleshooting/remediation_components.mdx) -* [CTI Troubleshooting](/troubleshooting/cti.mdx) +## Console Health Check Issues + +If you received a health check alert from the CrowdSec Console, check out the [**Console Health Check Issues**](/u/troubleshooting/console_issues) page for a complete list of issues, their trigger conditions, and dedicated troubleshooting guides. 
+ +## Troubleshooting by Topic + +* [Security Engine Troubleshooting](/u/troubleshooting/security_engine) +* [Remediation Components Troubleshooting](/u/troubleshooting/remediation_components) +* [CTI Troubleshooting](/u/troubleshooting/cti) + +## Troubleshooting by Issue + +Individual troubleshooting guides for specific Console alerts: + +* [Security Engine Offline](/u/troubleshooting/issue_security_engine_offline) - Security Engine not reporting to Console +* [Engine No Alerts](/u/troubleshooting/issue_engine_no_alerts) - No alerts generated in 48 hours +* [Engine Too Many Alerts](/u/troubleshooting/issue_engine_too_many_alerts) - Abnormally high alert volume +* [Log Processor Offline](/u/troubleshooting/issue_log_processor_offline) - Log Processor not checking in +* [LP No Alerts](/u/troubleshooting/issue_lp_no_alerts) - Log Processor not generating alerts +* [LP No Logs Read](/u/troubleshooting/issue_lp_no_logs_read) - No logs being acquired +* [LP No Logs Parsed](/u/troubleshooting/issue_lp_no_logs_parsed) - Logs read but not parsed +* [Firewall Integration Offline](/u/troubleshooting/issue_fw_integration_offline) - Firewall bouncer not pulling decisions +* [RC Integration Offline](/u/troubleshooting/issue_rc_integration_offline) - Remediation component not pulling decisions ## Community support @@ -72,3 +91,7 @@ When using `cscli` to list your parsers, scenarios and collections, some might a ### Which information is sent to your services ? See [CAPI documentation](/docs/next/central_api/intro). + +### stack Health issues list + +
diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_engine_no_alerts.md b/crowdsec-docs/unversioned/troubleshooting/issue_engine_no_alerts.md new file mode 100644 index 000000000..2fba19054 --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_engine_no_alerts.md @@ -0,0 +1,181 @@ +--- +title: Engine No Alerts +id: issue_engine_no_alerts +--- + +The **Engine No Alerts** issue appears when your Security Engine has been running but hasn't generated any alerts in the last **48 hours**. + +## What Triggers This Issue + +- **Trigger condition**: No alerts generated for 48 hours +- **Criticality**: ⚠️ High +- **Impact**: Your detection system may not be working as expected + +## Common Root Causes + +- **Scenarios in simulation mode**: Detection scenarios are installed but set to simulation mode, preventing actual alerts. +- **Are appropriate collections installed**: make sure you have the detection scenarios and/or appsec rules covering your services' needs +- **Low/no-traffic environment**: If your service handles very few requests or is not open to the internet, it's usual to observe low/no malicious activity. +- **Legitimate low-activity environment**: Your defenses preceding your service might be good enough that you don't detect additional malicious behaviors (CrowdSec blocklists or other protections may already deflect most malicious activity) + + + +**Other Issues** +- 🔗 **[No logs being read](/u/troubleshooting/issue_lp_no_logs_read)**: The acquisition configuration may be missing, disabled, or pointing to empty log sources. +- 🔗 **[No logs being parsed](/u/troubleshooting/issue_lp_no_logs_parsed)**: Logs are being read but parsers can't process them due to format mismatches or missing collections. + +## How to Diagnose + +If it's not due to [other issues](#otherIssues), here are the diagnoses and resolutions for other root causes. 
+ +### Check if scenarios are in simulation mode + +Verify whether your scenarios are set to simulation mode, which prevents them from generating alerts: + +```bash +# On host +sudo cscli simulation status + +# Docker +docker exec crowdsec cscli simulation status + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli simulation status +``` + +If scenarios are listed, they're in simulation mode and won't be sent to CrowdSec console (they should however still appear in `cscli alerts list`). + +### Check if appropriate collections are installed + +Verify you have collections matching your protected services: + +```bash +# On host +sudo cscli collections list + +# Docker +docker exec crowdsec cscli collections list + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli collections list +``` + +Compare your installed collections against your actual services (nginx, apache, ssh, etc.). Missing collections means no detection rules for those services. + +### Evaluate your service activity level + +Check how much traffic your service is processing: + +```bash +# On host +sudo cscli metrics show acquisition parsers + +# Docker +docker exec crowdsec cscli metrics show acquisition parsers + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli metrics show acquisition parsers +``` + +Look at "Lines parsed" - if this number is very low (dozens or hundreds per day), you may simply have insufficient traffic volume for malicious activity to appear. 
+ +### Check if proactive defenses are blocking threats upstream + +If you have CrowdSec blocklists or other protection layers active, they may be blocking malicious traffic before it reaches your scenarios: + +```bash +# On host +sudo cscli decisions list +sudo cscli metrics show bouncers + +# Docker +docker exec crowdsec cscli decisions list +docker exec crowdsec cscli metrics show bouncers + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli decisions list +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli metrics show bouncers +``` + +High numbers of active decisions or bouncer blocks indicate your proactive defenses are working - malicious actors never reach your log-based detection. + +## How to Resolve + +### If scenarios are in simulation mode + +Disable simulation mode to allow alerts to be generated: + +```bash +# On host +sudo cscli simulation disable --all +sudo systemctl reload crowdsec + +# Docker +docker exec crowdsec cscli simulation disable --all +docker restart crowdsec + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli simulation disable --all +kubectl rollout restart deployment/crowdsec -n crowdsec +``` + +You can also disable simulation for specific scenarios only: + +```bash +sudo cscli simulation disable crowdsecurity/ssh-bf +sudo systemctl reload crowdsec +``` + +### If appropriate collections are missing + +Install collections matching your protected services. 
Visit the [CrowdSec Hub](https://hub.crowdsec.net/) to find collections for your stack: + +- **Web servers**: `crowdsecurity/nginx`, `crowdsecurity/apache2`, `crowdsecurity/caddy` +- **SSH**: `crowdsecurity/sshd` +- **Linux base**: `crowdsecurity/linux` +- **AppSec/WAF**: `crowdsecurity/appsec-*` collections for application-level protection + +Install collections using: + +```bash +# On host +sudo cscli collections install crowdsecurity/nginx +sudo systemctl reload crowdsec + +# Docker +docker exec crowdsec cscli collections install crowdsecurity/nginx +docker restart crowdsec + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli collections install crowdsecurity/nginx +kubectl rollout restart deployment/crowdsec -n crowdsec +``` + +### If this is a low-traffic environment + +For services with minimal traffic or limited internet exposure: + +1. **Verify detection is working** by triggering test scenarios as described in the [Health Check guide](/u/getting_started/health_check/#trigger-crowdsecs-test-scenarios) +2. **Consider this normal** - If your detection is working properly, low traffic may mean fewer threats to detect and you can ignore the issue for now. + +### If proactive defenses are already handling threats + +This is actually a **positive outcome** - your blocklists and bouncers are preventing malicious traffic from reaching your services: + +1. **Verify your setup is working** by running the [Health Check detection tests](/u/getting_started/health_check#-detection-checks) to confirm scenarios can still trigger when needed +2. **Monitor bouncer metrics** to see how many threats are being blocked: `sudo cscli metrics show bouncers` +3. **Review active decisions** to understand what threats are being prevented: `sudo cscli decisions list` +4. 
**Keep the Console enrolled** to maintain visibility into your protection posture even if local alerts are minimal + +## Related Issues + +- [LP No Logs Read](/u/troubleshooting/issue_lp_no_logs_read) - If acquisition is not working +- [LP No Logs Parsed](/u/troubleshooting/issue_lp_no_logs_parsed) - If parsing is failing +- [Security Engine Troubleshooting](/u/troubleshooting/security_engine) - General Security Engine issues + +## Getting Help + +If you've verified logs are being read and parsed correctly but still see no alerts: + +- Check [Discourse](https://discourse.crowdsec.net/) for similar cases +- Ask on [Discord](https://discord.gg/crowdsec) with your `cscli metrics` output \ No newline at end of file diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_engine_too_many_alerts.md b/crowdsec-docs/unversioned/troubleshooting/issue_engine_too_many_alerts.md new file mode 100644 index 000000000..e6ac43a6d --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_engine_too_many_alerts.md @@ -0,0 +1,175 @@ +--- +title: Engine Too Many Alerts +id: issue_engine_too_many_alerts +--- + +The **Engine Too Many Alerts** issue appears when your Security Engine generates an abnormally high volume of alerts (more than 250,000 in a 6-hour period). This usually indicates a misconfigured scenario, false positives, or an ongoing large-scale attack. + +## What Triggers This Issue + +- **Trigger condition**: More than 250,000 alerts in 6 hours +- **Criticality**: High +- **Impact**: May indicate misconfiguration, performance issues, or a real large-scale attack. + +## Common Root Causes + +- **Misconfigured or overly sensitive scenario**: A scenario with thresholds set too low or matching too broadly can trigger excessive alerts. +- **Log duplication**: The same log file is being read multiple times due to acquisition misconfiguration. +- **Parser creating duplicate events**: A parser issue causing the same log line to generate multiple events. 
+- **Actual large-scale attack**: A genuine distributed attack (DDoS, brute force campaign) targeting your infrastructure. + +## How to Diagnose + +### Check alert volume by scenario + +Identify which scenarios are generating the most alerts: + +```bash +# On host +sudo cscli alerts list -l 100 + +# Docker +docker exec crowdsec cscli alerts list -l 100 + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli alerts list -l 100 +``` + +Look for patterns: +- Is one scenario dominating the alert count? +- Are the same IPs repeatedly triggering alerts? +- Are alerts legitimate threats or false positives? + +### Check metrics for scenario overflow + +```bash +# On host +sudo cscli metrics show scenarios + +# Docker +docker exec crowdsec cscli metrics show scenarios + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli metrics show scenarios +``` + +Look for scenarios with extremely high "Overflow" counts or "Current count" numbers. 
+ +### Check for log duplication + +Review acquisition configuration to ensure log files aren't listed multiple times: + +```bash +# On host +sudo cat /etc/crowdsec/acquis.yaml +sudo ls -la /etc/crowdsec/acquis.d/ + +# Docker +docker exec crowdsec cat /etc/crowdsec/acquis.yaml + +# Kubernetes +kubectl get configmap -n crowdsec crowdsec-config -o yaml | grep -A 20 acquis +``` + +Also check metrics for duplicate acquisition sources: + +```bash +sudo cscli metrics show acquisition +``` + +## How to Resolve + +### For misconfigured scenarios + +#### Put the problematic scenario in simulation mode + +This allows you to investigate without generating alerts: + +```bash +# On host +sudo cscli simulation enable crowdsecurity/scenario-name + +# Docker +docker exec crowdsec cscli simulation enable crowdsecurity/scenario-name + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli simulation enable crowdsecurity/scenario-name +``` + +Then reload: +```bash +sudo systemctl reload crowdsec +``` + +#### Tune the scenario threshold + +If the scenario is triggering too easily, you can create a custom version with adjusted thresholds. See the [scenario documentation](/docs/scenarios/intro) for details on customizing scenarios. + +#### Use whitelists + +If specific IPs or patterns are causing false positives, create a whitelist. See [Parser Whitelists](/docs/log_processor/whitelist/intro) or [Profiles](/docs/local_api/profiles/intro). + +### For log duplication + +Remove duplicate entries from your acquisition configuration: + +1. Edit acquisition files: `/etc/crowdsec/acquis.yaml` or files in `/etc/crowdsec/acquis.d/` +2. Ensure each log source appears only once +3. Restart CrowdSec: `sudo systemctl restart crowdsec` + +### For legitimate large-scale attacks + +If you're experiencing a real attack: + +1. **Verify your remediation components are working** to block attackers +2. 
**Check that decisions are being applied**: `cscli decisions list` +3. **Consider increasing timeout durations** in profiles if attackers are returning +4. **Subscribe to Community Blocklist** for proactive blocking of known malicious IPs +5. **Monitor your infrastructure** for the attack's impact + +### For parser issues + +If a parser is creating duplicate events: + +1. Use `cscli explain` to test parsing: + ```bash + sudo cscli explain --log "" --type + ``` +2. Check if the log line generates multiple events incorrectly +3. Review parser configuration or report the issue to the [CrowdSec Hub](https://github.com/crowdsecurity/hub/issues) + +## Verify Resolution + +After making changes: + +1. Restart or reload CrowdSec: `sudo systemctl restart crowdsec` +2. Monitor alert generation for 30 minutes: + ```bash + watch -n 30 'cscli alerts list | head -20' + ``` +3. Check metrics: `sudo cscli metrics show scenarios` +4. Verify alert volume has returned to normal levels + +## Performance Impact + +Excessive alerts can impact performance: + +- **High memory usage**: Each active scenario bucket consumes memory +- **Database growth**: Large numbers of alerts increase database size +- **API latency**: Bouncers may experience slower decision pulls + +If performance is degraded, consider: +- Cleaning old alerts: `cscli alerts delete --all` (after investigation) +- Reviewing database maintenance: [Database documentation](/docs/local_api/database) + +## Related Issues + +- [Security Engine Troubleshooting](/u/troubleshooting/security_engine) - General Security Engine issues +- [LP No Logs Parsed](/u/troubleshooting/lp_no_logs_parsed) - If parsing is creating unusual events + +## Getting Help + +If you need assistance analyzing alert patterns: + +- Share anonymized alert samples on [Discourse](https://discourse.crowdsec.net/) +- Ask on [Discord](https://discord.gg/crowdsec) with your `cscli metrics show scenarios` output diff --git 
a/crowdsec-docs/unversioned/troubleshooting/issue_fw_integration_offline.md b/crowdsec-docs/unversioned/troubleshooting/issue_fw_integration_offline.md new file mode 100644 index 000000000..63b674868 --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_fw_integration_offline.md @@ -0,0 +1,208 @@ +--- +title: Firewall Integration Offline +id: issue_fw_integration_offline +--- + +The **Firewall Integration Offline** issue appears when a firewall that is configured to pull blocklists directly from CrowdSec's Blocklist-as-a-Service (BLaaS) endpoint has not pulled the list for more than 24 hours. This means your firewall is no longer receiving the latest threat intelligence and blocked IPs. + +## What Triggers This Issue + +- **Trigger condition**: No pull from BLaaS endpoint for 24 hours +- **Criticality**: 🔥 Critical +- **Impact**: Firewall blocklist is not being updated - new threats are not being blocked - Firewall potentially malfunctioning. + +## Common Root Causes + +- **Firewall rule disabled or removed**: The firewall rule that pulls from external blocklists no longer exists or has been disabled. +- **BLaaS credentials invalid**: The basic auth credentials configured in the firewall for accessing the BLaaS endpoint are incorrect, expired, or have been regenerated. +- **Network connectivity issues**: The firewall cannot reach the BLaaS endpoint due to network problems, DNS issues, or routing failures. +- **Firewall offline**: The firewall itself is powered off, unreachable, or not processing rules. + +## How to Diagnose + +### Check if the firewall is running and has access to BLaaS endpoint + +Confirm that the firewall is powered on and processing traffic, then use its CLI or built-in diagnostic tools to check that it can reach `https://admin.api.crowdsec.net` (for example with a ping, DNS lookup, or HTTPS request). + +### Check if the firewall rule for external blocklist still exists + +Access your firewall's management interface and verify: + +1.
**Navigate to the external blocklist configuration section** (varies by vendor): + - FortiGate: Security Fabric → External Connectors → Threat Feeds + - Palo Alto: Objects → External Dynamic Lists + - ... + +2. **Verify the rule exists and is valid:** + - Is the CrowdSec blocklist rule present? + - Is it enabled/active? + - Check the URL configured - should point to `https://admin.api.crowdsec.net/...` + - Some firewalls have a "test" function for external feeds access + +### Check BLaaS endpoint credentials + +Verify the basic auth credentials configured in your firewall match the ones from the Console: + +**Get the correct basic auth credentials from CrowdSec Console:** +If you lost the credentials you can regenerate them: + - Navigate to **Blocklists** → **Integrations**: select your firewall integration + - Click **Configuration** → **Refresh Credentials** if you suspect the key is wrong (this will generate a new one) + - Copy the displayed API key or authentication header +**Check authentication method:** + - Some firewalls use HTTP headers (`X-Api-Key: <api_key>`) + - Others may use URL parameters (`?api_key=<api_key>`) + - Some may offer basic auth forms that are not functional *(Checkpoint among others)*; in that case you can put the credentials directly into the URL: `https://<username>:<password>@admin.api.crowdsec.net/...` + +### Test connectivity to BLaaS endpoint + +From a host on the same network as your firewall (or from the firewall's CLI if available): + +```bash +# Test network connectivity +curl -I https://admin.api.crowdsec.net/ + +# Test with Credentials +curl -I https://<username>:<password>@admin.api.crowdsec.net/v1/integrations/<integration_id>/content + +# Expected response: JSON with decisions or empty list +# Should NOT return 401 Unauthorized or 403 Forbidden +``` + +If you get connection errors: +- DNS resolution failures - check DNS configuration +- Connection timeouts - firewall outbound rules may be blocking +- SSL/TLS errors - firewall may need updated root certificates + +### Check firewall logs + +Review your
firewall's logs for errors related to external blocklist updates: + +**Common log locations by vendor:** +*Path to logs may vary depending on your firewall version, check your documentation.* +- **FortiGate**: Log & Report → System Events → filter for "Threat Feed" +- **Palo Alto**: Monitor → System Logs → filter for "External Dynamic List" +- **pfSense**: Status → System Logs → Firewall +- **OPNsense**: System → Log Files → Firewall + +**Look for error messages like:** +- `failed to download` - connectivity issue +- `authentication failed` or `401` - API key invalid +- `SSL certificate verification failed` - certificate trust issue +- `timeout` - network connectivity or endpoint unreachable +- `invalid format` - blocklist format mismatch + +## How to Resolve + +### If the firewall rule is disabled or missing + +Re-enable or recreate the external blocklist rule: + +### If BLaaS credentials are invalid + +Update the API key in your firewall configuration: + +1. **Regenerate API key in Console** (if needed): + - Navigate to **Integrations** → **Blocklists** → select firewall integration + - Click **Refresh Credentials** + - Copy the new API key + +2. **Update firewall configuration** with the new API key: + - Edit the external blocklist rule + - Update the authentication header or API key field + - Save and apply changes + +3. **Trigger manual update** to test: + - Most firewalls have a "Refresh Now" or "Update" button + - Click it to force an immediate pull from BLaaS + - Check logs for success or errors + +### If network connectivity is failing + +Fix network issues preventing firewall from reaching BLaaS: + +1. **Check firewall outbound rules:** + - Ensure firewall allows outbound HTTPS (port 443) to `admin.api.crowdsec.net` + - Verify no egress filtering is blocking the connection + - Check if firewall's management interface has internet access + +2. 
**Verify DNS resolution:** + ```bash + # From firewall CLI or nearby host + nslookup admin.api.crowdsec.net + dig admin.api.crowdsec.net + ``` + + If DNS fails, configure firewall to use public DNS (8.8.8.8, 1.1.1.1) temporarily + +3. **Check proxy settings:** + - If firewall uses a proxy for outbound connections, verify proxy configuration + - Ensure proxy allows HTTPS connections to CrowdSec endpoints + - Test proxy with: `curl -x : https://admin.api.crowdsec.net/` + +4. **Test from firewall CLI:** + - If firewall has CLI access, test connectivity directly: + ```bash + # Example for pfSense/OPNsense + curl -I https://admin.api.crowdsec.net/ + + # Example for FortiGate + execute ping admin.api.crowdsec.net + execute telnet admin.api.crowdsec.net 443 + ``` + +5. **Check SSL/TLS certificate trust:** + - Ensure firewall trusts public CA certificates + - Update firewall's certificate store if needed + - Temporarily disable certificate verification for testing (then fix properly) + +### If the firewall is offline + +Restore firewall connectivity: + +1. **Physical/Virtual access:** + - Check if firewall hardware is powered on + - For virtual firewalls, verify VM is running + - Check network cables and interfaces + +2. **Management access:** + - Connect via console/KVM if network management is down + - Verify management interface IP configuration + - Check firewall's default gateway + +3. **After restoring connectivity:** + - Trigger manual blocklist update + - Verify last pull timestamp updates in Console + - Monitor firewall logs for successful updates + +## Verify Resolution + +After making changes: + +1. **Trigger manual update on firewall:** + - Use the firewall's "Refresh" or "Update Now" function + - Wait 30-60 seconds for the pull to complete + +2. 
**Check in CrowdSec Console:** + - Navigate to **Integrations** → **Blocklists** + - Verify the "Last Pull" timestamp has updated to a recent time (within last few minutes) + - The offline alert should clear automatically during next polling cycle + +3. **Verify blocklist is populated:** + - Check your firewall shows IP addresses in the blocklist + - Number of entries should match your subscription tier and decisions + - Example: FortiGate → System → External Resources → view entries + +## Related Issues + +- [RC Integration Offline](/u/troubleshooting/issue_rc_integration_offline) - Similar issue for remediation components (bouncers) +- [Security Engine Offline](/u/troubleshooting/issue_security_engine_offline) - If using agent-based deployment +- [Blocklist Integration Setup](/u/integrations/blocklists/intro) - Initial setup guide + +## Getting Help + +If your firewall integration still shows as offline: + +- Check firewall vendor's documentation for external blocklist configuration +- Share firewall logs on [Discourse](https://discourse.crowdsec.net/) +- Ask on [Discord](https://discord.gg/crowdsec) with firewall model and error messages +- Contact CrowdSec support via Console if BLaaS endpoint issues persist diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_log_processor_offline.md b/crowdsec-docs/unversioned/troubleshooting/issue_log_processor_offline.md new file mode 100644 index 000000000..53dc6b89b --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_log_processor_offline.md @@ -0,0 +1,296 @@ +--- +title: Log Processor Offline +id: issue_log_processor_offline +--- + +The **Log Processor Offline** issue appears when a Log Processor (a Security Engine used to read logs in a distributed setup) has not checked in with the Local API (LAPI) of the central Security Engine for more than 24 hours.
+ +## What Triggers This Issue + +- **Trigger condition**: Log Processor has not checked in with Local API for more than 24 hours +- **Criticality**: 🔥 Critical +- **Impact**: Services that this LP is supposed to watch are no longer monitored - potential threats go undetected + +## Common Root Causes + +- **Service stopped or stuck**: The crowdsec service of this LP has crashed, hung, or was manually stopped on the agent host. +- **Machine not validated or credentials revoked**: The agent's credentials are pending validation, were removed from the central LAPI, or the credentials file is missing/corrupt. +- **Local API unreachable from agent**: Network issues, firewall rules, or configuration errors prevent the agent from connecting to the LAPI endpoint. +- **Local API service unavailable**: The central LAPI service itself is down or not responding, affecting all agents trying to connect *(this would also have triggered another issue)*. + +## How to Diagnose + +### Check if the service is stopped or stuck + +- Confirm the service state on the host: + +```bash +sudo systemctl status crowdsec +sudo journalctl -u crowdsec -n 50 +``` + +- For containerised deployments, verify the workload is still running: + +```bash +docker ps --filter name=crowdsec +kubectl get pods -n crowdsec +``` + +- On the LAPI node, run `sudo cscli machines list` and check whether the `Last Update` column is older than 24 hours for the affected machine.
+ +### Check if machine credentials are valid + +From the LAPI host: + +```bash +# On host +sudo cscli machines list + +# Docker +docker exec crowdsec cscli machines list + +# Kubernetes +kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli machines list +``` + +- If the machine shows in `PENDING` state or is missing entirely, credentials need validation +- On the agent host, ensure `/etc/crowdsec/local_api_credentials.yaml` exists and contains valid login and password +- If you recently reinstalled or renamed the machine, it must be re-validated + +### Check if the Local API is reachable from the agent + +From the agent host, test connectivity to the LAPI: + +```bash +# On host +sudo cscli lapi status + +# Docker +docker exec crowdsec-agent cscli lapi status + +# Kubernetes +kubectl exec -n crowdsec -it <agent-pod-name> -- cscli lapi status +``` + +Look for errors: +- `401 Unauthorized` - credentials issue +- TLS failures - certificate problems +- Connection timeouts - network/firewall blocking + +Also verify the API endpoint in `/etc/crowdsec/config.yaml`: +- Check `api.client.credentials_path` points to correct credentials file +- Verify `url` matches your LAPI endpoint (default: `http://localhost:8080`) +- Review `ca_cert` and `insecure_skip_verify` if using TLS + +Test network connectivity: + +```bash +nc -zv <lapi-host> 8080 +``` + +### Check if the Local API service is available + +If several agents show as offline simultaneously, the LAPI service itself might be down. + +On the LAPI machine: + +```bash +# On host +sudo systemctl status crowdsec +sudo journalctl -u crowdsec -n 50 + +# Docker +docker ps --filter name=crowdsec-lapi +docker logs crowdsec-lapi --tail 50 + +# Kubernetes +kubectl get pods -n crowdsec -l type=lapi +kubectl logs -n crowdsec -l type=lapi --tail 50 +``` + +Check `sudo cscli metrics show engine` on the LAPI to confirm it is processing events from other agents.
+ +## How to Resolve + +### If the service is stopped or stuck + +Restart the Log Processor service: + +```bash +# On host (systemd) +sudo systemctl restart crowdsec + +# Docker +docker restart crowdsec + +# Kubernetes +kubectl rollout restart deployment/crowdsec -n crowdsec +``` + +After the restart, verify the agent is checking in: + +```bash +# On LAPI host +sudo cscli machines list +``` + +Check that the `Last Update` timestamp is recent (within last few minutes). + +### If machine credentials need validation + +#### Using credentials (single machine setups) + +:::info +More suitable for single machine setups. +::: + +To regenerate credentials directly on the LAPI host when the agent runs locally: + +```bash +sudo cscli machines add -a +``` + +#### Using registration system (distributed setups) + +:::info +Registration system is more suitable for distributed setups. +::: + +Approve pending machines on the LAPI: + +```bash +sudo cscli machines list +sudo cscli machines validate <machine-name> +``` + +If credentials were removed or the agent was rebuilt, re-register it against the LAPI: + +```bash +sudo cscli lapi register --url http://<lapi-host>:8080 --machine <machine-name> +sudo systemctl restart crowdsec +``` + +Update the `--url` to match your deployment. Auto-registration tokens are covered in [Machines management](/u/user_guides/machines_mgmt#machine-auto-validation). + +#### Kubernetes pod rotation (stale machines) + +In Kubernetes environments, pod restarts and scaling events create new pod identities. Old Log Processor entries may remain in the LAPI's machine list even after pods are deleted, causing the Console to show offline agents that no longer exist. + +To identify and clean up stale machines: + +1. List all registered machines and note their last update times: + ```bash + # On LAPI host + sudo cscli machines list + + # In Kubernetes + kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli machines list + ``` + +2.
Identify machines that haven't checked in for 24+ hours and verify they correspond to deleted pods: + ```bash + # Check current running pods + kubectl get pods -n crowdsec -l app=crowdsec-agent -o wide + ``` + +3. Prune stale machines: + ```bash + # Delete specific stale machine + sudo cscli machines delete <machine-name> + + # Or prune all machines not seen in 24+ hours (use with caution) + sudo cscli machines prune --duration 24h + ``` + +4. After pruning, you may need to restart the agent deployment to regenerate credentials for current pods: + ```bash + kubectl rollout restart deployment/crowdsec-agent -n crowdsec + ``` + +5. Verify new pods register successfully: + ```bash + # Wait 1-2 minutes then check + kubectl exec -n crowdsec -it $(kubectl get pods -n crowdsec -l type=lapi -o name) -- cscli machines list + ``` + +:::tip +To prevent accumulation of stale machines in Kubernetes, consider using [auto-registration tokens](/u/user_guides/machines_mgmt#machine-auto-validation) which handle pod lifecycle automatically. +::: + +Once pruned, the issues concerning those pruned LPs will disappear on the next SE info update *(within 30 minutes)*.
+ +### If the central LAPI is unreachable from the agent + +Open the required port on firewalls or security groups: + +```bash +# Test connectivity +nc -zv <lapi-host> 8080 + +# If using firewall, ensure port is open +sudo ufw allow 8080/tcp +# or +sudo firewall-cmd --add-port=8080/tcp --permanent +sudo firewall-cmd --reload +``` + +If using TLS: +- Update the agent trust store (`ca_cert` in `/etc/crowdsec/config.yaml`) if certificates were renewed +- Temporarily enable `insecure_skip_verify: true` for testing (then fix certificates properly) +- Follow [TLS authentication](/docs/local_api/tls_auth) for proper setup + +If using proxies or load balancers: +- Ensure they forward HTTP headers correctly +- Verify TLS passthrough or termination is configured properly +- Check that the LAPI endpoint is accessible through the proxy + +### If the Local API service is unavailable + +Restart the LAPI service: + +```bash +# On host (systemd) +sudo systemctl restart crowdsec + +# Kubernetes +kubectl rollout restart deployment/crowdsec-lapi -n crowdsec +``` + +If the LAPI repeatedly crashes or loses database access: + +1. Collect diagnostics: + ```bash + sudo cscli support dump + ``` + +2. Review `/var/log/crowdsec/` (or container logs) for errors +3. Check database connectivity and credentials +4. Consult the [Security Engine troubleshooting guide](/u/troubleshooting/security_engine) if issues persist + +## Verify Resolution + +After making changes: + +1. Wait 1-2 minutes for the agent to check in +2. Verify on the LAPI host: + ```bash + sudo cscli machines list + ``` +3. Check that `Last Update` timestamp is recent (within last few minutes) +4.
The Console alert will clear automatically during the next polling cycle + +## Related Issues + +- [Engine No Alerts](/u/troubleshooting/issue_engine_no_alerts) - If the agent is online but not generating alerts +- [LP No Logs Read](/u/troubleshooting/issue_lp_no_logs_read) - If acquisition is not working +- [Security Engine Troubleshooting](/u/troubleshooting/security_engine) - General Security Engine issues + +## Getting Help + +If the agent still shows as offline after following these steps: + +- Check [Discourse](https://discourse.crowdsec.net/) for similar issues +- Ask on [Discord](https://discord.gg/crowdsec) with your `cscli machines list` and `cscli lapi status` output +- Share the output of `sudo cscli support dump` if the issue persists + +Consider adding a [notification rule](/u/console/notification_integrations/rule) for **Log Processor Offline** to be alerted promptly when this happens again. diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_alerts.md b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_alerts.md new file mode 100644 index 000000000..a1ce71e6d --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_alerts.md @@ -0,0 +1,165 @@ +--- +title: LP No Alerts +id: issue_lp_no_alerts +--- + +The **LP No Alerts** issue appears when a specific Log Processor (agent) is running and communicating with the Local API but hasn't generated any alerts in the last 48 hours. This is similar to [Engine No Alerts](/u/troubleshooting/engine_no_alerts) but applies to individual Log Processor instances in distributed setups. + +## What Triggers This Issue + +- **Trigger condition**: Log Processor online but no alerts for 48 hours +- **Criticality**: High +- **Impact**: Detection may not be working on this specific agent + +## Common Root Causes + +- **Scenarios in simulation mode**: Detection scenarios are installed but running in simulation mode on this agent. 
+- **Low-activity monitored service**: The service monitored by this Log Processor may genuinely have no malicious activity. + +#### Other Issues +- 🔗 **[No logs being read](/u/troubleshooting/issue_lp_no_logs_read)**: The acquisition configuration on this specific Log Processor may be missing, disabled, or pointing to empty sources. +- 🔗 **[No logs being parsed](/u/troubleshooting/issue_lp_no_logs_parsed)**: Logs are being read but parsers can't process them due to format mismatches or missing collections. + +## How to Diagnose + +If it's not due to [other issues](#other-issues), here are the diagnosis and resolutions for other root causes. + +### Identify the affected Log Processor + +Check which machine is not generating alerts: + +```bash +# On LAPI host +sudo cscli machines list +``` + +Look for the Last Update timestamp and verify which machine corresponds to the alert. + +### Check metrics on the affected agent + +Connect to the specific Log Processor host and check its metrics: + +```bash +# On the Log Processor host +sudo cscli metrics show acquisition parsers scenarios + +# Docker +docker exec crowdsec-agent cscli metrics show acquisition parsers scenarios + +# Kubernetes - for specific agent pod +kubectl exec -n crowdsec -it -- cscli metrics show acquisition parsers scenarios +``` + +Look for: +- **Acquisition Metrics**: Are log lines being read? (non-zero "Lines read") +- **Parser Metrics**: Are logs being parsed? (non-zero "Lines parsed") +- **Scenario Metrics**: Are scenarios evaluating events? + +### Check recent alerts from this agent + +```bash +# On the Log Processor host +sudo cscli alerts list + +# Or filter by origin on LAPI +sudo cscli alerts list --origin +``` + +## How to Resolve + +### If no logs are being read + +Follow the [LP No Logs Read troubleshooting guide](/u/troubleshooting/lp_no_logs_read) for detailed steps. 
+ +**Quick checks on the affected agent:** + +```bash +# Verify acquisition configuration +sudo cat /etc/crowdsec/acquis.yaml +sudo ls -la /etc/crowdsec/acquis.d/ + +# Check log file existence and permissions +ls -la /var/log/nginx/ # or your specific log path + +# Verify CrowdSec can access logs +sudo -u crowdsec cat /var/log/nginx/access.log | head -5 +``` + +### If logs are read but not parsed + +Follow the [LP No Logs Parsed troubleshooting guide](/u/troubleshooting/lp_no_logs_parsed) for detailed steps. + +**Quick checks on the affected agent:** + +```bash +# Check installed collections +sudo cscli collections list + +# Test parsing with a sample log line +sudo cscli explain --log "" --type + +# Example for nginx +sudo cscli explain --log '192.168.1.1 - - [01/Jan/2024:12:00:00 +0000] "GET / HTTP/1.1" 200 1234' --type nginx +``` + +### If scenarios are in simulation mode + +Check and disable simulation mode on the affected agent: + +```bash +# Check simulation status +sudo cscli simulation status + +# Disable for all scenarios +sudo cscli simulation disable --all +sudo systemctl reload crowdsec + +# Or for specific scenarios +sudo cscli simulation disable crowdsecurity/ssh-bf +sudo systemctl reload crowdsec +``` + +### If this is a low-activity service + +For legitimately clean services: + +1. **Test with dummy scenarios** using the [Health Check guide](/u/getting_started/health_check) to verify the detection pipeline works +2. **Verify the agent is processing logs** with `cscli metrics show acquisition` +3. **Accept the low alert rate** if the service truly has no malicious traffic + +## Verify Resolution + +After making changes on the affected Log Processor: + +1. Restart the agent: `sudo systemctl restart crowdsec` +2. Wait a few minutes for processing +3. Check metrics: `sudo cscli metrics show scenarios` +4. Trigger a test alert: [Health Check detection tests](/u/getting_started/health_check#-detection-checks) +5. 
Verify alert appears: `sudo cscli alerts list` + +## Distributed Setup Considerations + +In multi-agent deployments: + +- **Each agent processes its own logs independently** +- **Agents forward alerts to the Local API** +- **One agent having no alerts doesn't affect others** + +If multiple agents show no alerts, review: +- Common configuration issues (e.g., centralized config management problems) +- Network connectivity between agents and LAPI +- Synchronized collection installations across all agents + +## Related Issues + +- [Engine No Alerts](/u/troubleshooting/engine_no_alerts) - Similar issue at the Security Engine level +- [LP No Logs Read](/u/troubleshooting/lp_no_logs_read) - If acquisition is not working +- [LP No Logs Parsed](/u/troubleshooting/lp_no_logs_parsed) - If parsing is failing +- [Log Processor Offline](/u/troubleshooting/log_processor_offline) - If the agent is not communicating at all + +## Getting Help + +If you've verified logs are being read and parsed but still see no alerts: + +- Share your setup details on [Discourse](https://discourse.crowdsec.net/) +- Ask on [Discord](https://discord.gg/crowdsec) with `cscli metrics` output diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_parsed.md b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_parsed.md new file mode 100644 index 000000000..553d7d575 --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_parsed.md @@ -0,0 +1,260 @@ +--- +title: LP No Logs Parsed +id: issue_lp_no_logs_parsed +--- + +The **LP No Logs Parsed** issue appears when logs are being successfully read by the Log Processor but none are being parsed correctly in the last 48 hours. This means the acquisition is working, but parsers can't interpret the log format. 
+ +## What Triggers This Issue + +- **Trigger condition**: Logs read but no successful parsing for 48 hours +- **Criticality**: Critical +- **Impact**: No events generated means no detection or alerts possible + +## Common Root Causes + +- **Missing collection or parsers**: The required parser collection for your log format isn't installed. +- **Custom or unexpected log format**: Logs don't match the format expected by the parser (custom format, version mismatch, etc.). + +For more advanced cases (often for custom made parsers): +- **Acquisition type mismatch**: The `type:` or `program:` label in acquisition doesn't match any installed parser's FILTER. +- **Parser FILTER not matching**: Parser exists but its FILTER clause doesn't match the acquisition label. + +## How to Diagnose + +### Check parsing metrics + +```bash +# On host +sudo cscli metrics show acquisition parsers + +# Docker +docker exec crowdsec cscli metrics show acquisition parsers + +# Kubernetes +kubectl exec -n crowdsec -it -- cscli metrics show acquisition parsers +``` + +**What to look for:** +- **Acquisition**: "Lines read" should be > 0 (confirms logs are being read) +- **Parsers**: "Lines parsed" should be > 0 (currently 0 means parsing is failing) +- **Unparsed lines**: Check if there's a high "unparsed" count + +### Use cscli explain to test parsing + +Take a sample log line and test it: + +```bash +# Test with your actual log line +sudo cscli explain --log "192.168.1.1 - - [01/Jan/2024:12:00:00 +0000] \"GET / HTTP/1.1\" 200 1234" --type nginx + +# Or test from a file +sudo cscli explain --file /var/log/nginx/access.log --type nginx +``` + +**What to look for:** +- 🔴 (red) next to parser names means the parser didn't match +- 🟢 (green) means the parser succeeded +- If all parsers show 🔴, the log format isn't being recognized + +### Check installed collections and parsers + +```bash +# List installed collections +sudo cscli collections list + +# List installed parsers +sudo cscli parsers list 
+ +# Check specific parser details +sudo cscli parsers inspect crowdsecurity/nginx-logs +``` + +### Verify acquisition type/program label + +```bash +# Check your acquisition configuration +sudo cat /etc/crowdsec/acquis.yaml +sudo cat /etc/crowdsec/acquis.d/*.yaml +``` + +Compare the `type:` (or `program:` in Kubernetes) with installed parser names. + +## How to Resolve + +### Install missing collection + +Most services have a collection that includes parsers and scenarios: + +```bash +# Search for collections +sudo cscli collections search nginx + +# Install the collection +sudo cscli collections install crowdsecurity/nginx + +# Restart CrowdSec +sudo systemctl restart crowdsec +``` + +**Docker:** +```yaml +environment: + COLLECTIONS: "crowdsecurity/nginx crowdsecurity/linux" +``` +Then restart the container. + +**Kubernetes:** +```yaml +agent: + env: + - name: COLLECTIONS + value: "crowdsecurity/nginx crowdsecurity/traefik" +``` +Then: `helm upgrade crowdsec crowdsec/crowdsec -n crowdsec -f values.yaml` + +### Fix acquisition type/program mismatch + +The acquisition label must match a parser's FILTER: + +#### On Host or Docker + +Check your `acquis.yaml`: +```yaml +filenames: + - /var/log/nginx/access.log +labels: + type: nginx # This must match a parser FILTER +``` + +Common types: +- `nginx` - for NGINX logs +- `apache2` - for Apache logs +- `syslog` - for syslog-formatted logs (SSH, etc.) 
+- `mysql` - for MySQL logs +- `postgres` - for PostgreSQL logs + +#### Kubernetes + +In Kubernetes, use `program:` instead of `type:`: +```yaml +agent: + acquisition: + - namespace: production + podName: nginx-* + program: nginx # This must match parser FILTER +``` + +**After changing configuration:** +```bash +sudo systemctl restart crowdsec +# or docker restart crowdsec +# or helm upgrade (for Kubernetes) +``` + +### Handle custom log formats + +If you are using non-default log formats for your services, or if the logs are relayed by a third-party service, their format may have been changed by that relay. + +#### Option 1: Adjust log format to match parser +**NGINX example:** +```nginx +# In nginx.conf, use the combined format +log_format combined '$remote_addr - $remote_user [$time_local] ' + '"$request" $status $body_bytes_sent ' + '"$http_referer" "$http_user_agent"'; +access_log /var/log/nginx/access.log combined; +``` + +#### Option 2: Create a custom parser +1. Follow the [Create parsers doc](/log_processor/parsers/create) to develop and test your parser +2. Get help from our [Discord](https://discord.gg/crowdsec) community if you hit roadblocks. + +**Simple custom parser example:** +```yaml +onsuccess: next_stage +debug: false +filter: "evt.Parsed.program == 'my-custom-app'" +name: my-org/my-custom-app-logs +description: "Custom parser for my application" +grok: + pattern: '%{IPORHOST:source_ip} - %{DATA:message}' + apply_on: message +statics: + - meta: log_type + value: my_custom_app + - meta: service + value: http +``` + +#### Option 3: Use a different parser +Some services have multiple parser options. Check the [Hub](https://app.crowdsec.net/hub/parsers) for alternatives.
+ +### Debug parser FILTER issues + +If a parser is installed but not matching, check its FILTER: + +```bash +# View parser details +sudo cscli parsers inspect crowdsecurity/nginx-logs + +# Look for the "filter" field +# Example: filter: "evt.Parsed.program == 'nginx'" +``` + +The FILTER must match your acquisition label. If your label is `type: nginx`, the parser FILTER should check `evt.Line.Labels.type == "nginx"` or `evt.Parsed.program == "nginx"`. + +## Verify Resolution + +After making changes: + +1. **Restart CrowdSec:** + ```bash + sudo systemctl restart crowdsec + ``` + +2. **Wait 1-2 minutes for log processing** + +3. **Check metrics again:** + ```bash + sudo cscli metrics show parsers + ``` + + **"Lines parsed" should now be > 0** + +4. **Test with cscli explain:** + ```bash + sudo cscli explain --log "" --type + ``` + + **Parsers should show 🟢 (green) indicators** + +5. **Verify events are reaching scenarios:** + ```bash + sudo cscli metrics show scenarios + ``` + +## Common Parser FILTER Values + +| Service | Acquisition Label | Parser FILTER | +|---------|------------------|---------------| +| NGINX | `type: nginx` | `evt.Line.Labels.type == "nginx"` | +| Apache | `type: apache2` | `evt.Line.Labels.type == "apache2"` | +| SSH (syslog) | `type: syslog` | `evt.Line.Labels.type == "syslog"` | +| Traefik | `program: traefik` | `evt.Parsed.program == "traefik"` | +| MySQL | `type: mysql` | `evt.Line.Labels.type == "mysql"` | + +## Related Issues + +- [LP No Logs Read](/u/troubleshooting/lp_no_logs_read) - If logs aren't being read at all +- [LP No Alerts](/u/troubleshooting/lp_no_alerts) - If logs are parsed but scenarios don't trigger +- [Engine No Alerts](/u/troubleshooting/engine_no_alerts) - Similar issue at the Security Engine level + +## Getting Help + +If parsing still fails: + +- Test your logs in [CrowdSec Playground](https://playground.crowdsec.net/) +- Share your log samples and acquisition config on [Discourse](https://discourse.crowdsec.net/) 
+- Ask on [Discord](https://discord.gg/crowdsec) with `cscli explain` output +- Check parser documentation on the [Hub](https://app.crowdsec.net/hub/parsers) diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_read.md b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_read.md new file mode 100644 index 000000000..63c9838b0 --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_lp_no_logs_read.md @@ -0,0 +1,285 @@ +--- +title: LP No Logs Read +id: issue_lp_no_logs_read +--- + +The **LP No Logs Read** issue appears when a Log Processor is running but hasn't acquired any log lines in the last 24 hours. This is the first step in the detection pipeline and must work for CrowdSec to function. + +## What Triggers This Issue + +- **Trigger condition**: No logs acquired for 24 hours +- **Criticality**: Critical +- **Impact**: Complete detection failure - no logs means no alerts + +## Common Root Causes + +- **Missing acquisition configuration**: No acquisition files exist, or they're empty. +- **Incorrect Acquisition file configuration**: Acquisition configuration points to paths that don't exist or have moved. +- **File permission issues**: CrowdSec doesn't have read access to the log files. +- **Log files are empty or not being written**: The services being monitored aren't generating logs. +- **Incorrect Acquisition endpoint configuration**: Error in endpoint config, for acquisition types listening for incoming data (httpLogs, syslog,...) +- **Acquisition type mismatch**: Wrong datasource type configured (e.g., using `file` instead of `journald`). +- **Container/Kubernetes volume issues**: In containerized deployments, logs aren't mounted or accessible to the CrowdSec container. 
+ +## How to Diagnose + +### Check acquisition metrics + +```bash +# On host +sudo cscli metrics show acquisition + +# Docker +docker exec crowdsec cscli metrics show acquisition + +# Kubernetes +kubectl exec -n crowdsec -it -- cscli metrics show acquisition +``` + +**What to look for:** +- If the output is empty or shows 0 "Lines read", acquisition is not working +- If sources are listed but "Lines read" is 0, the source exists but isn't reading data + +### Verify acquisition configuration exists + +```bash +# On host +sudo cat /etc/crowdsec/acquis.yaml +sudo ls -la /etc/crowdsec/acquis.d/ + +# Docker +docker exec crowdsec cat /etc/crowdsec/acquis.yaml +docker exec crowdsec ls -la /etc/crowdsec/acquis.d/ + +# Kubernetes - check ConfigMap +kubectl get configmap -n crowdsec -o yaml +``` + +If these files are empty or missing, you need to create acquisition configuration. + +### Check log files exist and have content + +```bash +# Verify log file exists +ls -la /var/log/nginx/access.log + +# Check if it has recent content +tail -10 /var/log/nginx/access.log + +# Check last modification time +stat /var/log/nginx/access.log +``` + +### Check file permissions + +```bash +# Check if CrowdSec user can read the log file +sudo -u crowdsec cat /var/log/nginx/access.log | head -5 + +# Check directory permissions +ls -la /var/log/nginx/ +``` + +## How to Resolve + +### Create or fix acquisition configuration + +The acquisition configuration tells CrowdSec which logs to read. 
Configuration varies by deployment: + +#### On Host + +Create or edit `/etc/crowdsec/acquis.yaml` or add files to `/etc/crowdsec/acquis.d/`: + +**Example for NGINX:** +```yaml +filenames: + - /var/log/nginx/access.log + - /var/log/nginx/error.log +labels: + type: nginx +--- +``` + +**Example for SSH (via syslog):** +```yaml +filenames: + - /var/log/auth.log +labels: + type: syslog +--- +``` + +**Example for journald:** +```yaml +source: journalctl +journalctl_filter: + - "_SYSTEMD_UNIT=ssh.service" +labels: + type: syslog +--- +``` + +After creating the configuration: +```bash +sudo systemctl restart crowdsec +``` + +#### Docker + +Ensure log volumes are mounted and acquisition is configured: + +**docker-compose.yml example:** +```yaml +services: + crowdsec: + image: crowdsecurity/crowdsec:latest + volumes: + - /var/log:/var/log:ro # Mount host logs as read-only + - ./acquis.yaml:/etc/crowdsec/acquis.yaml:ro + - crowdsec-config:/etc/crowdsec + - crowdsec-data:/var/lib/crowdsec/data +``` + +**acquis.yaml for Docker:** +```yaml +filenames: + - /var/log/nginx/access.log +labels: + type: nginx +``` + +Restart the container: +```bash +docker-compose restart crowdsec +``` + +#### Kubernetes + +Configure acquisition in your Helm values: + +**values.yaml:** +```yaml +agent: + acquisition: + - namespace: production + podName: nginx-* + program: nginx + - namespace: production + podName: webapp-* + program: nginx +``` + +**Note:** In Kubernetes, use `program:` (not `type:`). The `program` field must match the FILTER in your parsers. 
+ +Apply changes: +```bash +helm upgrade crowdsec crowdsec/crowdsec -n crowdsec -f values.yaml +``` + +### Fix file permissions + +If CrowdSec can't read log files: + +```bash +# Add CrowdSec user to the log group (e.g., adm) +sudo usermod -aG adm crowdsec + +# Or adjust log file permissions (less secure) +sudo chmod 644 /var/log/nginx/access.log + +# Restart CrowdSec to pick up group membership +sudo systemctl restart crowdsec +``` + +### Verify log files are being written + +If log files are empty: + +1. **Check the monitored service is running:** + ```bash + sudo systemctl status nginx + ``` + +2. **Generate some log activity:** + ```bash + curl http://localhost/ + tail /var/log/nginx/access.log + ``` + +3. **Check service logging configuration:** + - For NGINX: verify `access_log` directives in nginx.conf + - For Apache: verify `CustomLog` directives + - For systemd services: verify they're logging to journald or files + +### Fix container/Kubernetes volume issues + +#### Docker +Ensure volumes are correctly mounted: +```bash +# Check mounts inside container +docker exec crowdsec ls -la /var/log/nginx/ + +# If empty, verify docker-compose.yml volumes section +``` + +#### Kubernetes +Kubernetes agents read from `/var/log/containers` by default (mounted by helm chart). If logs aren't there: + +```bash +# Verify pods are writing to expected locations +kubectl logs -n production nginx-pod-name + +# Check if logs are in /var/log/containers on the node +kubectl debug node/your-node -it --image=busybox -- ls -la /var/log/containers/ +``` + +## Verify Resolution + +After making changes: + +1. **Restart CrowdSec:** + ```bash + sudo systemctl restart crowdsec + # or docker restart crowdsec + # or kubectl rollout restart deployment/crowdsec-agent -n crowdsec + ``` + +2. **Wait 1-2 minutes for acquisition to start** + +3. **Check metrics again:** + ```bash + sudo cscli metrics show acquisition + ``` + +4. 
**Verify "Lines read" is increasing:** + - Run metrics command twice with a delay + - Numbers should increase if logs are being actively generated + +5. **Check CrowdSec logs for errors:** + ```bash + sudo tail -50 /var/log/crowdsec.log + # or docker logs crowdsec + # or kubectl logs -n crowdsec + ``` + +## Detailed Acquisition Documentation + +For more information on acquisition configuration: +- [Datasources Documentation](/docs/log_processor/data_sources/intro) +- [File datasource](/docs/log_processor/data_sources/file) +- [Journald datasource](/docs/log_processor/data_sources/journald) +- [Hub collection pages](https://app.crowdsec.net/hub/collections) - each collection shows example acquisition config + +## Related Issues + +- [LP No Logs Parsed](/u/troubleshooting/lp_no_logs_parsed) - Next step if logs are read but not parsed +- [LP No Alerts](/u/troubleshooting/lp_no_alerts) - If logs are read and parsed but scenarios don't trigger +- [Engine No Alerts](/u/troubleshooting/engine_no_alerts) - Similar issue at the Security Engine level + +## Getting Help + +If acquisition still doesn't work: + +- Share your acquisition config on [Discourse](https://discourse.crowdsec.net/) +- Ask on [Discord](https://discord.gg/crowdsec) with your `cscli metrics` output and acquisition files +- Check for similar issues in the [GitHub repository](https://github.com/crowdsecurity/crowdsec/issues) diff --git a/crowdsec-docs/unversioned/troubleshooting/issue_rc_integration_offline.md b/crowdsec-docs/unversioned/troubleshooting/issue_rc_integration_offline.md new file mode 100644 index 000000000..a3f08df6f --- /dev/null +++ b/crowdsec-docs/unversioned/troubleshooting/issue_rc_integration_offline.md @@ -0,0 +1,299 @@ +--- +title: RC Integration Offline +id: issue_rc_integration_offline +--- + +The **RC Integration Offline** (Remediation Component Integration Offline) issue appears when a Blocklist integration of type Remediation Component has not pulled from its endpoint for more than 24
hours. + +This issue applies to Remediation Components (aka bouncers) directly connected to a Blocklist integration endpoint (aka Blocklist as a Service / BLaaS). + +## What Triggers This Issue + +- **Trigger condition**: No pull for 24 hours +- **Criticality**: Critical +- **Impact**: Latest blocklist updates not retrieved and potential malfunction of the remediation component. + +## Common Root Causes + +- **Bouncer service or process stopped**: The bouncer daemon, module, or plugin is not running. +- **Configuration errors**: Incorrect or missing API URL or API Key in bouncer's configuration file, or malformed settings. +- **Network connectivity issues**: The bouncer cannot reach the endpoint. +- **Bouncer not loaded**: Bouncer Module/plugin is installed but not enabled or started. + +## How to Diagnose + +Depending on the type of bouncer, you'll need to check its installation status, configuration, and running status. + +**Types of remediation components:** +- **Web server modules**: NGINX, Apache plugins +- **Reverse proxy integrations**: Traefik, HAProxy, Caddy middlewares +- **Application frameworks**: PHP libraries, WordPress plugins +- **Cloud service workers**: Cloudflare Workers, Fastly Compute, autonomous update daemons +- **Custom integrations**: Using the Bouncer SDK + +### Check bouncer configuration has proper parameters + +For Blocklist-as-a-Service (BLaaS) connectivity, verify that the bouncer configuration has the proper API URL and API key. +:::info +Property names may vary: *api_url, api_key* or *lapi_url, lapi_key*, etc. Check your [bouncer's documentation](/u/bouncers/intro). +::: + + +1. **api_url**: Must point to your BLaaS endpoint (e.g., `https://admin.api.crowdsec.net/v1/decisions/stream`) +2.
**api_key**: Your BLaaS API key *(Found in the Console in your Blocklist integration section, on creation or on "Refresh Credentials")* + +**Common configuration file locations:** +- **On host**: `/etc/crowdsec/bouncers/crowdsec--bouncer.conf` +- ie: **NGINX**: `/etc/crowdsec/bouncers/crowdsec-nginx-bouncer.conf` +- **WordPress**: Admin panel → CrowdSec → **Connection details** Section + +Check the configuration file: +```bash +# Example for NGINX bouncer +sudo cat /etc/crowdsec/bouncers/crowdsec-nginx-bouncer.conf + +# Look for: +# API_URL=https://admin.api.crowdsec.net/v1/decisions/stream +# API_KEY= +``` + +### Check bouncer service status + +Verify the bouncer is running and hasn't encountered errors. + +#### For host-based processes + +Check if the bouncer process or service is running: + +Depending on your bouncer type: + +#### Web server module bouncers + +```bash +# NGINX +sudo systemctl status nginx +sudo nginx -t # Test configuration + +# Apache +sudo systemctl status apache2 +sudo apache2ctl -t # Test configuration + +# Check if module is loaded +# NGINX: check nginx.conf for crowdsec module +# Apache: check mods-enabled/crowdsec.conf +``` + +#### Standalone bouncer daemons + +```bash +# Traefik bouncer +sudo systemctl status crowdsec-traefik-bouncer + +# HAProxy bouncer +sudo systemctl status crowdsec-haproxy-bouncer + +# Cloudflare bouncer +sudo systemctl status crowdsec-cloudflare-bouncer +``` + +### Check bouncer logs + +Bouncer logs locations vary by type: + +**Standalone daemon bouncers:** +- **Systemd services**: `sudo journalctl -u crowdsec- -n 50` +- **Traefik/HAProxy/Cloudflare**: `/var/log/crowdsec-.log` + +**Web server module bouncers:** +- **NGINX**: Check main NGINX error log (`/var/log/nginx/error.log`) +- **Apache**: Check Apache error log (`/var/log/apache2/error.log`) + +**Application framework bouncers:** +- **WordPress**: WordPress debug log or plugin settings page +- **PHP**: Application logs or web server error logs + +**Cloud 
service workers:** +- **Cloudflare Workers**: Cloudflare dashboard → Workers → Logs +- **Fastly Compute**: Fastly dashboard → Real-time logs + +**Look for errors like:** +- `connection refused` or `timeout` - API endpoint unreachable +- `401 Unauthorized` or `403 Forbidden` - API key invalid or missing +- `module not loaded` - Integration not enabled in web server +- `invalid configuration` - Config file syntax or parameter errors +- `rate limit exceeded` - Cloud service plan limits reached + +### Test connectivity to the endpoint + +From the bouncer host: + +```bash +# Test network connectivity +curl -I https:/// + +# Test with API key +curl -H "X-Api-Key: " https:// +``` + +## How to Resolve + +### Restart the bouncer + +#### For web server modules + +```bash +# NGINX +sudo systemctl restart nginx + +# Apache +sudo systemctl restart apache2 +``` + +#### For standalone daemons + +```bash +sudo systemctl restart crowdsec- +sudo systemctl enable crowdsec- +``` + +### Update bouncer configuration + +If the API URL or API key is incorrect, update the bouncer's configuration file: + +**NGINX bouncer** (`/etc/crowdsec/bouncers/crowdsec-nginx-bouncer.conf`): +```bash +API_URL=https://admin.api.crowdsec.net/v1/decisions/stream +API_KEY= +UPDATE_FREQUENCY=10s +``` + +**Traefik bouncer** (`/etc/crowdsec/bouncers/crowdsec-traefik-bouncer.yaml`): +```yaml +crowdsec_url: https://admin.api.crowdsec.net/v1/decisions/stream +crowdsec_api_key: +update_frequency: 10s +``` + +**HAProxy bouncer** (`/etc/crowdsec/bouncers/crowdsec-haproxy-bouncer.conf`): +```bash +CROWDSEC_URL=https://admin.api.crowdsec.net/v1/decisions/stream +CROWDSEC_API_KEY= +``` + +After updating, restart the bouncer service. + +### Fix connectivity issues + +If the bouncer cannot reach the BLaaS endpoint: + +1. **Test network connectivity:** + ```bash + curl -I https://admin.api.crowdsec.net/ + ``` + +2. 
**Check firewall rules:** + ```bash + # Ensure outbound HTTPS (443) is allowed + sudo ufw status + # or + sudo firewall-cmd --list-all + ``` + +3. **Test with API key:** + ```bash + curl -H "X-Api-Key: " \ + https://admin.api.crowdsec.net/v1/decisions/stream + ``` + + Should return `{"new":null,"deleted":null}` or similar if authenticated. + +4. **Check proxy settings** if using a corporate proxy - configure in bouncer's environment or config file. + +5. **For cloud workers (Cloudflare/Fastly):** + - Verify the worker is deployed and running + - Check if you've hit rate limits on your plan + - Review worker logs for errors + +### Enable the module/plugin + +Some bouncers require explicit enabling: + +#### NGINX + +Check `/etc/nginx/nginx.conf` includes the CrowdSec module: + +```nginx +load_module modules/ngx_http_crowdsec_module.so; + +http { + # CrowdSec configuration + crowdsec_enabled on; + crowdsec_api_url https://; + # ... +} +``` + +Test and reload: +```bash +sudo nginx -t +sudo systemctl reload nginx +``` + +#### Apache + +Enable the module: +```bash +sudo a2enmod crowdsec +sudo systemctl restart apache2 +``` + +## Verify Resolution + +After making changes: + +1. **Wait 1-2 minutes** for the bouncer to attempt its next pull from the endpoint + +2. **Check in the Console:** + - Navigate to your Blocklist integration + - Look at the integration tile + - Verify the "Last Pull" timestamp has updated to a recent time (within last few minutes) + - The offline alert should clear automatically + +3. **Verify bouncer is pulling decisions:** + ```bash + # For standalone daemons, check logs + sudo journalctl -u crowdsec- -n 20 + + # Look for successful pull messages like: + # "Successfully pulled X decisions" + # "Decisions updated" + ``` + +4. 
**Test that blocking is working** (optional but recommended): + - Check bouncer-specific documentation for test procedures + - For web servers, you can test by temporarily adding a test decision + +Once fixed, the issues concerning those RCs will disappear on the next SE info update *(within 30 minutes)*. + +## Bouncer-Specific Documentation + +- [NGINX Bouncer](/u/bouncers/nginx) +- [Traefik Bouncer](/u/bouncers/traefik) +- [HAProxy Bouncer](/u/bouncers/haproxy) +- [Cloudflare Bouncer](/u/bouncers/cloudflare) +- [WordPress Plugin](/u/bouncers/wordpress) +- [All Bouncers](/u/bouncers/intro) + +## Related Issues + +- [Firewall Integration Offline](/u/troubleshooting/fw_integration_offline) - Similar issue for firewall bouncers +- [Remediation Components Troubleshooting](/u/troubleshooting/remediation_components) - General bouncer issues + +## Getting Help + +If your bouncer still doesn't work: + +- Check bouncer-specific documentation (links above) +- Share config and logs on [Discourse](https://discourse.crowdsec.net/) +- Ask on [Discord](https://discord.gg/crowdsec) with `cscli bouncers list` output +- Report bouncer bugs on GitHub (check bouncer's repository) diff --git a/crowdsec-docs/unversioned/troubleshooting/security_engine_offline.md b/crowdsec-docs/unversioned/troubleshooting/issue_security_engine_offline.md similarity index 84% rename from crowdsec-docs/unversioned/troubleshooting/security_engine_offline.md rename to crowdsec-docs/unversioned/troubleshooting/issue_security_engine_offline.md index 91dc11d12..0a9bcc959 100644 --- a/crowdsec-docs/unversioned/troubleshooting/security_engine_offline.md +++ b/crowdsec-docs/unversioned/troubleshooting/issue_security_engine_offline.md @@ -1,11 +1,18 @@ --- title: Security Engine Offline -id: security_engine_offline +id: issue_security_engine_offline --- The **Security Engine Offline** alert appears in the Console and notification integrations when an enrolled engine has not reported or logged in to CrowdSec for more
than 48 hours. This usually means the core `crowdsec` service (Log Processor + Local API) has stopped working or communicating with our infrastructure. -## Common Root Causes & Diagnostics +## Common Root Causes + +- **Host or service down**: The crowdsec service has stopped or the host itself is unreachable. +- **Enrollment revoked or pending**: Engine enrollment was removed from the Console or is awaiting approval. +- **Console connectivity issues**: Network, firewall, or proxy blocking HTTPS calls to Console endpoints, or TLS validation failures. +- **Local API unavailable**: The Local API component has stopped and cannot gather or forward alerts to the Console. + +## Diagnostics ### Host or service down @@ -97,3 +104,10 @@ After restarting, re-run `sudo cscli console status` to ensure the heartbeat is - Investigate persistent database or authentication errors using `sudo cscli support dump`, then consult the [Security Engine troubleshooting guide](/u/troubleshooting/security_engine) if issues remain. Once the engine resumes contact, the Console clears the **Security Engine Offline** alert during the next poll. Consider enabling the **Security Engine Offline** notification in your preferred integration so future outages are caught quickly. 
+ +## Getting Help + +If you still don't manage to resume your Security Engine heartbeat towards the CrowdSec Console: + +- Check [Discourse](https://discourse.crowdsec.net/) for similar cases +- Ask on [Discord](https://discord.gg/crowdsec) with your `sudo cscli support dump` output \ No newline at end of file diff --git a/crowdsec-docs/unversioned/troubleshooting/log_processor_offline.md b/crowdsec-docs/unversioned/troubleshooting/log_processor_offline.md deleted file mode 100644 index 2699719de..000000000 --- a/crowdsec-docs/unversioned/troubleshooting/log_processor_offline.md +++ /dev/null @@ -1,142 +0,0 @@ ---- -title: Log Processor Offline -id: log_processor_offline ---- - -When the Console or a notification rule reports **Log Processor Offline**, the local agent has not checked in with the Local API (LAPI) for more than 24 hours. The alert is different from **Log Processor No Alert**, which only means logs were parsed but no scenarios fired. Use the sections below to identify why the heartbeat stopped and how to bring the agent back online. - -## Common Root Causes & Diagnostics - -### Service stopped or stuck - -- Confirm the service state on the host: - -```bash -sudo systemctl status crowdsec -sudo journalctl -u crowdsec -n 50 -``` - -- For containerised deployments, verify the workload is still running: - -```bash -docker ps --filter name=crowdsec -kubectl get pods -n crowdsec -``` - -- On the LAPI node, run `sudo cscli machines list` and check whether the `Last Update` column is older than 24 hours for the affected machine. - -### Machine not validated or credentials revoked - -- `sudo cscli machines list` on the LAPI shows the machine in `PENDING` state or missing entirely. -- On the agent host, ensure `/etc/crowdsec/local_api_credentials.yaml` exists and contains the expected login and password. -- If you recently reinstalled or renamed the machine, it must be re-validated. See [Machines management](/u/user_guides/machines_mgmt) for details.
- -### Local API unreachable - -- From the agent, run: - -```bash -sudo cscli lapi status -``` - - Errors such as `401 Unauthorized`, TLS failures, or connection timeouts indicate an authentication or network issue. - -- Verify the API endpoint declared in `/etc/crowdsec/config.yaml` (`api.client.credentials_path`, `url`, `ca_cert`, `insecure_skip_verify`) matches your LAPI setup. Refer to [Local API configuration](/docs/local_api/configuration) and [TLS authentication](/docs/local_api/tls_auth) if certificates changed. -- Confirm the network path between the agent and the LAPI host is open (default port `8080/TCP`). Firewalls or reverse proxies introduced after installation commonly block the heartbeat. - -### Local API unavailable - -- If several agents show as offline simultaneously, the LAPI service might be down. Check its status on the LAPI machine: - -```bash -sudo systemctl status crowdsec -sudo journalctl -u crowdsec -n 50 -``` - -- Inspect `/var/log/crowdsec/` (or container logs) for database or authentication errors that prevent the LAPI from responding. -- Use `sudo cscli metrics show engine` on the LAPI to confirm it is still ingesting events from other agents. See the [Health Check guide](/u/getting_started/health_check) for additional diagnostics. - -## Recovery Actions - -### Restart the Log Processor service - -- Systemd: - -```bash -sudo systemctl restart crowdsec -``` - -- Docker: - -```bash -docker restart crowdsec -``` - -- Kubernetes: - -```bash -kubectl rollout restart deployment/crowdsec -n crowdsec -``` - -After the restart, re-run `sudo cscli machines list` on the LAPI to confirm the `Last Update` timestamp is refreshed. - -### Validate or re-register the machine - -#### Using credentials - -:::info -More suitable for single machine setups. 
-::: - -- To regenerate credentials directly on the LAPI host when the agent runs locally, run: - -```bash -sudo cscli machines add -a -``` - -#### Using registration system - -:::info -Registration system is more suitable for distributed setups. -::: - - - -- Approve pending machines on the LAPI: - -```bash -sudo cscli machines validate -``` - -- If credentials were removed or the agent was rebuilt, re-register it against the LAPI: - -```bash -sudo cscli lapi register --url http://:8080 --machine -sudo systemctl restart crowdsec -``` - -Update the `--url` to match your deployment. Auto-registration tokens are covered in [Machines management](/u/user_guides/machines_mgmt#machine-auto-validation). - -### Restore connectivity to the Local API - -- Open the required port on firewalls or security groups and verify with: - -```bash -nc -zv 8080 -``` - -- If TLS certificates were renewed, update the agent trust store (`ca_cert`) or temporarily enable `insecure_skip_verify: true` for testing. Follow the hardening recommendations in [TLS authentication](/docs/local_api/tls_auth). -- When using proxies or load balancers, ensure they forward HTTP headers and TLS material expected by the LAPI. - -### Stabilise the Local API - -- Restart the LAPI service or pod if it was unresponsive: - -```bash -sudo systemctl restart crowdsec -kubectl rollout restart deployment/crowdsec-lapi -n crowdsec -``` - -- Run `sudo cscli support dump` to collect diagnostics if the LAPI repeatedly crashes or loses database access. Review the resulting archive for database connectivity errors and consult the [Security Engine troubleshooting guide](/u/troubleshooting/security_engine) when escalation is required. - -Once the heartbeat is restored, the Console alert clears automatically during the next polling cycle. Consider adding a [notification rule](/u/console/notification_integrations/rule) for **Log Processor Offline** so you are alerted promptly when it happens again.