Skip to content

Commit 1d8413a

Browse files
committed
Add set of standard alarms to logback.xml
This is a "working" typology which I consider open to additions or modifications.

Target: trunk
Patch: http://rb.dcache.org/r/5176
Merge: 2.5
Require-book: yes
Require-notes: yes
Acked-by: Dmitry

Testing: deployed dCache and verified, as far as feasible, that the various alarm types were recognized and displayed on the alarms page.

RELEASE NOTES:

The following alarm definitions are now included with the standard dCache distribution:

TYPE                          [SEVERITY]
----------------------------------------
SERVICE_CREATION_FAILURE      [CRITICAL]
DB_OUT_OF_CONNECTIONS         [CRITICAL]
DB_UNAVAILABLE                [CRITICAL]
JVM_OUT_OF_MEMORY             [CRITICAL]
OUT_OF_FILE_DESCRIPTORS       [CRITICAL]
IO_ERROR                      [HIGH]
HSM_READ_FAILURE              [HIGH]
HSM_WRITE_FAILURE             [HIGH]
LOCATION_MANAGER_UNAVAILABLE  [HIGH]
POOL_MANAGER_UNAVAILABLE      [HIGH]
POOL_DISABLED                 [MODERATE]
CHECKSUM                      [MODERATE]

Their full definitions may be inspected in the logback.xml file found in /etc/dcache.
1 parent a5c9ff0 commit 1d8413a

File tree

1 file changed

+82
-2
lines changed

1 file changed

+82
-2
lines changed

skel/etc/logback.xml

Lines changed: 82 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,13 +122,93 @@
122122
to its embedded child appender(s)
123123
-->
124124
<filter class="org.dcache.alarms.logback.AlarmDefinitionFilter">
125+
<alarmType>
126+
regex:"(.+) from ac_create",
127+
type:SERVICE_CREATION_FAILURE,
128+
level:ERROR,
129+
severity:CRITICAL,
130+
include-in-key:group1 type host domain service
131+
</alarmType>
132+
<alarmType>
133+
regex:"Failed to acquire connection.+Attempts left: 0",
134+
type:DB_OUT_OF_CONNECTIONS,
135+
level:ERROR,
136+
severity:CRITICAL,
137+
include-in-key:type host
138+
</alarmType>
139+
<alarmType>
140+
regex:"Unable to open a test connection to the given database|Connections could not be acquired from the underlying database",
141+
match-exception:true,
142+
depth:1,
143+
type:DB_UNAVAILABLE,
144+
level:ERROR,
145+
severity:CRITICAL,
146+
include-in-key:type host
147+
</alarmType>
148+
<alarmType>
149+
regex:"OutOfMemory",
150+
type:JVM_OUT_OF_MEMORY,
151+
level:ERROR,
152+
severity:CRITICAL,
153+
include-in-key:type host domain
154+
</alarmType>
155+
<alarmType>
156+
regex:"[Tt]oo many open files",
157+
type:OUT_OF_FILE_DESCRIPTORS,
158+
match-exception:true,
159+
level:ERROR,
160+
severity:CRITICAL,
161+
include-in-key:type host domain
162+
</alarmType>
163+
<!-- IO_ERROR: matches WARN-level messages of the form "I/O ... failed <detail>"
     or "I/O error occur...ed <detail>". The two alternatives share a single
     trailing capture so that the detail text is always regex group 1, which is
     what include-in-key:group1 refers to. With one capture per alternative, a
     match on the second alternative would populate group 2 and leave group 1
     unset. -->
<alarmType>
164+
regex:"I/O(?:.*failed| error occur.*ed)(.+)",
165+
type:IO_ERROR,
166+
level:WARN,
167+
severity:HIGH,
168+
include-in-key:group1 type host service domain
169+
</alarmType>
170+
<alarmType>
171+
regex:"Fetch failed: HSM script failed",
172+
type:HSM_READ_FAILURE,
173+
level:WARN,
174+
severity:HIGH,
175+
include-in-key:type host service domain
176+
</alarmType>
177+
<alarmType>
178+
regex:"Store failed: HSM script failed",
179+
type:HSM_WRITE_FAILURE,
180+
level:WARN,
181+
severity:HIGH,
182+
include-in-key:type host service domain
183+
</alarmType>
184+
<alarmType>
185+
regex:"Timeout querying location manager",
186+
type:LOCATION_MANAGER_UNAVAILABLE,
187+
level:WARN,
188+
severity:HIGH,
189+
include-in-key:type host service domain
190+
</alarmType>
191+
<alarmType>
192+
regex:"PoolManager.+not found",
193+
type:POOL_MANAGER_UNAVAILABLE,
194+
level:WARN,
195+
severity:HIGH,
196+
include-in-key:type host service domain
197+
</alarmType>
198+
<alarmType>
199+
regex:"Pool mode changed to disabled",
200+
type:POOL_DISABLED,
201+
level:WARN,
202+
severity:MODERATE,
203+
include-in-key:type host service domain
204+
</alarmType>
125205
<alarmType>
126206
logger:org.dcache.pool.classic.ChecksumScanner,
127-
regex:"Checksum mismatch",
207+
regex:"Checksum mismatch detected for (.+) - marking as BROKEN",
128208
type:CHECKSUM,
129209
level:ERROR,
130210
severity:MODERATE,
131-
include-in-key:message type host service domain
211+
include-in-key:group1 type host service domain
132212
</alarmType>
133213
</filter>
134214
<appender-ref ref="remote"/>

0 commit comments

Comments
 (0)