Permalink
Browse files

1.0.25.21: handling of potential corruptions

- add corruption_warning_and_maybe_lose that prints a warning and
  loses depending on lose_on_corruption_p (false by default)
- use corruption_warning_and_maybe_lose when the control stack is
  exhausted and on memory faults
- use corruption_warning_and_maybe_lose on the lisp handlers of
  SIGILL, SIGBUS and SIGEMT, as invoking them is surely not a good
  sign.
- add --lose-on-corruption as a runtime option
- add --disable-ldb as a runtime option
- update the man page and the user manual
- HEAP-EXHAUSTED fixes:
  - exit pseudo atomic properly and do pending interrupt if needed
  - signalling HEAP-EXHAUSTED in a WITHOUT-INTERRUPTS is dangerous
- use --lose-on-corruption in make-target*.sh

Also, block blockable signals on lose() to prevent other threads,
timers and such from interfering. If only all threads could be stopped
somehow.
  • Loading branch information...
1 parent c26d6a3 commit d1873cc3f7a09f9891bb9c05f206af1774876c0c Gabor Melis committed Feb 16, 2009
Showing with 130 additions and 14 deletions.
  1. +8 −0 NEWS
  2. +13 −0 doc/manual/start-stop.texinfo
  3. +16 −3 doc/sbcl.1
  4. +2 −0 make-target-2.sh
  5. +2 −1 make-target-contrib.sh
  6. +13 −1 src/runtime/gencgc.c
  7. +51 −7 src/runtime/interr.c
  8. +2 −0 src/runtime/interr.h
  9. +11 −0 src/runtime/interrupt.c
  10. +11 −1 src/runtime/runtime.c
  11. +1 −1 version.lisp-expr
View
@@ -1,4 +1,12 @@
;;;; -*- coding: utf-8; fill-column: 78 -*-
+changes in sbcl-1.0.26 relative to 1.0.25:
+ * new feature: runtime option --disable-ldb
+ * new feature: runtime option --lose-on-corruption to die at the
+ slightest hint of possibly non-recoverable errors: running out of
+ memory, stack, alien stack, binding stack, encountering a memory
+ fault, etc. In the absence of --lose-on-corruption a warning is
+ printed to stderr.
+
changes in sbcl-1.0.25 relative to 1.0.24:
* incompatible change: SB-INTROSPECT:FUNCTION-ARGLIST is deprecated, to be
removed later. Please use SB-INTROSPECT:FUNCTION-LAMBDA-LIST instead.
@@ -214,8 +214,21 @@ startup. This makes it easier to write Lisp programs which work
cleanly in Unix pipelines. See also the @code{--noprint} and
@code{--disable-debugger} options.
+@item --disable-ldb
+Disable the low-level debugger. Only effective if SBCL is compiled
+with LDB.
+
+@item --lose-on-corruption
+There are some dangerous low level errors (for instance, control stack
+exhausted, memory fault) that (or whose handlers) can corrupt the
+image. By default SBCL prints a warning, then tries to continue and
+handle the error in Lisp, but this will not always work and SBCL may
+malfunction or even hang. With this option, upon encountering such an
+error SBCL will invoke ldb (if present and enabled) or else exit.
+
@item --script @var{filename}
As a runtime option this is equivalent to @code{--noinform}
+@code{--disable-ldb} @code{--lose-on-corruption}
@code{--end-runtime-options} @code{--script} @var{filename}. See the
description of @code{--script} as a toplevel option below.
View
@@ -105,10 +105,22 @@ startup. (This makes it easier to write Lisp programs which work
cleanly in Unix pipelines. See also the "\-\-noprint" and
"\-\-disable\-debugger" options.)
.TP 3
+.B \-\-disable\-ldb
+Disable the low-level debugger. Only effective if SBCL is compiled with LDB.
+.TP 3
+.B \-\-lose\-on\-corruption
+There are some dangerous low level errors (for instance, control stack
+exhausted, memory fault) that (or whose handlers) can corrupt the
+image. By default SBCL prints a warning, then tries to continue and
+handle the error in Lisp, but this will not always work and SBCL may
+malfunction or even hang. With this option, upon encountering such an
+error SBCL will invoke ldb (if present and enabled) or else exit.
+.TP 3
.B \-\-script <filename>
-As a runtime option equivalent to \-\-noinform
-\-\-end\-toplevel\-options \-\-script <filename>. See the description
-of \-\-script as a toplevel option below.
+As a runtime option this is equivalent to \-\-noinform \-\-disable\-ldb
+\-\-lose\-on\-corruption \-\-end\-runtime\-options \-\-script
+<filename>. See the description of \-\-script as a toplevel option
+below.
.TP 3
.B \-\-help
Print some basic information about SBCL, then exit.
@@ -166,6 +178,7 @@ debugger, allowing interactive diagnosis and possible intercession.
This option disables the debugger, causing errors to print a backtrace
and exit with status 1 instead -- which is a mode of operation better suited
for batch processing. See the User Manual on \f(CRSB\-EXT:DISABLE\-DEBUGGER\fR for details.
+.TP 3
.B \-\-script <filename>
Implies \-\-no-sysinit \-\-no-userinit \-\-disable-debugger
\-\-end\-toplevel\-options.
View
@@ -32,8 +32,10 @@ export LANG LC_ALL
echo //doing warm init - compilation phase
./src/runtime/sbcl \
--core output/cold-sbcl.core \
+--lose-on-corruption \
--no-sysinit --no-userinit < make-target-2.lisp
echo //doing warm init - load and dump phase
./src/runtime/sbcl \
--core output/cold-sbcl.core \
+--lose-on-corruption \
--no-sysinit --no-userinit < make-target-2-load.lisp
@@ -32,7 +32,8 @@ if [ "$OSTYPE" = "cygwin" ] ; then
SBCL_PWD=`echo $SBCL_PWD | sed s/\ /\\\\\\\\\ /g`
fi
-SBCL="$SBCL_PWD/src/runtime/sbcl --noinform --core $SBCL_PWD/output/sbcl.core --disable-debugger --no-sysinit --no-userinit"
+SBCL="$SBCL_PWD/src/runtime/sbcl --noinform --core $SBCL_PWD/output/sbcl.core \
+--lose-on-corruption --disable-debugger --no-sysinit --no-userinit"
SBCL_BUILDING_CONTRIB=1
export SBCL SBCL_BUILDING_CONTRIB
View
@@ -1154,6 +1154,7 @@ static page_index_t gencgc_alloc_start_page = -1;
void
gc_heap_exhausted_error_or_lose (long available, long requested)
{
+ struct thread *thread = arch_os_get_current_thread();
/* Write basic information before doing anything else: if we don't
* call to lisp this is a must, and even if we do there is always
* the danger that we bounce back here before the error has been
@@ -1166,7 +1167,6 @@ gc_heap_exhausted_error_or_lose (long available, long requested)
/* If we are in GC, or totally out of memory there is no way
* to sanely transfer control to the lisp-side of things.
*/
- struct thread *thread = arch_os_get_current_thread();
print_generation_stats(1);
fprintf(stderr, "GC control variables:\n");
fprintf(stderr, " *GC-INHIBIT* = %s\n *GC-PENDING* = %s\n",
@@ -1181,6 +1181,18 @@ gc_heap_exhausted_error_or_lose (long available, long requested)
else {
/* FIXME: assert free_pages_lock held */
(void)thread_mutex_unlock(&free_pages_lock);
+ gc_assert(get_pseudo_atomic_atomic(thread));
+ clear_pseudo_atomic_atomic(thread);
+ if (get_pseudo_atomic_interrupted(thread))
+ do_pending_interrupt();
+ /* Another issue is that signalling HEAP-EXHAUSTED error leads
+ * to running user code at arbitrary places, even in a
+ * WITHOUT-INTERRUPTS which may lead to a deadlock without
+ * running out of the heap. So at this point all bets are
+ * off. */
+ if (SymbolValue(INTERRUPTS_ENABLED,thread) == NIL)
+ corruption_warning_and_maybe_lose
+ ("Signalling HEAP-EXHAUSTED in a WITHOUT-INTERRUPTS.");
funcall2(StaticSymbolFunction(HEAP_EXHAUSTED_ERROR),
alloc_number(available), alloc_number(requested));
lose("HEAP-EXHAUSTED-ERROR fell through");
View
@@ -48,26 +48,70 @@ void disable_lossage_handler(void)
lossage_handler = default_lossage_handler;
}
-void
-lose(char *fmt, ...)
+static
+void print_message(char *fmt, va_list ap)
{
- va_list ap;
- fprintf(stderr, "fatal error encountered in SBCL pid %d",getpid());
+ fprintf(stderr, " in SBCL pid %d",getpid());
#if defined(LISP_FEATURE_SB_THREAD)
fprintf(stderr, "(tid %lu)", (unsigned long) thread_self());
#endif
if (fmt) {
fprintf(stderr, ":\n");
- va_start(ap, fmt);
vfprintf(stderr, fmt, ap);
- va_end(ap);
}
fprintf(stderr, "\n");
- fflush(stderr);
+}
+
+static inline void
+call_lossage_handler() never_returns;
+
+static inline void
+call_lossage_handler()
+{
lossage_handler();
fprintf(stderr, "Argh! lossage_handler() returned, total confusion..\n");
exit(1);
}
+
+void
+lose(char *fmt, ...)
+{
+ va_list ap;
+ /* Block signals to prevent other threads, timers and such from
+ * interfering. If only all threads could be stopped somehow. */
+ block_blockable_signals();
+ fprintf(stderr, "fatal error encountered");
+ va_start(ap, fmt);
+ print_message(fmt, ap);
+ va_end(ap);
+ fprintf(stderr, "\n");
+ fflush(stderr);
+ call_lossage_handler();
+}
+
+boolean lose_on_corruption_p = 0;
+
+void
+corruption_warning_and_maybe_lose(char *fmt, ...)
+{
+ va_list ap;
+ sigset_t oldset;
+ thread_sigmask(SIG_BLOCK, &blockable_sigset, &oldset);
+ fprintf(stderr, "CORRUPTION WARNING");
+ va_start(ap, fmt);
+ print_message(fmt, ap);
+ va_end(ap);
+ fprintf(stderr, "The integrity of this image is possibly compromised.\n");
+ if (lose_on_corruption_p)
+ fprintf(stderr, "Exiting.\n");
+ else
+ fprintf(stderr, "Continuing with fingers crossed.\n");
+ fflush(stderr);
+ if (lose_on_corruption_p)
+ call_lossage_handler();
+ else
+ thread_sigmask(SIG_SETMASK,&oldset,0);
+}
/* internal error handler for when the Lisp error system doesn't exist
*
@@ -13,6 +13,8 @@
#define _INTERR_H_
extern void lose(char *fmt, ...) never_returns;
+extern boolean lose_on_corruption_p;
+extern void corruption_warning_and_maybe_lose(char *fmt, ...);
extern void enable_lossage_handler(void);
extern void disable_lossage_handler(void);
extern void describe_internal_error(os_context_t *context);
@@ -810,6 +810,14 @@ interrupt_handle_now_handler(int signal, siginfo_t *info, void *void_context)
os_context_t *context = arch_os_get_context(&void_context);
#if defined(LISP_FEATURE_LINUX) || defined(RESTORE_FP_CONTROL_FROM_CONTEXT)
os_restore_fp_control(context);
+#ifndef LISP_FEATURE_WIN32
+ if ((signal == SIGILL) || (signal == SIGBUS)
+#ifndef LISP_FEATURE_LINUX
+ || (signal == SIGEMT)
+#endif
+ )
+ corruption_warning_and_maybe_lose("Signal %d recieved", signal);
+#endif
#endif
interrupt_handle_now(signal, info, context);
}
@@ -1045,6 +1053,7 @@ handle_guard_page_triggered(os_context_t *context,os_vm_address_t addr)
* protection so the error handler has some headroom, protect the
* previous page so that we can catch returns from the guard page
* and restore it. */
+ corruption_warning_and_maybe_lose("Control stack exhausted");
protect_control_stack_guard_page(0);
protect_control_stack_return_guard_page(1);
@@ -1300,6 +1309,8 @@ lisp_memory_fault_error(os_context_t *context, os_vm_address_t addr)
* now -- some address is better then no address in this case.
*/
current_memory_fault_address = addr;
+ /* To allow debugging memory faults in signal handlers and such. */
+ corruption_warning_and_maybe_lose("Memory fault");
arrange_return_to_lisp_function(context,
StaticSymbolFunction(MEMORY_FAULT_ERROR));
}
View
@@ -229,6 +229,7 @@ main(int argc, char *argv[], char *envp[])
/* other command line options */
boolean noinform = 0;
boolean end_runtime_options = 0;
+ boolean disable_lossage_handler_p = 0;
lispobj initial_function;
const char *sbcl_home = getenv("SBCL_HOME");
@@ -275,6 +276,8 @@ main(int argc, char *argv[], char *envp[])
* TOPLEVEL-INIT sees the option. */
noinform = 1;
end_runtime_options = 1;
+ disable_lossage_handler_p = 1;
+ lose_on_corruption_p = 1;
break;
} else if (0 == strcmp(arg, "--noinform")) {
noinform = 1;
@@ -337,6 +340,12 @@ main(int argc, char *argv[], char *envp[])
++n;
}
++argi;
+ } else if (0 == strcmp(arg, "--disable-ldb")) {
+ disable_lossage_handler_p = 1;
+ ++argi;
+ } else if (0 == strcmp(arg, "--lose-on-corruption")) {
+ lose_on_corruption_p = 1;
+ ++argi;
} else if (0 == strcmp(arg, "--end-runtime-options")) {
end_runtime_options = 1;
++argi;
@@ -426,7 +435,8 @@ main(int argc, char *argv[], char *envp[])
define_var("nil", NIL, 1);
define_var("t", T, 1);
- enable_lossage_handler();
+ if (!disable_lossage_handler_p)
+ enable_lossage_handler();
globals_init();
View
@@ -17,4 +17,4 @@
;;; checkins which aren't released. (And occasionally for internal
;;; versions, especially for internal versions off the main CVS
;;; branch, it gets hairier, e.g. "0.pre7.14.flaky4.13".)
-"1.0.25.20"
+"1.0.25.21"

0 comments on commit d1873cc

Please sign in to comment.