Add proper CFI annotations to the x86 and x86_64 code

This ensures that gdb is able to generate a correct backtrace when stopped at any instruction in the inline asm. It also makes backtraces accessible to tools that only track frame pointer chains, like perf or dtrace.
edef1c · Sep 3, 2016 · 705f931 · 705f931
1 parent 22db868
commit 705f931
Show file tree

Hide file tree

Showing 2 changed files with 177 additions and 61 deletions.
diff --git a/src/arch/x86.rs b/src/arch/x86.rs
@@ -1,6 +1,7 @@
 // This file is part of libfringe, a low-level green threading library.
 // Copyright (c) Nathan Zadoks <nathan@nathan7.eu>,
 //               whitequark <whitequark@whitequark.org>
+//               Amanieu d'Antras <amanieu@gmail.com>
 // Licensed under the Apache License, Version 2.0, <LICENSE-APACHE or
 // http://apache.org/licenses/LICENSE-2.0> or the MIT license <LICENSE-MIT or
 // http://opensource.org/licenses/MIT>, at your option. This file may not be
@@ -32,13 +33,14 @@
 // * Simulating return is as easy as restoring register values from the CFI table
 //   and then setting stack pointer to CFA.
 //
-// A high-level overview of the function of the trampolines is:
+// A high-level overview of the function of the trampolines when unwinding is:
 // * The 2nd init trampoline puts a controlled value (written in swap to `new_cfa`)
-//   into %ebx.
-// * The 1st init trampoline tells the unwinder to set %esp to %ebx, thus continuing
+//   into %ebp. This is then used as the CFA for the 1st trampoline.
+// * This controlled value points to the bottom of the stack of the parent context,
+//   which holds the saved %ebp and return address from the call to swap().
+// * The 1st init trampoline tells the unwinder to restore %ebp and its return
+//   address from the stack frame at %ebp (in the parent stack), thus continuing
 //   unwinding at the swap call site instead of falling off the end of the context stack.
-// * The 1st init trampoline together with the swap trampoline also restore %ebp
-//   when unwinding as well as returning normally, because LLVM does not do it for us.
 use stack::Stack;
 
 pub const STACK_ALIGNMENT: usize = 16;
@@ -47,6 +49,7 @@ pub const STACK_ALIGNMENT: usize = 16;
 pub struct StackPointer(*mut usize);
 
 pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackPointer {
+  #[cfg(not(target_vendor = "apple"))]
   #[naked]
   unsafe extern "C" fn trampoline_1() {
     asm!(
@@ -59,20 +62,47 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
       __morestack:
       .local __morestack
 
-        # Set up the first part of our DWARF CFI linking stacks together.
+        # Set up the first part of our DWARF CFI linking stacks together. When
+        # we reach this function from unwinding, %ebp will be pointing at the bottom
+        # of the parent linked stack. This link is set each time swap() is called.
         # When unwinding the frame corresponding to this function, a DWARF unwinder
-        # will use %ebx as the next call frame address, restore return address
+        # will use %ebp+8 as the next call frame address, restore return address
         # from CFA-4 and restore %ebp from CFA-8. This mirrors what the second half
         # of `swap_trampoline` does.
-        .cfi_def_cfa %ebx, 0
-        .cfi_offset %ebp, -8
-        # Call the next trampoline.
-        call   ${0:c}
+        .cfi_def_cfa ebp, 8
+        .cfi_offset ebp, -8
+
+        # This nop is here so that the initial swap doesn't return to the start
+        # of the trampoline, which confuses the unwinder since it will look for
+        # frame information in the previous symbol rather than this one. It is
+        # never actually executed.
+        nop
+
+        # Stack unwinding in some versions of libunwind doesn't seem to like
+        # 1-byte symbols, so we add a second nop here. This instruction isn't
+        # executed either, it is only here to pad the symbol size.
+        nop
 
       .Lend:
       .size __morestack, .Lend-__morestack
       "#
-      : : "s" (trampoline_2 as usize) : : "volatile")
+      : : : : "volatile")
+  }
+
+  #[cfg(target_vendor = "apple")]
+  #[naked]
+  unsafe extern "C" fn trampoline_1() {
+    asm!(
+      r#"
+        # Same as above; however, .local and .size are not supported in Mach-O.
+      __morestack:
+      .private_extern __morestack
+        .cfi_def_cfa ebp, 8
+        .cfi_offset ebp, -8
+        nop
+        nop
+      "#
+      : : : : "volatile")
   }
 
   #[naked]
@@ -81,13 +111,22 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
       r#"
         # Set up the second part of our DWARF CFI.
         # When unwinding the frame corresponding to this function, a DWARF unwinder
-        # will restore %ebx (and thus CFA of the first trampoline) from the stack slot.
-        .cfi_offset %ebx, 4
+        # will restore %ebp (and thus CFA of the first trampoline) from the stack slot.
+        # This stack slot is updated every time swap() is called to point to the bottom
+        # of the stack of the context that was just switched from.
+        .cfi_def_cfa ebp, 8
+        .cfi_offset ebp, -8
+
+        # This nop is here so that the initial swap doesn't return to the start
+        # of the trampoline, which confuses the unwinder since it will look for
+        # frame information in the previous symbol rather than this one. It is
+        # never actually executed.
+        nop
+
         # Push argument.
-        .cfi_def_cfa_offset 8
         pushl   %eax
         # Call the provided function.
-        call    *8(%esp)
+        call    *12(%esp)
       "#
       : : : : "volatile")
   }
@@ -97,19 +136,37 @@ pub unsafe fn init(stack: &Stack, f: unsafe extern "C" fn(usize) -> !) -> StackP
     *sp.0 = val
   }
 
+  // We set up the stack in a somewhat special way so that to the unwinder it
+  // looks like trampoline_1 has called trampoline_2, which has in turn called
+  // swap::trampoline.
+  //
+  // There are 2 call frames in this setup, each containing the return address
+  // followed by the %ebp value for that frame. This setup supports unwinding
+  // using DWARF CFI as well as the frame pointer-based unwinding used by tools
+  // such as perf or dtrace.
   let mut sp = StackPointer(stack.base() as *mut usize);
+
+  push(&mut sp, f as usize); // Function that trampoline_2 should call
+
+  // Call frame for trampoline_2. The CFA slot is updated by swap::trampoline
+  // each time a context switch is performed.
+  push(&mut sp, trampoline_1 as usize + 2); // Return after the 2 nops
   push(&mut sp, 0xdead0cfa); // CFA slot
-  push(&mut sp, f as usize); // function
-  push(&mut sp, trampoline_1 as usize);
-  push(&mut sp, 0xdeadbbbb); // saved %ebp
+
+  // Call frame for swap::trampoline. We set up the %ebp value to point to the
+  // parent call frame.
+  let frame = sp;
+  push(&mut sp, trampoline_2 as usize + 1); // Return after the nop
+  push(&mut sp, frame.0 as usize); // Pointer to parent call frame
+
   sp
 }
 
 #[inline(always)]
 pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
                    new_stack: &Stack) -> usize {
   // Address of the topmost CFA stack slot.
-  let new_cfa = (new_stack.base() as *mut usize).offset(-1);
+  let new_cfa = (new_stack.base() as *mut usize).offset(-3);
 
   #[naked]
   unsafe extern "C" fn trampoline() {
@@ -119,28 +176,41 @@ pub unsafe fn swap(arg: usize, old_sp: *mut StackPointer, new_sp: StackPointer,
         # the caller, and so it has to have the correct value immediately after
         # the call instruction that invoked the trampoline.
         pushl   %ebp
+        .cfi_adjust_cfa_offset 4
+        .cfi_rel_offset ebp, 0
+
+        # Link the call stacks together by writing the current stack bottom
+        # address to the CFA slot in the new stack.
+        movl    %esp, (%edi)
+
+        # Switch to the new stack for unwinding purposes. The old stack may no
+        # longer be valid now that we have modified the link.
+        .cfi_def_cfa_register edx
 
         # Save stack pointer of the old context.
         movl    %esp, (%esi)
         # Load stack pointer of the new context.
         movl    %edx, %esp
+        .cfi_def_cfa_register esp
 
         # Restore frame pointer of the new context.
         popl    %ebp
+        .cfi_adjust_cfa_offset -4
+        .cfi_restore ebp
 
         # Return into the new context. Use `pop` and `jmp` instead of a `ret`
         # to avoid return address mispredictions (~8ns per `ret` on Ivy Bridge).
-        popl    %ebx
-        jmpl    *%ebx
+        popl    %ecx
+        .cfi_adjust_cfa_offset -4
+        .cfi_register eip, ecx
+        jmpl    *%ecx
       "#
       : : : : "volatile")
   }
 
   let ret: usize;
   asm!(
     r#"
-      # Link the call stacks together.
-      movl    %esp, (%edi)
       # Push instruction pointer of the old context and switch to
       # the new context.
       call    ${1:c}