@@ -438,14 +438,16 @@ enum {
438438};
439439/*
440440 * Used for which event context the event is in.
441- * NMI = 0
442- * IRQ = 1
443- * SOFTIRQ = 2
444- * NORMAL = 3
441+ * TRANSITION = 0
442+ * NMI = 1
443+ * IRQ = 2
444+ * SOFTIRQ = 3
445+ * NORMAL = 4
445446 *
446447 * See trace_recursive_lock() comment below for more details.
447448 */
448449enum {
450+ RB_CTX_TRANSITION ,
449451 RB_CTX_NMI ,
450452 RB_CTX_IRQ ,
451453 RB_CTX_SOFTIRQ ,
@@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
30143016 * a bit of overhead in something as critical as function tracing,
30153017 * we use a bitmask trick.
30163018 *
3017- * bit 0 = NMI context
3018- * bit 1 = IRQ context
3019- * bit 2 = SoftIRQ context
3020- * bit 3 = normal context.
3019+ * bit 1 = NMI context
3020+ * bit 2 = IRQ context
3021+ * bit 3 = SoftIRQ context
3022+ * bit 4 = normal context.
30213023 *
30223024 * This works because this is the order of contexts that can
30233025 * preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
30403042 * The least significant bit can be cleared this way, and it
30413043 * just so happens that it is the same bit corresponding to
30423044 * the current context.
3045+ *
3046+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
3047+ * is set when a recursion is detected at the current context, and if
3048+ * the TRANSITION bit is already set, it will fail the recursion.
3049+ * This is needed because there's a lag between the changing of
3050+ * interrupt context and updating the preempt count. In this case,
3051+ * a false positive will be found. To handle this, one extra recursion
3052+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
3053+ * bit is already set, then it is considered a recursion and the function
3054+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
3055+ *
3056+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
3057+ * to be cleared. Even if it wasn't the context that set it. That is,
3058+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
3059+ * is called before preempt_count() is updated, since the check will
3060+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
3061+ * NMI then comes in, it will set the NMI bit, but when the NMI code
3062+ * does the trace_recursive_unlock() it will clear the TRANSITION bit
3063+ * and leave the NMI bit set. But this is fine, because the interrupt
3064+ * code that set the TRANSITION bit will then clear the NMI bit when it
3065+ * calls trace_recursive_unlock(). If another NMI comes in, it will
3066+ * set the TRANSITION bit and continue.
3067+ *
3068+ * Note: The TRANSITION bit only handles a single transition between contexts.
30433069 */
30443070
30453071static __always_inline int
@@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
30553081 bit = pc & NMI_MASK ? RB_CTX_NMI :
30563082 pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ ;
30573083
3058- if (unlikely (val & (1 << (bit + cpu_buffer -> nest ))))
3059- return 1 ;
3084+ if (unlikely (val & (1 << (bit + cpu_buffer -> nest )))) {
3085+ /*
3086+ * It is possible that this was called by transitioning
3087+ * between interrupt context, and preempt_count() has not
3088+ * been updated yet. In this case, use the TRANSITION bit.
3089+ */
3090+ bit = RB_CTX_TRANSITION ;
3091+ if (val & (1 << (bit + cpu_buffer -> nest )))
3092+ return 1 ;
3093+ }
30603094
30613095 val |= (1 << (bit + cpu_buffer -> nest ));
30623096 cpu_buffer -> current_context = val ;
@@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
30713105 cpu_buffer -> current_context - (1 << cpu_buffer -> nest );
30723106}
30733107
3074- /* The recursive locking above uses 4 bits */
3075- #define NESTED_BITS 4
3108+ /* The recursive locking above uses 5 bits */
3109+ #define NESTED_BITS 5
30763110
30773111/**
30783112 * ring_buffer_nest_start - Allow to trace while nested
0 commit comments