Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement new POTel span processor #3223

Merged
merged 14 commits into from
Jul 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sentry_sdk/integrations/opentelemetry/consts.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@

SENTRY_TRACE_KEY = create_key("sentry-trace")
SENTRY_BAGGAGE_KEY = create_key("sentry-baggage")
OTEL_SENTRY_CONTEXT = "otel"
SPAN_ORIGIN = "auto.otel"
26 changes: 19 additions & 7 deletions sentry_sdk/integrations/opentelemetry/contextvars_context.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,26 @@
from opentelemetry.context.context import Context
from opentelemetry.context import Context, create_key, get_value, set_value
from opentelemetry.context.contextvars_context import ContextVarsRuntimeContext

from sentry_sdk.scope import Scope


_SCOPES_KEY = create_key("sentry_scopes")


class SentryContextVarsRuntimeContext(ContextVarsRuntimeContext):
def attach(self, context):
# type: (Context) -> object
# TODO-neel-potel do scope management
return super().attach(context)
scopes = get_value(_SCOPES_KEY, context)

if scopes and isinstance(scopes, tuple):
(current_scope, isolation_scope) = scopes
else:
current_scope = Scope.get_current_scope()
isolation_scope = Scope.get_isolation_scope()

# TODO-neel-potel fork isolation_scope too like JS
# once we setup our own apis to pass through to otel
new_scopes = (current_scope.fork(), isolation_scope)
new_context = set_value(_SCOPES_KEY, new_scopes, context)

def detach(self, token):
# type: (object) -> None
# TODO-neel-potel not sure if we need anything here, see later
super().detach(token)
return super().attach(new_context)
149 changes: 143 additions & 6 deletions sentry_sdk/integrations/opentelemetry/potel_span_processor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
from opentelemetry.sdk.trace import SpanProcessor
from collections import deque, defaultdict

from opentelemetry.trace import format_trace_id, format_span_id
from opentelemetry.context import Context
from opentelemetry.sdk.trace import Span, ReadableSpan, SpanProcessor

from sentry_sdk import capture_event
from sentry_sdk.integrations.opentelemetry.utils import (
is_sentry_span,
convert_otel_timestamp,
extract_span_data,
)
from sentry_sdk.integrations.opentelemetry.consts import (
OTEL_SENTRY_CONTEXT,
SPAN_ORIGIN,
)
from sentry_sdk._types import TYPE_CHECKING

if TYPE_CHECKING:
from typing import Optional
from opentelemetry.sdk.trace import ReadableSpan
from typing import Optional, List, Any, Deque, DefaultDict
from sentry_sdk._types import Event


class PotelSentrySpanProcessor(SpanProcessor):
Expand All @@ -22,15 +35,25 @@ def __new__(cls):

def __init__(self):
# type: () -> None
pass
self._children_spans = defaultdict(
list
) # type: DefaultDict[int, List[ReadableSpan]]

def on_start(self, span, parent_context=None):
# type: (ReadableSpan, Optional[Context]) -> None
# type: (Span, Optional[Context]) -> None
pass

def on_end(self, span):
# type: (ReadableSpan) -> None
pass
if is_sentry_span(span):
return

# TODO-neel-potel-remote only take parent if not remote
if span.parent:
self._children_spans[span.parent.span_id].append(span)
else:
# if have a root span ending, we build a transaction and send it
self._flush_root_span(span)

Comment on lines +55 to +56
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking what happens if there's a child span that, for whatever reason (weird async cases? wonky instrumentation?), hasn't finished before the root span. Since we're not using on_start at all, there'll be virtually no record of the span at this point and the parent transaction will be sent. Once the child finishes, we'll run on_end with the now orphaned span and it'll be saved in _children_spans, but never sent, since the parent transaction is already gone. So we might have a potential leak there.

Do we need some sort of cleanup of orphaned spans? Should we wait a bit before flushing the transaction to account for child spans possibly ending very close to the parent span, but a bit late? IIRC I've noticed JS also having a small sleep in place.

TBH not sure how much of a real world problem late child spans are, but I can imagine they happen.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another option: Also use on_start to create some record of the span and then in on_end, if a transaction comes and we detect that it has unfinished child spans, wait for a grace period to give them time to finish?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

some heuristic cleanup logic will be done in a follow up PR yes, I haven't thought through exactly how we'll do it but JS just has a cutoff logic of 5 minutes

# TODO-neel-potel not sure we need a clear like JS
def shutdown(self):
Expand All @@ -42,3 +65,117 @@ def shutdown(self):
def force_flush(self, timeout_millis=30000):
# type: (int) -> bool
return True

def _flush_root_span(self, span):
# type: (ReadableSpan) -> None
transaction_event = self._root_span_to_transaction_event(span)
if not transaction_event:
return

spans = []
for child in self._collect_children(span):
span_json = self._span_to_json(child)
if span_json:
spans.append(span_json)
transaction_event["spans"] = spans
# TODO-neel-potel sort and cutoff max spans

capture_event(transaction_event)

def _collect_children(self, span):
# type: (ReadableSpan) -> List[ReadableSpan]
if not span.context:
return []

children = []
bfs_queue = deque() # type: Deque[int]
bfs_queue.append(span.context.span_id)

while bfs_queue:
parent_span_id = bfs_queue.popleft()
node_children = self._children_spans.pop(parent_span_id, [])
children.extend(node_children)
bfs_queue.extend(
[child.context.span_id for child in node_children if child.context]
)

return children

# we construct the event from scratch here
# and not use the current Transaction class for easier refactoring
def _root_span_to_transaction_event(self, span):
Comment on lines +104 to +105
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is cool, haven't thought of completely bypassing this. I'll have to think about the implications for the granular instrumenter if we isolate OTel and our instrumentation like this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so for now I'm just creating a raw dict, but I'm thinking eventually of a TypedDict for Span too and the current Span/Transaction classes will basically be completely removed.

# type: (ReadableSpan) -> Optional[Event]
if not span.context:
return None
if not span.start_time:
return None
if not span.end_time:
return None

trace_id = format_trace_id(span.context.trace_id)
span_id = format_span_id(span.context.span_id)
parent_span_id = format_span_id(span.parent.span_id) if span.parent else None

(op, description, _) = extract_span_data(span)

trace_context = {
"trace_id": trace_id,
"span_id": span_id,
"origin": SPAN_ORIGIN,
"op": op,
"status": "ok", # TODO-neel-potel span status mapping
} # type: dict[str, Any]

if parent_span_id:
trace_context["parent_span_id"] = parent_span_id
if span.attributes:
trace_context["data"] = dict(span.attributes)

contexts = {"trace": trace_context}
if span.resource.attributes:
contexts[OTEL_SENTRY_CONTEXT] = {"resource": dict(span.resource.attributes)}

event = {
"type": "transaction",
"transaction": description,
# TODO-neel-potel tx source based on integration
"transaction_info": {"source": "custom"},
"contexts": contexts,
"start_timestamp": convert_otel_timestamp(span.start_time),
"timestamp": convert_otel_timestamp(span.end_time),
} # type: Event

return event

def _span_to_json(self, span):
# type: (ReadableSpan) -> Optional[dict[str, Any]]
if not span.context:
return None
if not span.start_time:
return None
if not span.end_time:
return None

trace_id = format_trace_id(span.context.trace_id)
span_id = format_span_id(span.context.span_id)
parent_span_id = format_span_id(span.parent.span_id) if span.parent else None

(op, description, _) = extract_span_data(span)

span_json = {
"trace_id": trace_id,
"span_id": span_id,
"origin": SPAN_ORIGIN,
"op": op,
"description": description,
"status": "ok", # TODO-neel-potel span status mapping
"start_timestamp": convert_otel_timestamp(span.start_time),
"timestamp": convert_otel_timestamp(span.end_time),
} # type: dict[str, Any]

if parent_span_id:
span_json["parent_span_id"] = parent_span_id
if span.attributes:
span_json["data"] = dict(span.attributes)

return span_json
Loading
Loading